From e19b58e672eed1489c32dd008d0ed1da6cf1af52 Mon Sep 17 00:00:00 2001 From: Heiko Bauke Date: Sat, 23 Apr 2016 00:08:51 +0200 Subject: [PATCH 001/295] alias template for matrix and array classes --- Eigen/src/Core/Array.h | 31 ++++++++++++++++++++++++++++++- Eigen/src/Core/Matrix.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 7480d1e24..af04ad3dd 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -300,8 +300,37 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES #undef EIGEN_MAKE_ARRAY_TYPEDEFS -#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE +#if __cplusplus>=201103L +#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup matrixtypedefs */ \ +template \ +using Array##SizeSuffix##SizeSuffix = Array; \ +/** \ingroup matrixtypedefs */ \ +template \ +using Array##SizeSuffix = Array; + +#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ +/** \ingroup matrixtypedefs */ \ +template \ +using Array##Size##X = Array; \ +/** \ingroup matrixtypedefs */ \ +template \ +using Array##X##Size = Array; + +EIGEN_MAKE_TYPEDEFS(2, 2) +EIGEN_MAKE_TYPEDEFS(3, 3) +EIGEN_MAKE_TYPEDEFS(4, 4) +EIGEN_MAKE_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_FIXED_TYPEDEFS(2) +EIGEN_MAKE_FIXED_TYPEDEFS(3) +EIGEN_MAKE_FIXED_TYPEDEFS(4) + +#undef EIGEN_MAKE_TYPEDEFS +#undef EIGEN_MAKE_FIXED_TYPEDEFS + +#endif // __cplusplus>=201103L + #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ using Eigen::Matrix##SizeSuffix##TypeSuffix; \ using Eigen::Vector##SizeSuffix##TypeSuffix; \ diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index bcbbbf9ae..a55847739 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -456,6 +456,40 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS +#if __cplusplus>=201103L + +#define EIGEN_MAKE_TYPEDEFS(Size, 
SizeSuffix) \ +/** \ingroup matrixtypedefs */ \ +template \ +using Matrix##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +template \ +using Vector##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +template \ +using RowVector##SizeSuffix = Matrix; + +#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ +/** \ingroup matrixtypedefs */ \ +template \ +using Matrix##Size##X = Matrix; \ +/** \ingroup matrixtypedefs */ \ +template \ +using Matrix##X##Size = Matrix; + +EIGEN_MAKE_TYPEDEFS(2, 2) +EIGEN_MAKE_TYPEDEFS(3, 3) +EIGEN_MAKE_TYPEDEFS(4, 4) +EIGEN_MAKE_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_FIXED_TYPEDEFS(2) +EIGEN_MAKE_FIXED_TYPEDEFS(3) +EIGEN_MAKE_FIXED_TYPEDEFS(4) + +#undef EIGEN_MAKE_TYPEDEFS +#undef EIGEN_MAKE_FIXED_TYPEDEFS + +#endif // __cplusplus>=201103L + } // end namespace Eigen #endif // EIGEN_MATRIX_H From 6f5b126e6d23f1339d15b26fe87916132397d619 Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Tue, 31 Jul 2018 09:33:37 +0100 Subject: [PATCH 002/295] Fix tensor contraction for AVX512 machines This patch modifies the TensorContraction class to ensure that the kc_ field is always a multiple of the packet_size, if the packet_size is > 8. Without this change spatial convolutions in Tensorflow do not work properly as the code that re-arranges the input matrices can assert if kc_ is not a multiple of the packet_size. This leads to a unit test failure, //tensorflow/python/kernel_tests:conv_ops_test, on AVX512 builds of tensorflow. 
--- .../Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 71fd19774..c51f3f8dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -51,6 +51,10 @@ class TensorContractionBlocking { else { computeProductBlockingSizes(kc_, nc_, mc_, num_threads); } + + const int rhs_packet_size = internal::packet_traits::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? + kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } From 0ec8afde57f6b004dbe74116604081a191a52d56 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 20 Nov 2018 16:23:28 +0100 Subject: [PATCH 003/295] Fixed most conversion warnings in MatrixFunctions module --- .../src/MatrixFunctions/MatrixFunction.h | 4 +- .../src/MatrixFunctions/MatrixLogarithm.h | 16 ++++---- .../Eigen/src/MatrixFunctions/MatrixPower.h | 13 +++--- unsupported/test/matrix_power.cpp | 40 +++++++++---------- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index 46f2720d0..cc12ab62b 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -72,10 +72,10 @@ MatrixType MatrixFunctionAtomic::compute(const MatrixType& A) MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows); MatrixType P = Ashifted; MatrixType Fincr; - for (Index s = 1; s < 1.1 * rows + 10; s++) { // upper limit is fairly arbitrary + for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary Fincr = m_f(avgEival, static_cast(s)) * P; F += Fincr; - P = 
Scalar(RealScalar(1.0/(s + 1))) * P * Ashifted; + P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted; // test whether Taylor series converged const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff(); diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index 79f3f957c..e917013e0 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -62,8 +62,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) else { // computation in previous branch is inaccurate if A(1,1) \approx A(0,0) - int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI))); - result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y; + RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); + result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y; } } @@ -135,7 +135,8 @@ void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree const int minPadeDegree = 3; const int maxPadeDegree = 11; assert(degree >= minPadeDegree && degree <= maxPadeDegree); - + // FIXME this creates float-conversion-warnings if these are enabled. + // Either manually convert each value, or disable the warning locally const RealScalar nodes[][maxPadeDegree] = { { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L, // degree 3 0.8872983346207416885179265399782400L }, @@ -232,12 +233,13 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result) int degree; MatrixType T = A, sqrtT; - int maxPadeDegree = matrix_log_max_pade_degree::value; - const RealScalar maxNormForPade = maxPadeDegree<= 5? 
5.3149729967117310e-1L: // single precision + const int maxPadeDegree = matrix_log_max_pade_degree::value; + const RealScalar maxNormForPade = RealScalar( + maxPadeDegree<= 5? 5.3149729967117310e-1L: // single precision maxPadeDegree<= 7? 2.6429608311114350e-1L: // double precision maxPadeDegree<= 8? 2.32777776523703892094e-1L: // extended precision maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L: // double-double - 1.1880960220216759245467951592883642e-1L; // quadruple precision + 1.1880960220216759245467951592883642e-1L); // quadruple precision while (true) { RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff(); @@ -254,7 +256,7 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result) } matrix_log_compute_pade(result, T, degree); - result *= pow(RealScalar(2), numberOfSquareRoots); + result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible } /** \ingroup MatrixFunctions_Module diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index 95f6fbca8..d7672d7c9 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -160,11 +160,11 @@ template void MatrixPowerAtomic::computePade(int degree, const MatrixType& IminusT, ResultType& res) const { int i = 2*degree; - res = (m_p-degree) / (2*i-2) * IminusT; + res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT; for (--i; i; --i) { res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView() - .solve((i==1 ? -m_p : i&1 ? (-m_p-i/2)/(2*i) : (m_p-i/2)/(2*i-2)) * IminusT).eval(); + .solve((i==1 ? -m_p : i&1 ? 
(-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval(); } res += MatrixType::Identity(IminusT.rows(), IminusT.cols()); } @@ -194,11 +194,12 @@ void MatrixPowerAtomic::computeBig(ResultType& res) const { using std::ldexp; const int digits = std::numeric_limits::digits; - const RealScalar maxNormForPade = digits <= 24? 4.3386528e-1L // single precision + const RealScalar maxNormForPade = RealScalar( + digits <= 24? 4.3386528e-1L // single precision : digits <= 53? 2.789358995219730e-1L // double precision : digits <= 64? 2.4471944416607995472e-1L // extended precision : digits <= 106? 1.1016843812851143391275867258512e-1L // double-double - : 9.134603732914548552537150753385375e-2L; // quadruple precision + : 9.134603732914548552537150753385375e-2L); // quadruple precision MatrixType IminusT, sqrtT, T = m_A.template triangularView(); RealScalar normIminusT; int degree, degree2, numberOfSquareRoots = 0; @@ -296,8 +297,8 @@ MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar logCurr = log(curr); ComplexScalar logPrev = log(prev); - int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); - ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber); + RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); + ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber); return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev); } diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index fa52d256e..dbaf9dbdf 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -19,7 +19,7 @@ void test2dRotation(const T& tol) MatrixPower > Apow(A); for (int i=0; i<=20; ++i) { - angle = 
std::pow(T(10), (i-10) / T(5.)); + angle = std::pow(T(10), T(i-10) / T(5.)); c = std::cos(angle); s = std::sin(angle); B << c, s, -s, c; @@ -61,7 +61,7 @@ void test3dRotation(const T& tol) for (int i=0; i<=20; ++i) { v = Matrix::Random(); v.normalize(); - angle = std::pow(T(10), (i-10) / T(5.)); + angle = std::pow(T(10), T(i-10) / T(5.)); VERIFY(AngleAxis(angle, v).matrix().isApprox(AngleAxis(1,v).matrix().pow(angle), tol)); } } @@ -153,52 +153,52 @@ typedef Matrix MatrixXe; EIGEN_DECLARE_TEST(matrix_power) { CALL_SUBTEST_2(test2dRotation(1e-13)); - CALL_SUBTEST_1(test2dRotation(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64 + CALL_SUBTEST_1(test2dRotation(2e-5f)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64 CALL_SUBTEST_9(test2dRotation(1e-13L)); CALL_SUBTEST_2(test2dHyperbolicRotation(1e-14)); - CALL_SUBTEST_1(test2dHyperbolicRotation(1e-5)); + CALL_SUBTEST_1(test2dHyperbolicRotation(1e-5f)); CALL_SUBTEST_9(test2dHyperbolicRotation(1e-14L)); CALL_SUBTEST_10(test3dRotation(1e-13)); - CALL_SUBTEST_11(test3dRotation(1e-5)); + CALL_SUBTEST_11(test3dRotation(1e-5f)); CALL_SUBTEST_12(test3dRotation(1e-13L)); CALL_SUBTEST_2(testGeneral(Matrix2d(), 1e-13)); CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13)); CALL_SUBTEST_3(testGeneral(Matrix4cd(), 1e-13)); CALL_SUBTEST_4(testGeneral(MatrixXd(8,8), 2e-12)); - CALL_SUBTEST_1(testGeneral(Matrix2f(), 1e-4)); - CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4)); - CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4)); - CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3)); // see bug 614 + CALL_SUBTEST_1(testGeneral(Matrix2f(), 1e-4f)); + CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4f)); + CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4f)); + CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3f)); // see bug 614 CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-13L)); CALL_SUBTEST_10(testGeneral(Matrix3d(), 1e-13)); - CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4)); + CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4f)); 
CALL_SUBTEST_12(testGeneral(Matrix3e(), 1e-13L)); CALL_SUBTEST_2(testSingular(Matrix2d(), 1e-13)); CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13)); CALL_SUBTEST_3(testSingular(Matrix4cd(), 1e-13)); CALL_SUBTEST_4(testSingular(MatrixXd(8,8), 2e-12)); - CALL_SUBTEST_1(testSingular(Matrix2f(), 1e-4)); - CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4)); - CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4)); - CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3)); + CALL_SUBTEST_1(testSingular(Matrix2f(), 1e-4f)); + CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4f)); + CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4f)); + CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3f)); CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-13L)); CALL_SUBTEST_10(testSingular(Matrix3d(), 1e-13)); - CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4)); + CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4f)); CALL_SUBTEST_12(testSingular(Matrix3e(), 1e-13L)); CALL_SUBTEST_2(testLogThenExp(Matrix2d(), 1e-13)); CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13)); CALL_SUBTEST_3(testLogThenExp(Matrix4cd(), 1e-13)); CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8), 2e-12)); - CALL_SUBTEST_1(testLogThenExp(Matrix2f(), 1e-4)); - CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4)); - CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4)); - CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3)); + CALL_SUBTEST_1(testLogThenExp(Matrix2f(), 1e-4f)); + CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4f)); + CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4f)); + CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3f)); CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-13L)); CALL_SUBTEST_10(testLogThenExp(Matrix3d(), 1e-13)); - CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4)); + CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4f)); CALL_SUBTEST_12(testLogThenExp(Matrix3e(), 1e-13L)); } From 4b2cebade8512abe05e94fd08ef901d818d8912b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 21 Nov 2018 15:53:37 +0100 Subject: [PATCH 004/295] 
Workaround weird MSVC bug --- Eigen/src/SVD/SVDBase.h | 4 +++- test/jacobi.cpp | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index 429414797..1aede5ab0 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -180,8 +180,10 @@ public: RealScalar threshold() const { eigen_assert(m_isInitialized || m_usePrescribedThreshold); + // this temporary is needed to workaround a MSVC issue + Index diagSize = (std::max)(1,m_diagSize); return m_usePrescribedThreshold ? m_prescribedThreshold - : (std::max)(1,m_diagSize)*NumTraits::epsilon(); + : diagSize*NumTraits::epsilon(); } /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */ diff --git a/test/jacobi.cpp b/test/jacobi.cpp index 5604797f5..27b6e46d9 100644 --- a/test/jacobi.cpp +++ b/test/jacobi.cpp @@ -57,6 +57,19 @@ void jacobi(const MatrixType& m = MatrixType()) } } +namespace Foo { +class Bar {}; +bool operator<(const Bar&, const Bar&) { return true; } +} +// regression test for a very strange MSVC issue for which simply +// including SVDBase.h messes up with std::max and custom scalar type +void msvc_workaround() +{ + const Foo::Bar a; + const Foo::Bar b; + std::max EIGEN_NOT_A_MACRO (a,b); +} + EIGEN_DECLARE_TEST(jacobi) { for(int i = 0; i < g_repeat; i++) { @@ -77,4 +90,6 @@ EIGEN_DECLARE_TEST(jacobi) TEST_SET_BUT_UNUSED_VARIABLE(r); TEST_SET_BUT_UNUSED_VARIABLE(c); } + + msvc_workaround(); } From c685fe98381cb0005ff4074d8b91b70559a89b1a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 21 Nov 2018 15:59:47 +0100 Subject: [PATCH 005/295] Move regression test to right unit test file --- test/jacobi.cpp | 15 --------------- test/jacobisvd.cpp | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/test/jacobi.cpp b/test/jacobi.cpp index 27b6e46d9..5604797f5 100644 --- a/test/jacobi.cpp +++ b/test/jacobi.cpp @@ -57,19 +57,6 @@ void jacobi(const MatrixType& m 
= MatrixType()) } } -namespace Foo { -class Bar {}; -bool operator<(const Bar&, const Bar&) { return true; } -} -// regression test for a very strange MSVC issue for which simply -// including SVDBase.h messes up with std::max and custom scalar type -void msvc_workaround() -{ - const Foo::Bar a; - const Foo::Bar b; - std::max EIGEN_NOT_A_MACRO (a,b); -} - EIGEN_DECLARE_TEST(jacobi) { for(int i = 0; i < g_repeat; i++) { @@ -90,6 +77,4 @@ EIGEN_DECLARE_TEST(jacobi) TEST_SET_BUT_UNUSED_VARIABLE(r); TEST_SET_BUT_UNUSED_VARIABLE(c); } - - msvc_workaround(); } diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index f9a59e0e7..41fd0631f 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -69,6 +69,19 @@ void jacobisvd_method() VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m); } +namespace Foo { +class Bar {}; +bool operator<(const Bar&, const Bar&) { return true; } +} +// regression test for a very strange MSVC issue for which simply +// including SVDBase.h messes up with std::max and custom scalar type +void msvc_workaround() +{ + const Foo::Bar a; + const Foo::Bar b; + std::max EIGEN_NOT_A_MACRO (a,b); +} + EIGEN_DECLARE_TEST(jacobisvd) { CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) )); @@ -122,4 +135,6 @@ EIGEN_DECLARE_TEST(jacobisvd) CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); + + msvc_workaround(); } From a4760548793811ee1accf8de05ff791a43d54be5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 23 Nov 2018 10:25:19 +0100 Subject: [PATCH 006/295] bug #1624: improve matrix-matrix product on ARM 64, 20% speedup --- .../Core/products/GeneralBlockPanelKernel.h | 53 +++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e7cab4720..b8b83c320 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ 
-15,7 +15,7 @@ namespace Eigen { namespace internal { -template +template class gebp_traits; @@ -347,7 +347,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -461,8 +461,8 @@ public: }; -template -class gebp_traits, RealScalar, _ConjLhs, false> +template +class gebp_traits, RealScalar, _ConjLhs, false, Arch> { public: typedef std::complex LhsScalar; @@ -597,8 +597,8 @@ template struct unpacket_traits > { typede // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs > +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs,Arch> { public: typedef std::complex Scalar; @@ -746,8 +746,8 @@ protected: conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs > +template +class gebp_traits, false, _ConjRhs, Arch> { public: typedef std::complex Scalar; @@ -852,7 +852,42 @@ protected: conj_helper cj; }; -/* optimized GEneral packed Block * packed Panel product kernel + +#if EIGEN_ARCH_ARM64 + +template<> +struct gebp_traits + : gebp_traits +{ + typedef float32x2_t RhsPacket; + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = vld1_f32(b); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const + { + c = vfmaq_lane_f32(c, a, b, 0); + } +}; + +#endif + +/* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments From 806352d8443b94417b1dfe863738f0edb278b5a1 Mon Sep 
17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 23 Nov 2018 12:34:27 +0000 Subject: [PATCH 007/295] Small typo found by Patrick Huber (pull request PR-547) --- doc/TutorialSlicingIndexing.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox index 3b60eac6e..98ace43e4 100644 --- a/doc/TutorialSlicingIndexing.dox +++ b/doc/TutorialSlicingIndexing.dox @@ -2,7 +2,7 @@ namespace Eigen { /** \eigenManualPage TutorialSlicingIndexing Slicing and Indexing -This pape presents the numerous possibilities offered by `operator()` to index sub-set of rows and columns. +This page presents the numerous possibilities offered by `operator()` to index sub-set of rows and columns. This API has been introduced in %Eigen 3.4. It supports all the feature proposed by the \link TutorialBlockOperations block API \endlink, and much more. In particular, it supports \b slicing that consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix or indexed from an array of indices. 
From ea60a172cf1c2b82d3d9b43a57e5a8ad391f1bdf Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 23 Nov 2018 14:24:22 +0100 Subject: [PATCH 008/295] Add default constructor to Bar to make test compile again with clang-3.8 --- test/jacobisvd.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index 41fd0631f..505bf57ae 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -70,7 +70,9 @@ void jacobisvd_method() } namespace Foo { -class Bar {}; +// older compiler require a default constructor for Bar +// cf: https://stackoverflow.com/questions/7411515/ +class Bar {public: Bar() {}}; bool operator<(const Bar&, const Bar&) { return true; } } // regression test for a very strange MSVC issue for which simply From a7842daef2c82a9be200dff54d455f6d4a0b199c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 23 Nov 2018 15:10:28 +0100 Subject: [PATCH 009/295] Fix several uninitialized member from ctor --- Eigen/src/Eigenvalues/EigenSolver.h | 2 +- Eigen/src/Eigenvalues/RealQZ.h | 13 ++++++---- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 9 ++++--- .../IncompleteCholesky.h | 4 +-- Eigen/src/PardisoSupport/PardisoSupport.h | 1 + Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 26 +++++++++++++++++-- Eigen/src/SVD/BDCSVD.h | 2 +- Eigen/src/SVD/SVDBase.h | 4 +++ Eigen/src/SparseCholesky/SimplicialCholesky.h | 12 +++++++-- 9 files changed, 57 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index 997bebe7b..572b29e4e 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -110,7 +110,7 @@ template class EigenSolver * * \sa compute() for an example. 
*/ - EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {} + EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {} /** \brief Default constructor with memory preallocation * diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index e2b37f40e..509130184 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -90,8 +90,9 @@ namespace Eigen { m_Z(size, size), m_workspace(size*2), m_maxIters(400), - m_isInitialized(false) - { } + m_isInitialized(false), + m_computeQZ(true) + {} /** \brief Constructor; computes real QZ decomposition of given matrices * @@ -108,9 +109,11 @@ namespace Eigen { m_Z(A.rows(),A.cols()), m_workspace(A.rows()*2), m_maxIters(400), - m_isInitialized(false) { - compute(A, B, computeQZ); - } + m_isInitialized(false), + m_computeQZ(true) + { + compute(A, B, computeQZ); + } /** \brief Returns matrix Q in the QZ decomposition. * diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index f95606206..9bbce652f 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -122,7 +122,8 @@ template class SelfAdjointEigenSolver m_eivalues(), m_subdiag(), m_info(InvalidInput), - m_isInitialized(false) + m_isInitialized(false), + m_eigenvectorsOk(false) { } /** \brief Constructor, pre-allocates memory for dynamic-size matrices. @@ -142,7 +143,8 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), - m_isInitialized(false) + m_isInitialized(false), + m_eigenvectorsOk(false) {} /** \brief Constructor; computes eigendecomposition of given matrix. @@ -166,7 +168,8 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? 
matrix.rows() - 1 : 1), - m_isInitialized(false) + m_isInitialized(false), + m_eigenvectorsOk(false) { compute(matrix.derived(), options); } diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index e45c272b4..5a827c52c 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -76,12 +76,12 @@ class IncompleteCholesky : public SparseSolverBase - IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false) + IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) { compute(matrix); } diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index fb2ba04b4..70afcb3fe 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -123,6 +123,7 @@ class PardisoImpl : public SparseSolverBase }; PardisoImpl() + : m_analysisIsOk(false), m_factorizationIsOk(false), m_pt(0) { eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type"); m_iparm.setZero(); diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 1a5c5254e..013c7ae7a 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -74,13 +74,35 @@ class SPQR : public SparseSolverBase > }; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) + : m_analysisIsOk(false), + m_factorizationIsOk(false), + m_isRUpToDate(false), + m_ordering(SPQR_ORDERING_DEFAULT), + m_allow_tol(SPQR_DEFAULT_TOL), + m_tolerance (NumTraits::epsilon()), + m_cR(0), + m_E(0), + m_H(0), + m_HPinv(0), + m_HTau(0), + m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } 
explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) + : m_analysisIsOk(false), + m_factorizationIsOk(false), + m_isRUpToDate(false), + m_ordering(SPQR_ORDERING_DEFAULT), + m_allow_tol(SPQR_DEFAULT_TOL), + m_tolerance (NumTraits::epsilon()), + m_cR(0), + m_E(0), + m_H(0), + m_HPinv(0), + m_HTau(0), + m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 4daa9dd21..18d7bdc0a 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -110,7 +110,7 @@ public: * The default constructor is useful in cases in which the user intends to * perform decompositions via BDCSVD::compute(const MatrixType&). */ - BDCSVD() : m_algoswap(16), m_numIters(0) + BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) {} diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index 1aede5ab0..851ad6836 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -245,6 +245,10 @@ protected: : m_isInitialized(false), m_isAllocated(false), m_usePrescribedThreshold(false), + m_computeFullU(false), + m_computeThinU(false), + m_computeFullV(false), + m_computeThinV(false), m_computationOptions(0), m_rows(-1), m_cols(-1), m_diagSize(0) { diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index b9ca94bc3..1ee4fad5d 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -80,11 +80,19 @@ class SimplicialCholeskyBase : public SparseSolverBase /** Default constructor */ SimplicialCholeskyBase() - : m_info(Success), m_shiftOffset(0), m_shiftScale(1) + : m_info(Success), + m_factorizationIsOk(false), + m_analysisIsOk(false), + m_shiftOffset(0), + m_shiftScale(1) {} explicit SimplicialCholeskyBase(const MatrixType& matrix) - 
: m_info(Success), m_shiftOffset(0), m_shiftScale(1) + : m_info(Success), + m_factorizationIsOk(false), + m_analysisIsOk(false), + m_shiftOffset(0), + m_shiftScale(1) { derived().compute(matrix); } From 354f14293b8a193a6f01d45081db684521ac3c0e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 23 Nov 2018 15:12:06 +0100 Subject: [PATCH 010/295] Fix double = bool ! --- test/umeyama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/umeyama.cpp b/test/umeyama.cpp index 1590a0a81..170c28a61 100644 --- a/test/umeyama.cpp +++ b/test/umeyama.cpp @@ -27,7 +27,7 @@ Eigen::Matrix randMatrixUnitary(int size) MatrixType Q; int max_tries = 40; - double is_unitary = false; + bool is_unitary = false; while (!is_unitary && max_tries > 0) { From 572d62697dd33acbf97e4a14e8aea06e82d6095d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 23 Nov 2018 15:37:09 +0100 Subject: [PATCH 011/295] check two ctors --- test/ctorleak.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/test/ctorleak.cpp b/test/ctorleak.cpp index 7202e90dd..73904176b 100644 --- a/test/ctorleak.cpp +++ b/test/ctorleak.cpp @@ -8,7 +8,7 @@ struct Foo static Index object_limit; int dummy; - Foo() + Foo() : dummy(0) { #ifdef EIGEN_EXCEPTIONS // TODO: Is this the correct way to handle this? 
@@ -37,22 +37,33 @@ EIGEN_DECLARE_TEST(ctorleak) { typedef Matrix MatrixX; typedef Matrix VectorX; + Foo::object_count = 0; for(int i = 0; i < g_repeat; i++) { Index rows = internal::random(2,EIGEN_TEST_MAX_SIZE), cols = internal::random(2,EIGEN_TEST_MAX_SIZE); - Foo::object_limit = internal::random(0, rows*cols - 2); + Foo::object_limit = rows*cols; + { + MatrixX r(rows, cols); + Foo::object_limit = r.size()+internal::random(0, rows*cols - 2); std::cout << "object_limit =" << Foo::object_limit << std::endl; #ifdef EIGEN_EXCEPTIONS try { #endif - std::cout << "\nMatrixX m(" << rows << ", " << cols << ");\n"; - MatrixX m(rows, cols); + if(internal::random()) { + std::cout << "\nMatrixX m(" << rows << ", " << cols << ");\n"; + MatrixX m(rows, cols); + } + else { + std::cout << "\nMatrixX m(r);\n"; + MatrixX m(r); + } #ifdef EIGEN_EXCEPTIONS VERIFY(false); // not reached if exceptions are enabled } catch (const Foo::Fail&) { /* ignore */ } #endif + } VERIFY_IS_EQUAL(Index(0), Foo::object_count); { @@ -66,4 +77,5 @@ EIGEN_DECLARE_TEST(ctorleak) } VERIFY_IS_EQUAL(Index(0), Foo::object_count); } + std::cout << "\n"; } From ccabdd88c9bc6ab466bdbbd86ed9ecbcc2c11cb3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 23 Nov 2018 16:01:47 +0100 Subject: [PATCH 012/295] Fix reserved usage of double __ in macro names --- Eigen/src/Core/Assign_MKL.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 6866095bf..c6140d185 100755 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h @@ -68,16 +68,16 @@ class vml_assign_traits #define EIGEN_PP_EXPAND(ARG) ARG #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1) -#define EIGEN_VMLMODE_EXPAND_LA , VML_HA +#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA #else -#define EIGEN_VMLMODE_EXPAND_LA , VML_LA +#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA #endif -#define EIGEN_VMLMODE_EXPAND__ +#define EIGEN_VMLMODE_EXPAND_x_ 
-#define EIGEN_VMLMODE_PREFIX_LA vm -#define EIGEN_VMLMODE_PREFIX__ v -#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE) +#define EIGEN_VMLMODE_PREFIX_xLA vm +#define EIGEN_VMLMODE_PREFIX_x_ v +#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE) #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \ template< typename DstXprType, typename SrcXprNested> \ @@ -89,7 +89,7 @@ class vml_assign_traits eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -97,7 +97,7 @@ class vml_assign_traits &(src.nestedExpression().coeffRef(0, outer)); \ EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ } \ } \ } \ @@ -152,7 +152,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) if(vml_assign_traits::Traversal==LinearTraversal) \ { \ VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent, \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -160,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) &(src.lhs().coeffRef(0, outer)); \ EIGENTYPE *dst_ptr = dst.IsRowMajor ? 
&(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ } \ } \ } \ From 95566eeed4793067e0f8db45f6b8a139b18d1201 Mon Sep 17 00:00:00 2001 From: Patrik Huber Date: Fri, 23 Nov 2018 22:22:14 +0000 Subject: [PATCH 013/295] Fix typos --- Eigen/src/Core/util/IndexedViewHelper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h index 40e16fdb4..1cda85060 100644 --- a/Eigen/src/Core/util/IndexedViewHelper.h +++ b/Eigen/src/Core/util/IndexedViewHelper.h @@ -23,7 +23,7 @@ struct symbolic_last_tag {}; * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&). * - * This symbolic placeholder support standard arithmetic operation. + * This symbolic placeholder supports standard arithmetic operations. * * A typical usage example would be: * \code @@ -44,7 +44,7 @@ static const symbolic::SymbolExpr last; // PLEASE u * reference the last+1 element/row/columns of the underlying vector or matrix once * passed to DenseBase::operator()(const RowIndices&, const ColIndices&). * - * This symbolic placeholder support standard arithmetic operation. + * This symbolic placeholder supports standard arithmetic operations. * It is essentially an alias to last+fix<1>. 
* * \sa last From 0836a715d632d980d935cd55f4026ba4ec047cdf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 09:08:38 +0100 Subject: [PATCH 014/295] bug #1611: fix plog(0) on NEON --- Eigen/src/Core/arch/NEON/MathFunctions.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index c48c61023..d218c8851 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -91,6 +91,7 @@ Packet4f plog(const Packet4f& _x) _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + const Packet4f p4f_minus_inf = vreinterpretq_f32_s32(pset1(0xff800000)); _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000); @@ -111,7 +112,8 @@ Packet4f plog(const Packet4f& _x) _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ - Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + Packet4f iszero_mask = vreinterpretq_f32_u32(vceqq_f32(_x, vdupq_n_f32(0))); + Packet4f invalid_mask = vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(_x, vdupq_n_f32(0)))); Packet4i ux = vreinterpretq_s32_f32(x); @@ -172,7 +174,8 @@ Packet4f plog(const Packet4f& _x) tmp = vmulq_f32(e, p4f_cephes_log_q2); x = vaddq_f32(x, y); x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + x = por(x, invalid_mask); + x = por(pandnot(x,iszero_mask), pand(iszero_mask, p4f_minus_inf)); return x; } From 382279eb7f0160b1b20a0e1b95df2397277ede08 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 14:10:07 +0100 Subject: [PATCH 015/295] Extend unit test to recursively check half-packet types and non packet types --- test/packetmath.cpp | 169 +++++++++++++++++++++++++++++--------------- 1 file changed, 112 insertions(+), 
57 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index babb7c20e..43c33ba94 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -10,6 +10,7 @@ #include "main.h" #include "unsupported/Eigen/SpecialFunctions" +#include #if defined __GNUC__ && __GNUC__>=6 #pragma GCC diagnostic ignored "-Wignored-attributes" @@ -22,6 +23,8 @@ const bool g_vectorize_sse = true; const bool g_vectorize_sse = false; #endif +bool g_first_pass = true; + namespace Eigen { namespace internal { template T negate(const T& x) { return -x; } @@ -109,14 +112,18 @@ struct packet_helper #define REF_MUL(a,b) ((a)*(b)) #define REF_DIV(a,b) ((a)/(b)) -template void packetmath() +template void packetmath() { using std::abs; typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; + const int PacketSize = internal::unpacket_traits::size; typedef typename NumTraits::Real RealScalar; + if (g_first_pass) + std::cerr << "=== Testing packet of type '" << typeid(Packet).name() + << "' and scalar type '" << typeid(Scalar).name() + << "' and size '" << PacketSize << "' ===\n" ; + const int max_size = PacketSize > 4 ? PacketSize : 4; const int size = PacketSize*max_size; EIGEN_ALIGN_MAX Scalar data1[size]; @@ -254,7 +261,7 @@ template void packetmath() ref[0] += data1[i]; VERIFY(isApproxAbs(ref[0], internal::predux(internal::pload(data1)), refvalue) && "internal::predux"); - if(PacketSize==8 && internal::unpacket_traits::half>::size ==4) // so far, predux_half_dowto4 is only required in such a case + if(PacketSize==8 && internal::unpacket_traits::half>::size ==4) // so far, predux_half_downto4 is only required in such a case { int HalfPacketSize = PacketSize>4 ? 
PacketSize/2 : PacketSize; for (int i=0; i void packetmath() } } -template void packetmath_real() +template void packetmath_real() { using std::abs; typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; + const int PacketSize = internal::unpacket_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4]; + EIGEN_ALIGN_MAX Scalar data1[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar data2[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar ref[PacketSize*4]; for (int i=0; i void packetmath_real() data2[i] = internal::random(-1,1) * std::pow(Scalar(10), internal::random(-6,6)); } CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh); - if(PacketTraits::HasExp && PacketTraits::size>=2) + if(PacketTraits::HasExp && PacketSize>=2) { data1[0] = std::numeric_limits::quiet_NaN(); data1[1] = std::numeric_limits::epsilon(); @@ -455,7 +461,7 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); #endif - if(PacketTraits::HasLog && PacketTraits::size>=2) + if(PacketTraits::HasLog && PacketSize>=2) { data1[0] = std::numeric_limits::quiet_NaN(); data1[1] = std::numeric_limits::epsilon(); @@ -497,18 +503,17 @@ template void packetmath_real() } } -template void packetmath_notcomplex() +template void packetmath_notcomplex() { using std::abs; typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; + const int PacketSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4]; - EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4]; + EIGEN_ALIGN_MAX Scalar data1[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar data2[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar 
ref[PacketSize*4]; - Array::Map(data1, PacketTraits::size*4).setRandom(); + Array::Map(data1, PacketSize*4).setRandom(); ref[0] = data1[0]; for (int i=0; i void packetmath_notcomplex() VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset"); } -template void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) +template void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) { - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; + const int PacketSize = internal::unpacket_traits::size; internal::conj_if cj0; internal::conj_if cj1; @@ -562,11 +565,9 @@ template void test_conj_helper(Scalar VERIFY(areApprox(ref, pval, PacketSize) && "conj_helper pmadd"); } -template void packetmath_complex() +template void packetmath_complex() { - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; - const int PacketSize = PacketTraits::size; + const int PacketSize = internal::unpacket_traits::size; const int size = PacketSize*4; EIGEN_ALIGN_MAX Scalar data1[PacketSize*4]; @@ -580,10 +581,10 @@ template void packetmath_complex() data2[i] = internal::random() * Scalar(1e2); } - test_conj_helper (data1,data2,ref,pval); - test_conj_helper (data1,data2,ref,pval); - test_conj_helper (data1,data2,ref,pval); - test_conj_helper (data1,data2,ref,pval); + test_conj_helper (data1,data2,ref,pval); + test_conj_helper (data1,data2,ref,pval); + test_conj_helper (data1,data2,ref,pval); + test_conj_helper (data1,data2,ref,pval); { for(int i=0;i void packetmath_complex() } } -template void packetmath_scatter_gather() +template void packetmath_scatter_gather() { - typedef internal::packet_traits PacketTraits; - typedef typename PacketTraits::type Packet; typedef typename NumTraits::Real RealScalar; - const int PacketSize = PacketTraits::size; + const int PacketSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX Scalar 
data1[PacketSize]; RealScalar refvalue = 0; for (int i=0; i void packetmath_scatter_gather() } } + +template< + typename Scalar, + typename PacketType, + bool IsComplex = NumTraits::IsComplex, + bool IsInteger = NumTraits::IsInteger> +struct runall; + +template +struct runall { // i.e. float or double + static void run() { + packetmath(); + packetmath_scatter_gather(); + packetmath_notcomplex(); + packetmath_real(); + } +}; + +template +struct runall { // i.e. int + static void run() { + packetmath(); + packetmath_scatter_gather(); + packetmath_notcomplex(); + } +}; + +template +struct runall { // i.e. complex + static void run() { + packetmath(); + packetmath_scatter_gather(); + packetmath_complex(); + } +}; + +template< + typename Scalar, + typename PacketType = typename internal::packet_traits::type, + bool Vectorized = internal::packet_traits::Vectorizable, + bool HasHalf = !internal::is_same::half,PacketType>::value > +struct runner; + +template +struct runner +{ + static void run() { + runall::run(); + runner::half>::run(); + } +}; + +template +struct runner +{ + static void run() { + runall::run(); + runall::run(); + } +}; + +template +struct runner +{ + static void run() { + runall::run(); + } +}; + EIGEN_DECLARE_TEST(packetmath) { + g_first_pass = true; for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_1( packetmath() ); - CALL_SUBTEST_2( packetmath() ); - CALL_SUBTEST_3( packetmath() ); - CALL_SUBTEST_4( packetmath >() ); - CALL_SUBTEST_5( packetmath >() ); - CALL_SUBTEST_6( packetmath() ); - - CALL_SUBTEST_1( packetmath_notcomplex() ); - CALL_SUBTEST_2( packetmath_notcomplex() ); - CALL_SUBTEST_3( packetmath_notcomplex() ); - - CALL_SUBTEST_1( packetmath_real() ); - CALL_SUBTEST_2( packetmath_real() ); - - CALL_SUBTEST_4( packetmath_complex >() ); - CALL_SUBTEST_5( packetmath_complex >() ); - - CALL_SUBTEST_1( packetmath_scatter_gather() ); - CALL_SUBTEST_2( packetmath_scatter_gather() ); - CALL_SUBTEST_3( packetmath_scatter_gather() ); - 
CALL_SUBTEST_4( packetmath_scatter_gather >() ); - CALL_SUBTEST_5( packetmath_scatter_gather >() ); + + CALL_SUBTEST_1( runner::run() ); + CALL_SUBTEST_2( runner::run() ); + CALL_SUBTEST_3( runner::run() ); + CALL_SUBTEST_4( runner >::run() ); + CALL_SUBTEST_5( runner >::run() ); + CALL_SUBTEST_6(( packetmath::type>() )); + g_first_pass = false; } } From 5f6045077cd23e339b66b0b0a7c47c5eede752e3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 14:14:07 +0100 Subject: [PATCH 016/295] Make SSE/AVX pandnot(A,B) consistent with generic version, i.e., "A and not B" --- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 774e64981..c291b2acd 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -217,8 +217,8 @@ template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const P template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } template<> EIGEN_STRONG_INLINE Packet8f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } diff 
--git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 99d55d5e9..f88e27fd2 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -351,9 +351,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } From 2c44c401146194b9a010a1e3a4bdb5118f9f46e7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 14:21:24 +0100 Subject: [PATCH 017/295] First step toward a unification of packet log implementation, currently only SSE and AVX are unified. To this end, I added the following functions: pzero, pcmp_*, pfrexp, pset1frombits functions. 
--- Eigen/src/Core/GenericPacketMath.h | 59 ++++++++++ Eigen/src/Core/arch/AVX/MathFunctions.h | 100 +---------------- Eigen/src/Core/arch/AVX/PacketMath.h | 26 +++++ .../arch/Default/GenericPacketMathFunctions.h | 105 ++++++++++++++++++ Eigen/src/Core/arch/SSE/MathFunctions.h | 92 ++------------- Eigen/src/Core/arch/SSE/PacketMath.h | 21 +++- 6 files changed, 220 insertions(+), 183 deletions(-) create mode 100644 Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index da1350f1b..316b03f96 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,6 +214,38 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (!b); } +/** \internal \returns the significant and exponent of the underlying floating point numbers + * See https://en.cppreference.com/w/cpp/numeric/math/frexp + */ +template EIGEN_DEVICE_FUNC inline Packet +pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); } + +/** \internal \returns zeros */ +template EIGEN_DEVICE_FUNC inline Packet +pzero(const Packet& a) { return pxor(a,a); } + +/** \internal \returns bits of \a or \b according to the input bit mask \a mask */ +template EIGEN_DEVICE_FUNC inline Packet +pselect(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); +} + +/** \internal \returns a <= b as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ + +/** \internal \returns a < b as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet +pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? 
pnot(pxor(a,a)) : pxor(a,a); } */ + +/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ + /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet pload(const typename unpacket_traits::type* from) { return *from; } @@ -226,6 +258,10 @@ ploadu(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } @@ -597,6 +633,29 @@ pinsertlast(const Packet& a, typename unpacket_traits::type b) return pblend(mask, pset1(b), a); } +/*************************************************************************** + * Some generic implementations to be used by implementors +***************************************************************************/ + +/** \internal shift the bits by n and cast the result to the initial type, i.e.: + * return float(reinterpret_cast(a) >> n) + */ +template EIGEN_DEVICE_FUNC inline Packet +pshiftright_and_cast(Packet a, int n); + +/** Default implementation of pfrexp for float. + * It is expected to be called by implementers of template<> pfrexp, + * and the above pshiftright_and_cast function must be implemented. 
+ */ +template EIGEN_STRONG_INLINE Packet +pfrexp_float(const Packet& a, Packet& exponent) { + const Packet cst_126f = pset1(126.0f); + const Packet cst_half = pset1(0.5f); + const Packet cst_inv_mant_mask = pset1frombits(~0x7f800000u); + exponent = psub(pshiftright_and_cast(a,23), cst_126f); + return por(pand(a, cst_inv_mant_mask), cst_half); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 6af67ce2d..134facccd 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,7 +10,7 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -/* The sin, cos, exp, and log functions of this file are loosely derived from +/* The sin, cos, and exp functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -29,17 +29,6 @@ inline Packet8i pshiftleft(Packet8i v, int n) #endif } -inline Packet8f pshiftright(Packet8f v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); - return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif -} - // Sine function // Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and // evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants @@ -110,95 +99,10 @@ psin(const Packet8f& _x) { return res; } -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. 
-// TODO(gonnet): Further reduce the interval allowing for lower-degree -// polynomial interpolants -> ... -> profit! template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f plog(const Packet8f& _x) { - Packet8f x = _x; - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); - - // Polynomial coefficients. - _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); - - Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN - Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p8f_min_norm_pos); - - Packet8f emm0 = pshiftright(x,23); - Packet8f e = _mm256_sub_ps(emm0, p8f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). 
- x = _mm256_and_ps(x, p8f_inv_mant_mask); - x = _mm256_or_ps(x, p8f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); - Packet8f tmp = _mm256_and_ps(x, mask); - x = psub(x, p8f_1); - e = psub(e, _mm256_and_ps(p8f_1, mask)); - x = padd(x, tmp); - - Packet8f x2 = pmul(x, x); - Packet8f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet8f y, y1, y2; - y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); - y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); - y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); - y = pmadd(y, x, p8f_cephes_log_p2); - y1 = pmadd(y1, x, p8f_cephes_log_p5); - y2 = pmadd(y2, x, p8f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p8f_cephes_log_q1); - tmp = pmul(x2, p8f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p8f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm256_or_ps( - _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), - _mm256_and_ps(iszero_mask, p8f_minus_inf)); + return plog_float(_x); } // Exponential function. 
Works by writing "x = m*log(2) + r" where diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index c291b2acd..4b5bfebdf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -121,6 +121,11 @@ template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet8f pset1frombits(unsigned int from) { return _mm256_castsi256_ps(pset1(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); } + template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } @@ -199,6 +204,12 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const // Arguments are swapped to match NaN propagation behavior of std::max. 
return _mm256_max_pd(b,a); } + +template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } + template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } @@ -363,6 +374,21 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) return _mm256_and_pd(a,mask); } +template<> EIGEN_STRONG_INLINE Packet8f pshiftright_and_cast(Packet8f v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); + return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { + return pfrexp_float(a,exponent); +} + // preduxp should be ok // FIXME: why is this ok? why isn't the simply implementation working as expected? 
template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h new file mode 100644 index 000000000..e384b8288 --- /dev/null +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2009-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The log function of this file initially comes from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +namespace Eigen { +namespace internal { + +// Natural logarithm +// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. +// TODO(gonnet): Further reduce the interval allowing for lower-degree +// polynomial interpolants -> ... -> profit! +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x) { + Packet x = _x; + + const Packet cst_1 = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + //const Packet cst_126f = pset1(126.0f); + // The smallest non denormalized float number. + const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); + const Packet cst_minus_inf = pset1frombits( 0xff800000u); + + // Polynomial coefficients. 
+ const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); + const Packet cst_cephes_log_p0 = pset1(7.0376836292E-2f); + const Packet cst_cephes_log_p1 = pset1(-1.1514610310E-1f); + const Packet cst_cephes_log_p2 = pset1(1.1676998740E-1f); + const Packet cst_cephes_log_p3 = pset1(-1.2420140846E-1f); + const Packet cst_cephes_log_p4 = pset1(+1.4249322787E-1f); + const Packet cst_cephes_log_p5 = pset1(-1.6668057665E-1f); + const Packet cst_cephes_log_p6 = pset1(+2.0000714765E-1f); + const Packet cst_cephes_log_p7 = pset1(-2.4999993993E-1f); + const Packet cst_cephes_log_p8 = pset1(+3.3333331174E-1f); + const Packet cst_cephes_log_q1 = pset1(-2.12194440e-4f); + const Packet cst_cephes_log_q2 = pset1(0.693359375f); + + Packet invalid_mask = pcmp_lt_or_nan(x, pzero(x)); + Packet iszero_mask = pcmp_eq(x,pzero(x)); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. 
+ Packet y, y1, y2; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y2 = pmadd(y2, x, cst_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + // Add the logarithm of the exponent back to the result of the interpolation. + y1 = pmul(e, cst_cephes_log_q1); + tmp = pmul(x2, cst_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, cst_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + + // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask)); +} + +} // end namespace internal +} // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 4af2c6cae..cba365cc5 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -8,13 +8,15 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-/* The sin, cos, exp, and log functions of this file come from +/* The sin, cos and exp functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ #ifndef EIGEN_MATH_FUNCTIONS_SSE_H #define EIGEN_MATH_FUNCTIONS_SSE_H +#include "../Default/GenericPacketMathFunctions.h" + namespace Eigen { namespace internal { @@ -22,85 +24,7 @@ namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - - - Packet4i emm0; - - Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN - Packet4f iszero_mask = _mm_cmpeq_ps(x, 
_mm_setzero_ps()); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); - - /* keep only the fractional part */ - x = _mm_and_ps(x, p4f_inv_mant_mask); - x = _mm_or_ps(x, p4f_half); - - emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), - _mm_and_ps(iszero_mask, p4f_minus_inf)); + return plog_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED @@ -266,7 +190,7 @@ Packet4f psin(const Packet4f& _x) _EIGEN_DECLARE_CONST_Packet4i(2, 2); _EIGEN_DECLARE_CONST_Packet4i(4, 4); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u); _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); @@ -482,11 +406,11 @@ Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } template<> 
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); Packet4f neg_half = pmul(_x, p4f_minus_half); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f88e27fd2..9959005a3 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -83,7 +83,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet2d p2d_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) + const Packet4f p4f_##NAME = pset1frombits(X) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -180,6 +180,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { re template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } + +template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } + // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. 
// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) // Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. @@ -328,6 +333,12 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } + + #ifdef EIGEN_VECTORIZE_SSE4_1 template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } @@ -517,6 +528,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n) { + return _mm_cvtepi32_ps(_mm_srli_epi32(_mm_castps_si128(a),n)); +} + +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_float(a,exponent); +} + // with AVX, the default implementations based on pload1 are faster #ifndef __AVX__ template<> EIGEN_STRONG_INLINE void From c24e98e6a83dbd9cb305941a144fa1f4b21c6437 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 15:02:16 +0100 Subject: [PATCH 018/295] Unify NEON's plog with generic implementation --- Eigen/src/Core/arch/NEON/MathFunctions.h | 95 +----------------------- Eigen/src/Core/arch/NEON/PacketMath.h | 15 ++++ 2 files changed, 19 insertions(+), 91 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index 
d218c8851..8a70b771c 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -5,13 +5,15 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from +/* The exp function of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ #ifndef EIGEN_MATH_FUNCTIONS_NEON_H #define EIGEN_MATH_FUNCTIONS_NEON_H +#include "../Default/GenericPacketMathFunctions.h" + namespace Eigen { namespace internal { @@ -87,96 +89,7 @@ Packet4f pexp(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - const Packet4f p4f_minus_inf = vreinterpretq_f32_s32(pset1(0xff800000)); - - _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - - x = vmaxq_f32(x, 
vdupq_n_f32(0)); /* force flush to zero on denormal values */ - Packet4f iszero_mask = vreinterpretq_f32_u32(vceqq_f32(_x, vdupq_n_f32(0))); - Packet4f invalid_mask = vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(_x, vdupq_n_f32(0)))); - - Packet4i ux = vreinterpretq_s32_f32(x); - - Packet4i emm0 = vshrq_n_s32(ux, 23); - - /* keep only the fractional part */ - ux = vandq_s32(ux, p4i_inv_mant_mask); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half)); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, p4i_0x7f); - Packet4f e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF); - Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, p4f_1); - e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask))); - x = vaddq_f32(x, tmp); - - Packet4f z = vmulq_f32(x,x); - - Packet4f y = p4f_cephes_log_p0; - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p5); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p6); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p7); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_log_p8); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, p4f_cephes_log_q1); - y = vaddq_f32(y, tmp); - - - tmp = vmulq_f32(z, p4f_half); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, p4f_cephes_log_q2); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = por(x, invalid_mask); - x = por(pandnot(x,iszero_mask), pand(iszero_mask, p4f_minus_inf)); - return x; + return plog_float(_x); } } // end namespace internal diff --git 
a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 010739380..72f076e50 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -145,6 +145,8 @@ template<> struct unpacket_traits { typedef int32_t type; enum {size=4 template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return vreinterpretq_f32_u32(vdupq_n_u32(from)); } + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { const float f[] = {0, 1, 2, 3}; @@ -249,6 +251,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcleq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcltq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vceqq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } + // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { @@ -364,6 +371,14 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4i 
pabs(const Packet4i& a) { return vabsq_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n) { + return vcvtq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a),n)); +} + +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_float(a,exponent); +} + template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { float32x2_t a_lo, a_hi, sum; From c2f35b1b4763348fd0a6df2ce750a7d3d3a56d79 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 15:58:11 +0100 Subject: [PATCH 019/295] Unify Altivec/VSX's plog with generic implementation, and enable it! --- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 61 ++------------------- Eigen/src/Core/arch/AltiVec/PacketMath.h | 28 +++++++++- 2 files changed, 31 insertions(+), 58 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index c5e4bede7..8f3296253 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -9,13 +9,15 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from +/* The sin, cos, and exp functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#include "../Default/GenericPacketMathFunctions.h" + namespace Eigen { namespace internal { @@ -94,62 +96,7 @@ static Packet2ul p2ul_52 = { 52, 52 }; template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { - Packet4f x = _x; - - Packet4i emm0; - - /* isvalid_mask is 0 if x < 0 or x is NaN. 
*/ - Packet4ui isvalid_mask = reinterpret_cast(vec_cmpge(x, p4f_ZERO)); - Packet4ui iszero_mask = reinterpret_cast(vec_cmpeq(x, p4f_ZERO)); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = vec_sr(reinterpret_cast(x), - reinterpret_cast(p4i_23)); - - /* keep only the fractional part */ - x = pand(x, p4f_inv_mant_mask); - x = por(x, p4f_half); - - emm0 = psub(emm0, p4i_0x7f); - Packet4f e = padd(vec_ctf(emm0, 0), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = reinterpret_cast(vec_cmplt(x, p4f_cephes_SQRTHF)); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - x = vec_sel(x, p4f_minus_inf, iszero_mask); - x = vec_sel(p4f_minus_nan, x, isvalid_mask); - return x; + return plog_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 7f4e90f75..867aa8494 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -148,7 +148,7 @@ template<> struct packet_traits : default_packet_traits HasAbs = 1, HasSin = 0, HasCos = 0, - HasLog = 0, + HasLog = 1, HasExp = 1, #ifdef __VSX__ HasSqrt = 1, @@ -285,6 
+285,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { Packet4i v = {from, from, from, from}; return v; } + +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { + return reinterpret_cast(pset1(from)); +} + template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) @@ -414,6 +419,14 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { + Packet4f c = reinterpret_cast(vec_cmpge(a,b)); + return vec_nor(c,c); +} + template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } @@ -426,6 +439,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { + return vec_sel(b, a, mask); +} + template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) 
{ return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } @@ -550,6 +567,15 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n) { + return vec_ctf(vec_sr(reinterpret_cast(a), + reinterpret_cast(pset1(n))),0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_float(a,exponent); +} + template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f b, sum; From cf8b85d5c5d1896ce1759a8c18beb56e8a71dea2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 16:36:19 +0100 Subject: [PATCH 020/295] Unify SSE and AVX implementation of pexp --- Eigen/src/Core/GenericPacketMath.h | 23 +++++++ Eigen/src/Core/arch/AVX/MathFunctions.h | 57 +--------------- Eigen/src/Core/arch/AVX/PacketMath.h | 16 +++++ .../arch/Default/GenericPacketMathFunctions.h | 65 ++++++++++++++++++- Eigen/src/Core/arch/SSE/MathFunctions.h | 62 +----------------- Eigen/src/Core/arch/SSE/PacketMath.h | 25 ++++++- 6 files changed, 126 insertions(+), 122 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 316b03f96..e8e7fa4d3 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -220,6 +220,12 @@ pandnot(const Packet& a, const Packet& b) { return a & (!b); } template EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); } +/** \internal \returns a * 2^exponent + * See https://en.cppreference.com/w/cpp/numeric/math/ldexp + */ +template EIGEN_DEVICE_FUNC inline Packet +pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); } + /** 
\internal \returns zeros */ template EIGEN_DEVICE_FUNC inline Packet pzero(const Packet& a) { return pxor(a,a); } @@ -656,6 +662,23 @@ pfrexp_float(const Packet& a, Packet& exponent) { return por(pand(a, cst_inv_mant_mask), cst_half); } +/** \internal shift the bits by n and cast the result to the initial type, i.e.: + * return reinterpret_cast(int(a) >> n) + */ +template EIGEN_DEVICE_FUNC inline Packet +pcast_and_shiftleft(Packet a, int n); + +/** Default implementation of pldexp for float. + * It is expected to be called by implementers of template<> pldexp, + * and the above pcast_and_shiftleft function must be implemented. + */ +template EIGEN_STRONG_INLINE Packet +pldexp_float(Packet a, Packet exponent) { + const Packet cst_127 = pset1(127.f); + // return a * 2^exponent + return pmul(a, pcast_and_shiftleft(padd(exponent, cst_127), 23)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 134facccd..b038c7499 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -111,62 +111,7 @@ plog(const Packet8f& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f pexp(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); - - _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); - 
_EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); - - // Clamp x. - Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); - -// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is -// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating -// truncation errors. Note that we don't use the "pmadd" function here to -// ensure that a precision-preserving FMA instruction is used. -#ifdef EIGEN_VECTORIZE_FMA - _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); - Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); -#else - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); - Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); - r = psub(r, pmul(m, p8f_cephes_exp_C2)); -#endif - - Packet8f r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet8f y = p8f_cephes_exp_p0; - y = pmadd(y, r, p8f_cephes_exp_p1); - y = pmadd(y, r, p8f_cephes_exp_p2); - y = pmadd(y, r, p8f_cephes_exp_p3); - y = pmadd(y, r, p8f_cephes_exp_p4); - y = pmadd(y, r, p8f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p8f_1); - - // Build emm0 = 2^m. - Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); - emm0 = pshiftleft(emm0, 23); - - // Return 2^m * exp(r). - return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); + return pexp_float(_x); } // Hyperbolic Tangent function. 
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 4b5bfebdf..770646b91 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -389,6 +389,22 @@ template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Pack return pfrexp_float(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet8f pcast_and_shiftleft(Packet8f v, int n) +{ + Packet8i vi = _mm256_cvttps_epi32(v); +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_castsi256_ps(_mm256_slli_epi32(vi, n)); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(vi, 0), n); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(vi, 1), n); + return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, const Packet8f& exponent) { + return pldexp_float(a,exponent); +} + // preduxp should be ok // FIXME: why is this ok? why isn't the simply implementation working as expected? template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index e384b8288..63e21fe42 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -9,7 +9,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-/* The log function of this file initially comes from +/* The exp and log functions of this file initially come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -25,12 +25,12 @@ namespace internal { template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet plog_float(const Packet _x) { +Packet plog_float(const Packet _x) +{ Packet x = _x; const Packet cst_1 = pset1(1.0f); const Packet cst_half = pset1(0.5f); - //const Packet cst_126f = pset1(126.0f); // The smallest non denormalized float number. const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); const Packet cst_minus_inf = pset1frombits( 0xff800000u); @@ -101,5 +101,64 @@ Packet plog_float(const Packet _x) { return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask)); } +// Exponential function. Works by writing "x = m*log(2) + r" where +// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then +// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_float(const Packet _x) +{ + const Packet cst_1 = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + const Packet cst_exp_hi = pset1( 88.3762626647950f); + const Packet cst_exp_lo = pset1(-88.3762626647949f); + + const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); + const Packet cst_cephes_exp_p0 = pset1(1.9875691500E-4f); + const Packet cst_cephes_exp_p1 = pset1(1.3981999507E-3f); + const Packet cst_cephes_exp_p2 = pset1(8.3334519073E-3f); + const Packet cst_cephes_exp_p3 = pset1(4.1665795894E-2f); + const Packet cst_cephes_exp_p4 = pset1(1.6666665459E-1f); + const Packet cst_cephes_exp_p5 = pset1(5.0000001201E-1f); + + // Clamp x. + Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo); + + // Express exp(x) as exp(m*ln(2) + r), start by extracting + // m = floor(x/ln(2) + 0.5). + Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half)); + + // Get r = x - m*ln(2). 
If no FMA instructions are available, m*ln(2) is + // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating + // truncation errors. + Packet r; +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + const Packet cst_nln2 = pset1(-0.6931471805599453f); + r = pmadd(m, cst_nln2, x); +#else + const Packet cst_cephes_exp_C1 = pset1(0.693359375f); + const Packet cst_cephes_exp_C2 = pset1(-2.12194440e-4f); + r = psub(x, pmul(m, cst_cephes_exp_C1)); + r = psub(r, pmul(m, cst_cephes_exp_C2)); +#endif + + Packet r2 = pmul(r, r); + + // TODO(gonnet): Split into odd/even polynomials and try to exploit + // instruction-level parallelism. + Packet y = cst_cephes_exp_p0; + y = pmadd(y, r, cst_cephes_exp_p1); + y = pmadd(y, r, cst_cephes_exp_p2); + y = pmadd(y, r, cst_cephes_exp_p3); + y = pmadd(y, r, cst_cephes_exp_p4); + y = pmadd(y, r, cst_cephes_exp_p5); + y = pmadd(y, r2, r); + y = padd(y, cst_1); + + // Return 2^m * exp(r). + return pmax(pldexp(y,m), _x); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index cba365cc5..acabc9f1d 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -30,67 +30,7 @@ Packet4f plog(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - 
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - Packet4f tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_ps(fx); -#else - emm0 = _mm_cvttps_epi32(fx); - tmp = _mm_cvtepi32_ps(emm0); - /* if greater, substract 1 */ - Packet4f mask = _mm_cmpgt_ps(tmp, fx); - mask = _mm_and_ps(mask, p4f_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // build 2^n - emm0 = _mm_cvttps_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_0x7f); - emm0 = _mm_slli_epi32(emm0, 23); - return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); + return pexp_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9959005a3..1258f349f 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -110,12 +110,12 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1, HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, - HasBlend = 1 + HasBlend = 1, + HasFloor = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , HasRound = 1, - HasFloor = 1, HasCeil = 1 #endif }; @@ -348,6 
+348,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { ret template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } +#else +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4i emm0 = _mm_cvttps_epi32(a); + Packet4f tmp = _mm_cvtepi32_ps(emm0); + /* if greater, substract 1 */ + Packet4f mask = _mm_cmpgt_ps(tmp, a); + mask = _mm_and_ps(mask, cst_1); + return psub(tmp, mask); +} #endif template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } @@ -536,6 +547,16 @@ template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Pack return pfrexp_float(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pcast_and_shiftleft(Packet4f v, int n) +{ + Packet4i vi = _mm_cvttps_epi32(v); + return _mm_castsi128_ps(_mm_slli_epi32(vi, n)); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_float(a,exponent); +} + // with AVX, the default implementations based on pload1 are faster #ifndef __AVX__ template<> EIGEN_STRONG_INLINE void From 5c8406babccd6148f5b63e994de42d5a28a96931 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 16:47:13 +0100 Subject: [PATCH 021/295] Unify Altivec/VSX's pexp with generic implementation --- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 40 +-------------------- Eigen/src/Core/arch/AltiVec/PacketMath.h | 10 ++++++ 2 files changed, 11 insertions(+), 39 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 8f3296253..4f1dc80ad 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -102,45 +102,7 @@ Packet4f plog(const Packet4f& _x) template<> 
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - Packet4f x = _x; - - Packet4f tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); - - // express exp(x) as exp(g + n*log(2)) - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); - - fx = pfloor(fx); - - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // build 2^n - emm0 = vec_cts(fx, 0); - emm0 = vec_add(emm0, p4i_0x7f); - emm0 = vec_sl(emm0, reinterpret_cast(p4i_23)); - - // Altivec's max & min operators just drop silent NaNs. Check NaNs in - // inputs and return them unmodified. - Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), - isnumber_mask); + return pexp_float(_x); } #ifndef EIGEN_COMP_CLANG diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 867aa8494..3f6253fa2 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -576,6 +576,16 @@ template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Pack return pfrexp_float(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pcast_and_shiftleft(Packet4f v, int n) +{ + Packet4i vi = vec_cts(v,0); + return reinterpret_cast(vec_sl(vi, reinterpret_cast(pset1(n)))); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_float(a,exponent); +} + template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f b, sum; From 4a347a0054f9a1b78bf547753eb598088775f5a5 Mon Sep 17 00:00:00 
2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 22:15:44 +0100 Subject: [PATCH 022/295] Unify NEON's pexp with generic implementation --- Eigen/src/Core/arch/NEON/MathFunctions.h | 64 +----------------------- Eigen/src/Core/arch/NEON/PacketMath.h | 25 ++++++++- 2 files changed, 25 insertions(+), 64 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index 8a70b771c..f3f6c279e 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -21,69 +21,7 @@ namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - Packet4f x = _x; - Packet4f tmp, fx; - - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - x = vminq_f32(x, p4f_exp_hi); - x = vmaxq_f32(x, p4f_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - Packet4ui mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - 
tmp = vmulq_f32(fx, p4f_cephes_exp_C1); - Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); - z = vmulq_f32(x, x); - y = vaddq_f32(y, p4f_cephes_exp_p1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, p4f_1); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, p4i_0x7f); - mm = vshlq_n_s32(mm, 23); - Packet4f pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; + return pexp_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 72f076e50..dc432f0d2 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -108,7 +108,8 @@ template<> struct packet_traits : default_packet_traits size = 4, HasHalfPacket=0, // Packet2f intrinsics not implemented yet - HasDiv = 1, + HasDiv = 1, + HasFloor = 1, // FIXME check the Has* HasSin = 0, HasCos = 0, @@ -256,6 +257,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vceqq_f32(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + /* perform a floorf */ + Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, a); + mask = 
vandq_u32(mask, vreinterpretq_u32_f32(cst_1)); + return vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); +} + // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { @@ -379,6 +392,16 @@ template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Pack return pfrexp_float(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pcast_and_shiftleft(Packet4f v, int n) +{ + Packet4i vi = vcvtq_s32_f32(v); + return vreinterpretq_f32_s32(vshlq_n_s32(vi, n)); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_float(a,exponent); +} + template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { float32x2_t a_lo, a_hi, sum; From 502f92fa10644629926875a792d12382f22be360 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 23:12:44 +0100 Subject: [PATCH 023/295] Unify SSE and AVX pexp for double. 
--- Eigen/src/Core/arch/AVX/MathFunctions.h | 78 +------------------ Eigen/src/Core/arch/AVX/PacketMath.h | 14 ++++ .../arch/Default/GenericPacketMathFunctions.h | 69 ++++++++++++++++ Eigen/src/Core/arch/SSE/MathFunctions.h | 73 +---------------- Eigen/src/Core/arch/SSE/PacketMath.h | 55 +++++++++---- 5 files changed, 126 insertions(+), 163 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index b038c7499..60d6aad40 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -123,82 +123,8 @@ ptanh(const Packet8f& x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d -pexp(const Packet4d& _x) { - Packet4d x = _x; - - _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); - _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); - _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - - Packet4d tmp, fx; - - // clamp x - x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); - // Express exp(x) as exp(g + n*log(2)). 
- fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); - - // Get the integer modulus of log(2), i.e. the "n" described above. - fx = _mm256_floor_pd(fx); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - tmp = pmul(fx, p4d_cephes_exp_C1); - Packet4d z = pmul(fx, p4d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet4d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. - Packet4d px = p4d_cephes_exp_p0; - px = pmadd(px, x2, p4d_cephes_exp_p1); - px = pmadd(px, x2, p4d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet4d qx = p4d_cephes_exp_q0; - qx = pmadd(qx, x2, p4d_cephes_exp_q1); - qx = pmadd(qx, x2, p4d_cephes_exp_q2); - qx = pmadd(qx, x2, p4d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm256_div_pd(px, psub(qx, px)); - x = pmadd(p4d_2, x, p4d_1); - - // Build e=2^n by constructing the exponents in a 128-bit vector and - // shifting them to where they belong in double-precision values. - __m128i emm0 = _mm256_cvtpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i lo = _mm_slli_epi64(emm0, 52); - __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); - __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); - e = _mm256_insertf128_si256(e, hi, 1); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); +pexp(const Packet4d& x) { + return pexp_double(x); } // Functions for sqrt. 
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 770646b91..a415f2f1b 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -405,6 +405,20 @@ template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, cons return pldexp_float(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4d pldexp(const Packet4d& a, const Packet4d& exponent) { + // Build e=2^n by constructing the exponents in a 128-bit vector and + // shifting them to where they belong in double-precision values. + Packet4i cst_1023 = pset1(1023); + __m128i emm0 = _mm256_cvtpd_epi32(exponent); + emm0 = _mm_add_epi32(emm0, cst_1023); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i lo = _mm_slli_epi64(emm0, 52); + __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); + __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); + e = _mm256_insertf128_si256(e, hi, 1); + return pmul(a,_mm256_castsi256_pd(e)); +} + // preduxp should be ok // FIXME: why is this ok? why isn't the simply implementation working as expected? 
template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 63e21fe42..e05e67703 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -160,5 +160,74 @@ Packet pexp_float(const Packet _x) return pmax(pldexp(y,m), _x); } +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_double(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0); + const Packet cst_2 = pset1(2.0); + const Packet cst_half = pset1(0.5); + + const Packet cst_exp_hi = pset1(709.437); + const Packet cst_exp_lo = pset1(-709.436139303); + + const Packet cst_cephes_LOG2EF = pset1(1.4426950408889634073599); + const Packet cst_cephes_exp_p0 = pset1(1.26177193074810590878e-4); + const Packet cst_cephes_exp_p1 = pset1(3.02994407707441961300e-2); + const Packet cst_cephes_exp_p2 = pset1(9.99999999999999999910e-1); + const Packet cst_cephes_exp_q0 = pset1(3.00198505138664455042e-6); + const Packet cst_cephes_exp_q1 = pset1(2.52448340349684104192e-3); + const Packet cst_cephes_exp_q2 = pset1(2.27265548208155028766e-1); + const Packet cst_cephes_exp_q3 = pset1(2.00000000000000000009e0); + const Packet cst_cephes_exp_C1 = pset1(0.693145751953125); + const Packet cst_cephes_exp_C2 = pset1(1.42860682030941723212e-6); + + Packet tmp, fx; + + // clamp x + x = pmax(pmin(x, cst_exp_hi), cst_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(cst_cephes_LOG2EF, x, cst_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = pfloor(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. 
+ tmp = pmul(fx, cst_cephes_exp_C1); + Packet z = pmul(fx, cst_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet px = cst_cephes_exp_p0; + px = pmadd(px, x2, cst_cephes_exp_p1); + px = pmadd(px, x2, cst_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet qx = cst_cephes_exp_q0; + qx = pmadd(qx, x2, cst_cephes_exp_q1); + qx = pmadd(qx, x2, cst_cephes_exp_q2); + qx = pmadd(qx, x2, cst_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = pdiv(px, psub(qx, px)); + x = pmadd(cst_2, x, cst_1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + //return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); + return pmax(pldexp(x,fx), _x); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index acabc9f1d..c3159de09 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -32,78 +32,11 @@ Packet4f pexp(const Packet4f& _x) { return pexp_float(_x); } + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d pexp(const Packet2d& _x) +Packet2d pexp(const Packet2d& x) { - Packet2d x = _x; - - _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); - _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 
3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); - - Packet2d tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_pd(fx); -#else - emm0 = _mm_cvttpd_epi32(fx); - tmp = _mm_cvtepi32_pd(emm0); - /* if greater, substract 1 */ - Packet2d mask = _mm_cmpgt_pd(tmp, fx); - mask = _mm_and_pd(mask, p2d_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = _mm_cvttpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023_0); - emm0 = _mm_slli_epi32(emm0, 20); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); + return pexp_double(x); } /* evaluation of 4 sines at once, using SSE2 intrinsics. 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 1258f349f..cb0f9f2c5 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -338,6 +338,21 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } #ifdef EIGEN_VECTORIZE_SSE4_1 template<> EIGEN_STRONG_INLINE Packet4f 
pround(const Packet4f& a) { return _mm_round_ps(a, 0); } @@ -356,27 +371,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) Packet4f tmp = _mm_cvtepi32_ps(emm0); /* if greater, substract 1 */ Packet4f mask = _mm_cmpgt_ps(tmp, a); - mask = _mm_and_ps(mask, cst_1); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +// WARNING: this pfloor implementation makes sense for small inputs only, +// It is currently only used by pexp and not exposed through HasFloor. +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet4i emm0 = _mm_cvttpd_epi32(a); + Packet2d tmp = _mm_cvtepi32_pd(emm0); + /* if greater, substract 1 */ + Packet2d mask = _mm_cmpgt_pd(tmp, a); + mask = pand(mask, cst_1); return psub(tmp, mask); } #endif -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const 
Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } - template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } @@ -557,6 +569,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, cons return pldexp_float(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + const __m128i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); + Packet4i emm0 = _mm_cvttpd_epi32(exponent); + emm0 = padd(emm0, cst_1023_0); + emm0 = _mm_slli_epi32(emm0, 20); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); + return pmul(a, Packet2d(_mm_castsi128_pd(emm0))); +} + // with AVX, the default implementations based on pload1 are faster #ifndef __AVX__ template<> EIGEN_STRONG_INLINE void From 7655a8af6e780eea2874df6ef8eb57af4d9fd653 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Nov 2018 23:21:29 +0100 Subject: [PATCH 024/295] cleanup --- Eigen/src/Core/arch/AVX/MathFunctions.h | 2 +- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 44 +-------------------- Eigen/src/Core/arch/NEON/MathFunctions.h | 4 -- Eigen/src/Core/arch/SSE/MathFunctions.h | 2 +- 4 files changed, 3 insertions(+), 49 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 60d6aad40..42758fe07 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,7 +10,7 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -/* The sin, cos, and exp functions of this file are loosely 
derived from +/* The sin and cos functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 4f1dc80ad..90b128466 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -9,7 +9,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, and exp functions of this file come from +/* The exp function of this file comes from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -22,48 +22,6 @@ namespace Eigen { namespace internal { -static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); -static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); -static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); -static _EIGEN_DECLARE_CONST_Packet4i(23, 23); - -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - -/* the smallest non denormalized float number */ -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); - -/* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 -*/ -static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, 
+ 2.0000714765E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - -static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); -static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - #ifdef __VSX__ static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index f3f6c279e..addaacb9a 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -5,10 +5,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-/* The exp function of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - #ifndef EIGEN_MATH_FUNCTIONS_NEON_H #define EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index c3159de09..e2046be47 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -8,7 +8,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos and exp functions of this file come from +/* The sin and cos and functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ From b5695a60085e775e87a51114fd7baa30553c79cf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 27 Nov 2018 13:53:05 +0100 Subject: [PATCH 025/295] Unify Altivec/VSX pexp(double) with default implementation --- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 118 +------------------- Eigen/src/Core/arch/AltiVec/PacketMath.h | 53 +++++++++ 2 files changed, 54 insertions(+), 117 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 90b128466..acf665018 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -9,10 +9,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-/* The exp function of this file comes from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H @@ -22,35 +18,6 @@ namespace Eigen { namespace internal { -#ifdef __VSX__ -static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); -static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); -static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - -static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); -static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - -#ifdef __POWER8_VECTOR__ -static Packet2l p2l_1023 = { 1023, 1023 }; -static Packet2ul p2ul_52 = { 52, 52 }; -#endif - -#endif - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { @@ -92,93 +59,10 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } -// VSX support varies between different compilers and even different -// versions of the same compiler. For gcc version >= 4.9.3, we can use -// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use -// a slow version that works with older compilers. 
-// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles -// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 -static inline Packet2l ConvertToPacket2l(const Packet2d& x) { -#if EIGEN_GNUC_AT_LEAST(5, 4) || \ - (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) - return vec_cts(x, 0); // TODO: check clang version. -#else - double tmp[2]; - memcpy(tmp, &x, sizeof(tmp)); - Packet2l l = { static_cast(tmp[0]), - static_cast(tmp[1]) }; - return l; -#endif -} - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) { - Packet2d x = _x; - - Packet2d tmp, fx; - Packet2l emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half); - - fx = pfloor(fx); - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = ConvertToPacket2l(fx); - -#ifdef __POWER8_VECTOR__ - emm0 = vec_add(emm0, p2l_1023); - emm0 = vec_sl(emm0, p2ul_52); -#else - // Code is a bit complex for POWER7. There is actually a - // vec_xxsldi intrinsic but it is not supported by some gcc versions. - // So we shift (52-32) bits and do a word swap with zeros. 
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32 - - Packet4i emm04i = reinterpret_cast(emm0); - emm04i = vec_add(emm04i, p4i_1023); - emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); - static const Packet16uc perm = { - 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, - 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; -#ifdef _BIG_ENDIAN - emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); -#else - emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); -#endif - -#endif - - // Altivec's max & min operators just drop silent NaNs. Check NaNs in - // inputs and return them unmodified. - Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), - isnumber_mask); + return pexp_double(_x); } #endif diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 3f6253fa2..d64550d8a 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -1024,6 +1024,59 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +// VSX support varies between different compilers and even different +// versions of the same compiler. For gcc version >= 4.9.3, we can use +// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use +// a slow version that works with older compilers. +// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles +// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 +static inline Packet2l ConvertToPacket2l(const Packet2d& x) { +#if EIGEN_GNUC_AT_LEAST(5, 4) || \ + (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) + return vec_cts(x, 0); // TODO: check clang version. 
+#else + double tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2l l = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return l; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + + // build 2^n + Packet2l emm0 = ConvertToPacket2l(exponent); + +#ifdef __POWER8_VECTOR__ + const Packet2l p2l_1023 = { 1023, 1023 }; + const Packet2ul p2ul_52 = { 52, 52 }; + emm0 = vec_add(emm0, p2l_1023); + emm0 = vec_sl(emm0, p2ul_52); +#else + // Code is a bit complex for POWER7. There is actually a + // vec_xxsldi intrinsic but it is not supported by some gcc versions. + // So we shift (52-32) bits and do a word swap with zeros. + const Packet4i p4i_1023 = pset1(1023); + const Packet4i p4i_20 = pset1(20); // 52 - 32 + + Packet4i emm04i = reinterpret_cast(emm0); + emm04i = vec_add(emm04i, p4i_1023); + emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); + static const Packet16uc perm = { + 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; +#ifdef _BIG_ENDIAN + emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); +#else + emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); +#endif + +#endif + + return pmul(a, reinterpret_cast(emm0)); +} + template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { Packet2d b, sum; From 7b1cb8a4407bdf243a581133847977fd812b66b7 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 27 Nov 2018 11:11:02 -0500 Subject: [PATCH 026/295] fix the build on 64-bit ARM when NEON is disabled --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index b8b83c320..5619a4588 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -853,7 +853,7 @@ protected: }; -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && 
defined EIGEN_VECTORIZE_NEON template<> struct gebp_traits From fa7fd61edad765608beb629a2c6f656535188db6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 27 Nov 2018 22:41:51 +0100 Subject: [PATCH 027/295] Unify SSE/AVX psin functions. It is based on the SSE version which is much more accurate, though very slightly slower. This changeset also includes the following required changes: - add packet-float to packet-int type traits - add packet float<->int reinterpret casts - add faster pselect for AVX based on blendv --- Eigen/Core | 8 +- Eigen/src/Core/GenericPacketMath.h | 9 ++ Eigen/src/Core/arch/AVX/MathFunctions.h | 62 +----------- Eigen/src/Core/arch/AVX/PacketMath.h | 39 +++++++- Eigen/src/Core/arch/AVX/TypeCasting.h | 10 +- .../arch/Default/GenericPacketMathFunctions.h | 80 ++++++++++++++++ Eigen/src/Core/arch/MSA/MathFunctions.h | 4 +- Eigen/src/Core/arch/SSE/MathFunctions.h | 96 +------------------ Eigen/src/Core/arch/SSE/PacketMath.h | 24 ++++- Eigen/src/Core/arch/SSE/TypeCasting.h | 7 ++ 10 files changed, 171 insertions(+), 168 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 0c09f7c79..61cc646aa 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -161,27 +161,27 @@ using std::ptrdiff_t; #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/TypeCasting.h" #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX/Complex.h" - #include "src/Core/arch/AVX/TypeCasting.h" - #include "src/Core/arch/SSE/TypeCasting.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/TypeCasting.h" #elif 
defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/PacketMath.h" #include "src/Core/arch/AltiVec/MathFunctions.h" #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/TypeCasting.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" - #include "src/Core/arch/NEON/TypeCasting.h" #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/MathFunctions.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index e8e7fa4d3..49a1c67cf 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -151,6 +151,11 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const return static_cast(a); } +/** \internal \returns reinterpret_cast(a) */ +template +EIGEN_DEVICE_FUNC inline Target +preinterpret(const Packet& a); /* { return reinterpret_cast(a); } */ + /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, @@ -214,6 +219,10 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (!b); } +/** \internal \returns \a a shifted by n bits */ +template EIGEN_DEVICE_FUNC inline Packet +pshiftleft(const Packet& a, int n); /* { return a << n; } */ + /** \internal \returns the significant and exponent of the underlying floating point numbers * See https://en.cppreference.com/w/cpp/numeric/math/frexp */ diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 42758fe07..4666ccc42 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -36,67 +36,7 @@ inline Packet8i pshiftleft(Packet8i v, int n) template <> 
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psin(const Packet8f& _x) { - Packet8f x = _x; - - // Some useful values. - _EIGEN_DECLARE_CONST_Packet8i(one, 1); - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); - _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); - - // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. - Packet8f z = pmul(x, p8f_one_over_pi); - Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); - x = pmadd(shift, p8f_neg_pi_first, x); - x = pmadd(shift, p8f_neg_pi_second, x); - x = pmadd(shift, p8f_neg_pi_third, x); - z = pmul(x, p8f_four_over_pi); - - // Make a mask for the entries that need flipping, i.e. wherever the shift - // is odd. - Packet8i shift_ints = _mm256_cvtps_epi32(shift); - Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); - Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); - - // Create a mask for which interpolant to use, i.e. if z > 1, then the mask - // is set to ones for that entry. - Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); - - // Evaluate the polynomial for the interval [1,3] in z. 
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); - Packet8f z_minus_two = psub(z, p8f_two); - Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); - Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); - right = pmadd(right, z_minus_two2, p8f_coeff_right_2); - right = pmadd(right, z_minus_two2, p8f_coeff_right_0); - - // Evaluate the polynomial for the interval [-1,1] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); - Packet8f z2 = pmul(z, z); - Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); - left = pmadd(left, z2, p8f_coeff_left_3); - left = pmadd(left, z2, p8f_coeff_left_1); - left = pmul(left, z); - - // Assemble the results, i.e. select the left and right polynomials. - left = _mm256_andnot_ps(ival_mask, left); - right = _mm256_and_ps(ival_mask, right); - Packet8f res = _mm256_or_ps(left, right); - - // Flip the sign on the odd intervals and return the result. 
- res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); - return res; + return psin_float(_x); } template <> diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a415f2f1b..0e1044aba 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -113,8 +113,17 @@ template<> struct packet_traits : default_packet_traits }; */ -template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet8i integer_packet; + enum {size=8, alignment=Aligned32}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=4, alignment=Aligned32}; +}; template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } @@ -125,6 +134,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pset1frombits(unsigned int fro template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); } template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); } template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } @@ -210,6 +220,16 @@ template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8 template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } template<> 
EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } +template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpeq_epi32(a,b); +#else + __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } @@ -231,6 +251,21 @@ template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) +{ return _mm256_blendv_ps(b,a,mask); } +template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) +{ return _mm256_blendv_pd(b,a,mask); } + +template<> EIGEN_STRONG_INLINE Packet8i pshiftleft(const Packet8i& a, int n) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(a, n); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), n); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + template<> EIGEN_STRONG_INLINE Packet8f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { 
EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast(from)); } diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 83bfdc604..7d2e1e67f 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -37,13 +37,21 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { - return _mm256_cvtps_epi32(a); + return _mm256_cvttps_epi32(a); } template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i& a) { return _mm256_cvtepi32_ps(a); } +template<> EIGEN_STRONG_INLINE Packet8i preinterpret(const Packet8f& a) { + return _mm256_castps_si256(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Packet8i& a) { + return _mm256_castsi256_ps(a); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index e05e67703..5719d7f91 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -229,5 +229,85 @@ Packet pexp_double(const Packet _x) return pmax(pldexp(x,fx), _x); } +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& _x) +{ + typedef typename unpacket_traits::integer_packet PacketI; + const Packet cst_1 = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + + const PacketI csti_1 = pset1(1); + const PacketI csti_not1 = pset1(~1); + const PacketI csti_2 = pset1(2); + + const Packet cst_sign_mask = pset1frombits(0x80000000u); + + const Packet cst_minus_cephes_DP1 = pset1(-0.78515625f); + const Packet cst_minus_cephes_DP2 = pset1(-2.4187564849853515625e-4f); + const Packet cst_minus_cephes_DP3 = pset1(-3.77489497744594108e-8f); + const Packet 
cst_sincof_p0 = pset1(-1.9515295891E-4f); + const Packet cst_sincof_p1 = pset1( 8.3321608736E-3f); + const Packet cst_sincof_p2 = pset1(-1.6666654611E-1f); + const Packet cst_coscof_p0 = pset1( 2.443315711809948E-005f); + const Packet cst_coscof_p1 = pset1(-1.388731625493765E-003f); + const Packet cst_coscof_p2 = pset1( 4.166664568298827E-002f); + const Packet cst_cephes_FOPI = pset1( 1.27323954473516f); // 4 / M_PI + + Packet x = pabs(_x); + + // Scale x by 4/Pi to find x's octant. + Packet y = pmul(x, cst_cephes_FOPI); + + // Get the octant. We'll reduce x by this number of octants or by one more than it. + PacketI y_int = pcast(y); + // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. + // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. + // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). + PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...) + y = pcast(y_int1); + + // Compute the sign to apply to the polynomial. + // sign = third_bit(y_int1) xor signbit(_x) + Packet sign_bit = pxor(_x, preinterpret(pshiftleft(y_int1, 29))); + sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit + + // Get the polynomial selection mask from the second bit of y_int1 + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet poly_mask = preinterpret(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1))); + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. + // The magic pass: "Extended precision modular arithmetic" + // x = ((x - y * DP1) - y * DP2) - y * DP3 + x = pmadd(y, cst_minus_cephes_DP1, x); + x = pmadd(y, cst_minus_cephes_DP2, x); + x = pmadd(y, cst_minus_cephes_DP3, x); + + Packet x2 = pmul(x,x); + + // Evaluate the cos(x) polynomial. 
(0 <= x <= Pi/4) + Packet y1 = cst_coscof_p0; + y1 = pmadd(y1, x2, cst_coscof_p1); + y1 = pmadd(y1, x2, cst_coscof_p2); + y1 = pmul(y1, x2); + y1 = pmul(y1, x2); + y1 = psub(y1, pmul(x2, cst_half)); + y1 = padd(y1, cst_1); + + // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0) + Packet y2 = cst_sincof_p0; + y2 = pmadd(y2, x2, cst_sincof_p1); + y2 = pmadd(y2, x2, cst_sincof_p2); + y2 = pmul(y2, x2); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynoms. + y = pselect(poly_mask,y2,y1); + + // Update the sign + return pxor(y, sign_bit); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h index 98e23e36f..f5181b90e 100644 --- a/Eigen/src/Core/arch/MSA/MathFunctions.h +++ b/Eigen/src/Core/arch/MSA/MathFunctions.h @@ -261,7 +261,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) { // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1); - Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); + Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear y = __builtin_msa_ffint_s_w(y_int2); // Compute the sign to apply to the polynomial. @@ -305,7 +305,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) { // Update the sign. 
sign_mask = pxor(sign_mask, (Packet4i)y); - y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); + y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left return y; } diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index e2046be47..9e699244e 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -54,101 +54,7 @@ Packet2d pexp(const Packet2d& x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, sign_bit, y; - - Packet4i emm0, emm2; - sign_bit = x; - /* take the absolute value */ - x = pabs(x); - - /* take the modulo */ - - /* extract the sign bit (upper one) */ - sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* store the integer part of y in mm0 */ - 
emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - /* get the swap sign flag */ - emm0 = _mm_and_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4 struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=2, alignment=Aligned16}; +}; +template<> struct unpacket_traits { + typedef int type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16}; +}; #ifndef EIGEN_VECTORIZE_AVX template<> struct scalar_div_cost { enum { value = 7 }; }; @@ -184,6 +197,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int fro template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); } // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. 
// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) @@ -338,6 +352,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } @@ -354,6 +370,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } +template<> EIGEN_STRONG_INLINE Packet4i pshiftleft(const Packet4i& a, int n) { return _mm_slli_epi32(a,n); } + #ifdef EIGEN_VECTORIZE_SSE4_1 template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index c6ca8c716..f607366f0 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f return _mm_cvtps_pd(a); } +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return _mm_castps_si128(a); +} + 
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return _mm_castsi128_ps(a); +} } // end namespace internal From a1a5fbbd212fb962d2bcc1533ae09037ee4177a1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 27 Nov 2018 22:57:30 +0100 Subject: [PATCH 028/295] Update pshiftleft to pass the shift as a true compile-time integer. --- Eigen/src/Core/GenericPacketMath.h | 8 +++++--- Eigen/src/Core/arch/AVX/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 49a1c67cf..c24268443 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -219,9 +219,11 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (!b); } -/** \internal \returns \a a shifted by n bits */ -template EIGEN_DEVICE_FUNC inline Packet -pshiftleft(const Packet& a, int n); /* { return a << n; } */ +/** \internal \returns \a a shifted by N bits */ +template EIGEN_DEVICE_FUNC inline int +pshiftleft(const int& a) { return a << N; } +template EIGEN_DEVICE_FUNC inline long int +pshiftleft(const long int& a) { return a << N; } /** \internal \returns the significant and exponent of the underlying floating point numbers * See https://en.cppreference.com/w/cpp/numeric/math/frexp diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 0e1044aba..969f68d79 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -256,12 +256,12 @@ template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) { return _mm256_blendv_pd(b,a,mask); } -template<> 
EIGEN_STRONG_INLINE Packet8i pshiftleft(const Packet8i& a, int n) { +template EIGEN_STRONG_INLINE Packet8i pshiftleft(const Packet8i& a) { #ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(a, n); + return _mm256_slli_epi32(a, N); #else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), n); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), n); + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N); return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); #endif } diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 5719d7f91..067d1dbe0 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -270,7 +270,7 @@ Packet psin_float(const Packet& _x) // Compute the sign to apply to the polynomial. // sign = third_bit(y_int1) xor signbit(_x) - Packet sign_bit = pxor(_x, preinterpret(pshiftleft(y_int1, 29))); + Packet sign_bit = pxor(_x, preinterpret(pshiftleft<29>(y_int1))); sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit // Get the polynomial selection mask from the second bit of y_int1 diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a508ce73e..fbc69ef1f 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -370,7 +370,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } -template<> EIGEN_STRONG_INLINE Packet4i pshiftleft(const Packet4i& a, int n) { return _mm_slli_epi32(a,n); } +template EIGEN_STRONG_INLINE Packet4i pshiftleft(const 
Packet4i& a) { return _mm_slli_epi32(a,N); } #ifdef EIGEN_VECTORIZE_SSE4_1 template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } From b131a4db2439ea1ca4ba86cbc86aa962914915c5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 27 Nov 2018 23:45:00 +0100 Subject: [PATCH 029/295] bug #1631: fix compilation with ARM NEON and clang, and cleanup the weird pshiftright_and_cast and pcast_and_shiftleft functions. --- Eigen/Core | 4 +- Eigen/src/Core/GenericPacketMath.h | 40 ++++------- Eigen/src/Core/arch/AVX/PacketMath.h | 35 ++++------ Eigen/src/Core/arch/AltiVec/PacketMath.h | 66 +++++++++++++++---- .../arch/Default/GenericPacketMathFunctions.h | 20 ++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 28 ++++---- Eigen/src/Core/arch/NEON/TypeCasting.h | 8 +++ Eigen/src/Core/arch/SSE/PacketMath.h | 13 +--- 8 files changed, 124 insertions(+), 90 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 61cc646aa..4bbde063a 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -163,11 +163,11 @@ using std::ptrdiff_t; #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/TypeCasting.h" - #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/MathFunctions.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/TypeCasting.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index c24268443..cc044de22 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -219,7 +219,13 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (!b); } -/** 
\internal \returns \a a shifted by N bits */ +/** \internal \returns \a a shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +pshiftright(const int& a) { return a >> N; } +template EIGEN_DEVICE_FUNC inline long int +pshiftright(const long int& a) { return a >> N; } + +/** \internal \returns \a a shifted by N bits to the left */ template EIGEN_DEVICE_FUNC inline int pshiftleft(const int& a) { return a << N; } template EIGEN_DEVICE_FUNC inline long int @@ -654,41 +660,17 @@ pinsertlast(const Packet& a, typename unpacket_traits::type b) * Some generic implementations to be used by implementors ***************************************************************************/ -/** \internal shift the bits by n and cast the result to the initial type, i.e.: - * return float(reinterpret_cast(a) >> n) - */ -template EIGEN_DEVICE_FUNC inline Packet -pshiftright_and_cast(Packet a, int n); - /** Default implementation of pfrexp for float. - * It is expected to be called by implementers of template<> pfrexp, - * and the above pshiftright_and_cast function must be implemented. + * It is expected to be called by implementers of template<> pfrexp. */ template EIGEN_STRONG_INLINE Packet -pfrexp_float(const Packet& a, Packet& exponent) { - const Packet cst_126f = pset1(126.0f); - const Packet cst_half = pset1(0.5f); - const Packet cst_inv_mant_mask = pset1frombits(~0x7f800000u); - exponent = psub(pshiftright_and_cast(a,23), cst_126f); - return por(pand(a, cst_inv_mant_mask), cst_half); -} - -/** \internal shift the bits by n and cast the result to the initial type, i.e.: - * return reinterpret_cast(int(a) >> n) - */ -template EIGEN_DEVICE_FUNC inline Packet -pcast_and_shiftleft(Packet a, int n); +pfrexp_float(const Packet& a, Packet& exponent); /** Default implementation of pldexp for float. - * It is expected to be called by implementers of template<> pldexp, - * and the above pcast_and_shiftleft function must be implemented. 
+ * It is expected to be called by implementers of template<> pldexp. */ template EIGEN_STRONG_INLINE Packet -pldexp_float(Packet a, Packet exponent) { - const Packet cst_127 = pset1(127.f); - // return a * 2^exponent - return pmul(a, pcast_and_shiftleft(padd(exponent, cst_127), 23)); -} +pldexp_float(Packet a, Packet exponent); } // end namespace internal diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 969f68d79..4c1abe43f 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -256,7 +256,17 @@ template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) { return _mm256_blendv_pd(b,a,mask); } -template EIGEN_STRONG_INLINE Packet8i pshiftleft(const Packet8i& a) { +template EIGEN_STRONG_INLINE Packet8i pshiftright(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srli_epi32(a, N); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template EIGEN_STRONG_INLINE Packet8i pshiftleft(Packet8i a) { #ifdef EIGEN_VECTORIZE_AVX2 return _mm256_slli_epi32(a, N); #else @@ -409,33 +419,10 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) return _mm256_and_pd(a,mask); } -template<> EIGEN_STRONG_INLINE Packet8f pshiftright_and_cast(Packet8f v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); - return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif -} - template<> EIGEN_STRONG_INLINE Packet8f 
pfrexp(const Packet8f& a, Packet8f& exponent) { return pfrexp_float(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet8f pcast_and_shiftleft(Packet8f v, int n) -{ - Packet8i vi = _mm256_cvttps_epi32(v); -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_castsi256_ps(_mm256_slli_epi32(vi, n)); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(vi, 0), n); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(vi, 1), n); - return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif -} - template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, const Packet8f& exponent) { return pldexp_float(a,exponent); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index d64550d8a..446065fb7 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -187,8 +187,19 @@ template<> struct packet_traits : default_packet_traits }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16}; +}; +template<> struct unpacket_traits +{ + typedef int type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16}; +}; inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) { @@ -567,21 +578,15 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n) { - return vec_ctf(vec_sr(reinterpret_cast(a), - reinterpret_cast(pset1(n))),0); -} 
+template EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) +{ return vec_sr(a,reinterpret_cast(pset1(N))); } +template EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) +{ return vec_sl(a,reinterpret_cast(pset1(N))); } template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { return pfrexp_float(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet4f pcast_and_shiftleft(Packet4f v, int n) -{ - Packet4i vi = vec_cts(v,0); - return reinterpret_cast(vec_sl(vi, reinterpret_cast(pset1(n)))); -} - template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_float(a,exponent); } @@ -807,6 +812,43 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons } +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vec_cts(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vec_ctf(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return reinterpret_cast(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return reinterpret_cast(a); +} + + + //---------- double ---------- #ifdef __VSX__ typedef __vector double Packet2d; diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 067d1dbe0..465f9bc3e 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -16,6 +16,26 @@ namespace Eigen { namespace internal { +template EIGEN_STRONG_INLINE Packet +pfrexp_float(const Packet& a, Packet& exponent) { + typedef typename 
unpacket_traits::integer_packet PacketI; + const Packet cst_126f = pset1(126.0f); + const Packet cst_half = pset1(0.5f); + const Packet cst_inv_mant_mask = pset1frombits(~0x7f800000u); + exponent = psub(pcast(pshiftright<23>(preinterpret(a))), cst_126f); + return por(pand(a, cst_inv_mant_mask), cst_half); +} + +template EIGEN_STRONG_INLINE Packet +pldexp_float(Packet a, Packet exponent) +{ + typedef typename unpacket_traits::integer_packet PacketI; + const Packet cst_127 = pset1(127.f); + // return a * 2^exponent + PacketI ei = pcast(padd(exponent, cst_127)); + return pmul(a, preinterpret(pshiftleft<23>(ei))); +} + // Natural logarithm // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index dc432f0d2..0a50153c7 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -140,8 +140,19 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16}; +}; +template<> struct unpacket_traits +{ + typedef int32_t type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16}; +}; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } @@ -294,6 +305,9 @@ template<> EIGEN_STRONG_INLINE 
Packet4f pandnot(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } +template EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return vshrq_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return vshlq_n_s32(a,N); } + template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } @@ -384,20 +398,10 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } -template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n) { - return vcvtq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a),n)); -} - template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { return pfrexp_float(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet4f pcast_and_shiftleft(Packet4f v, int n) -{ - Packet4i vi = vcvtq_s32_f32(v); - return vreinterpretq_f32_s32(vshlq_n_s32(vi, n)); -} - template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_float(a,exponent); } diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 95d1fd0e4..20dbe1332 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -41,6 +41,14 @@ template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i return vcvtq_f32_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return vreinterpretq_s32_f32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return vreinterpretq_f32_s32(a); +} + } // end namespace 
internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index fbc69ef1f..800eb4d86 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -370,7 +370,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } -template EIGEN_STRONG_INLINE Packet4i pshiftleft(const Packet4i& a) { return _mm_slli_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return _mm_srli_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return _mm_slli_epi32(a,N); } #ifdef EIGEN_VECTORIZE_SSE4_1 template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } @@ -569,20 +570,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } -template<> EIGEN_STRONG_INLINE Packet4f pshiftright_and_cast(Packet4f a, int n) { - return _mm_cvtepi32_ps(_mm_srli_epi32(_mm_castps_si128(a),n)); -} - template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { return pfrexp_float(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet4f pcast_and_shiftleft(Packet4f v, int n) -{ - Packet4i vi = _mm_cvttps_epi32(v); - return _mm_castsi128_ps(_mm_slli_epi32(vi, n)); -} - template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_float(a,exponent); } From a4159dba080f5621f19f814440553ba734c8e712 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 27 Nov 2018 16:53:14 -0500 Subject: [PATCH 030/295] do not read buffers out of bounds -- load only the 4 bytes we know exist here. 
Could also have done a vld1_lane_f32 but doing so here, without the overhead of initializing the unused lane, would have triggered use-of-uninitialized-value errors in tools such as ASan. Note that this code is sub-optimal before or after this change: we should be reading either 2 or 4 float32 values per load-instruction (2 for ARM in-order cores with an affinity for 8-byte loads; 4 for ARM out-of-order cores able to dual-issue 16-byte load instructions with arithmetic instructions). Before or after this patch, we are only loading 4 bytes of useful data here (even if before this patch, we were technically loading 8, only to use the first 4). --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 5619a4588..9ca865bd1 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -859,7 +859,7 @@ template<> struct gebp_traits : gebp_traits { - typedef float32x2_t RhsPacket; + typedef float RhsPacket; EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { @@ -871,7 +871,7 @@ struct gebp_traits EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = vld1_f32(b); + dest = *b; } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const @@ -881,7 +881,7 @@ struct gebp_traits EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const { - c = vfmaq_lane_f32(c, a, b, 0); + c = vfmaq_n_f32(c, a, b); } }; From 80f1651f3579cd45c6874c64a0439fa32928aa64 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 27 Nov 2018 17:25:49 -0800 Subject: [PATCH 031/295] Use explicit packet type in SSE/PacketMath pldexp --- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 800eb4d86..1a2710a3d 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -579,7 +579,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, cons } template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { - const __m128i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); + const Packet4i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); Packet4i emm0 = _mm_cvttpd_epi32(exponent); emm0 = padd(emm0, cst_1023_0); emm0 = _mm_slli_epi32(emm0, 20); From 48fe78c375a60696d09b3815cd705366b937e03c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Nov 2018 13:15:06 +0100 Subject: [PATCH 032/295] bug #1630: fix linspaced when requesting smaller packet size than default one. --- Eigen/src/Core/CwiseNullaryOp.h | 10 +++---- Eigen/src/Core/DenseBase.h | 4 +-- Eigen/src/Core/functors/NullaryFunctors.h | 34 +++++++++++------------ test/nullary.cpp | 25 +++++++++++------ 4 files changed, 41 insertions(+), 32 deletions(-) diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index d149abe93..ef708197b 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomA DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** \deprecated because of accuracy loss. 
In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&) @@ -252,7 +252,7 @@ DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** @@ -283,7 +283,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomA DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** @@ -296,7 +296,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ @@ -398,7 +398,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); + return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); } /** diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 2a0927317..f8feefa27 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -266,9 +266,9 @@ template class DenseBase 
/** \internal Represents a matrix with all coefficients equal to one another*/ typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */ - typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows random access. */ - typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index b03be0269..f5888abf0 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,10 +37,10 @@ template struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template struct linspaced_op_impl; +template struct linspaced_op_impl; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), @@ -56,7 +56,7 @@ struct linspaced_op_impl return (i==m_size1)? 
m_high : (m_low + RealScalar(i)*m_step); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { // Principle: @@ -86,8 +86,8 @@ struct linspaced_op_impl const bool m_flip; }; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), @@ -115,8 +115,8 @@ struct linspaced_op_impl // Forward declaration (we default to random access which does not really give // us a speed gain when using packet access but it allows to use the functor in // nested expressions). -template struct linspaced_op; -template struct functor_traits< linspaced_op > +template struct linspaced_op; +template struct functor_traits< linspaced_op > { enum { @@ -126,7 +126,7 @@ template struct functor_traits< linspaced IsRepeatable = true }; }; -template struct linspaced_op +template struct linspaced_op { linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low),high,num_steps) @@ -136,11 +136,11 @@ template struct linspaced_op EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp(i); } // This proxy object handles the actual required temporaries and the different // implementations (integer vs. floating point). - const linspaced_op_impl::IsInteger> impl; + const linspaced_op_impl::IsInteger> impl; }; // Linear access is automatically determined from the operator() prototypes available for the given functor. 
@@ -166,12 +166,12 @@ struct has_unary_operator,IndexType> { enum { value = template struct has_binary_operator,IndexType> { enum { value = 1}; }; -template -struct has_nullary_operator,IndexType> { enum { value = 0}; }; -template -struct has_unary_operator,IndexType> { enum { value = 1}; }; -template -struct has_binary_operator,IndexType> { enum { value = 0}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 0}; }; +template +struct has_unary_operator,IndexType> { enum { value = 1}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; template struct has_nullary_operator,IndexType> { enum { value = 1}; }; diff --git a/test/nullary.cpp b/test/nullary.cpp index 12b9e122f..1df3693d6 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -246,6 +246,14 @@ void bug79() VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits::epsilon() ); } +template +void bug1630() +{ + Array4d x4 = Array4d::LinSpaced(0.0, 1.0); + Array3d x3(Array4d::LinSpaced(0.0, 1.0).head(3)); + VERIFY_IS_APPROX(x4.head(3), x3); +} + template void nullary_overflow() { @@ -272,10 +280,10 @@ void nullary_internal_logic() VERIFY(( internal::has_binary_operator >::value )); VERIFY(( !internal::functor_has_linear_access >::ret )); - VERIFY(( !internal::has_nullary_operator >::value )); - VERIFY(( internal::has_unary_operator >::value )); - VERIFY(( !internal::has_binary_operator >::value )); - VERIFY(( internal::functor_has_linear_access >::ret )); + VERIFY(( !internal::has_nullary_operator >::value )); + VERIFY(( internal::has_unary_operator >::value )); + VERIFY(( !internal::has_binary_operator >::value )); + VERIFY(( internal::functor_has_linear_access >::ret )); // Regression unit test for a weird MSVC bug. // Search "nullary_wrapper_workaround_msvc" in CoreEvaluators.h for the details. 
@@ -296,10 +304,10 @@ void nullary_internal_logic() VERIFY(( !internal::has_binary_operator >::value )); VERIFY(( internal::functor_has_linear_access >::ret )); - VERIFY(( !internal::has_nullary_operator >::value )); - VERIFY(( internal::has_unary_operator >::value )); - VERIFY(( !internal::has_binary_operator >::value )); - VERIFY(( internal::functor_has_linear_access >::ret )); + VERIFY(( !internal::has_nullary_operator >::value )); + VERIFY(( internal::has_unary_operator >::value )); + VERIFY(( !internal::has_binary_operator >::value )); + VERIFY(( internal::functor_has_linear_access >::ret )); } } @@ -325,6 +333,7 @@ EIGEN_DECLARE_TEST(nullary) } CALL_SUBTEST_6( bug79<0>() ); + CALL_SUBTEST_6( bug1630<0>() ); CALL_SUBTEST_9( nullary_overflow<0>() ); CALL_SUBTEST_10( nullary_internal_logic<0>() ); } From aa6097395bc9147b771b33d4cbcb55e55ed409a7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Nov 2018 16:09:08 +0100 Subject: [PATCH 033/295] Add missing SSE/AVX type-casting in AVX512 mode --- Eigen/Core | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index 4bbde063a..41529bb63 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -153,7 +153,9 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/TypeCasting.h" #include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" From 3e95e398b6cee0218c0a01880fecd6215a7f0d28 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Nov 2018 17:14:20 +0100 Subject: [PATCH 034/295] pmin/pmax o SSE: make sure to use AVX instruction with AVX enabled, and disable gcc workaround for fixed gcc versions --- Eigen/src/Core/arch/SSE/PacketMath.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h 
b/Eigen/src/Core/arch/SSE/PacketMath.h index 1a2710a3d..004c09f5a 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -270,13 +270,18 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& #endif template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else Packet4f res = b; asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif return res; #else // Arguments are reversed to match NaN propagation behavior of std::min. @@ -284,13 +289,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else Packet2d res = b; asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif return res; #else // Arguments are reversed to match NaN propagation behavior of std::min. 
From 41052f63b7b4cc3c3b4de89b3ed49948b845da84 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Nov 2018 17:17:28 +0100 Subject: [PATCH 035/295] same for pmax --- Eigen/src/Core/arch/SSE/PacketMath.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 004c09f5a..0e8e0d2b3 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -319,13 +319,18 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const } template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else Packet4f res = b; asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif return res; #else // Arguments are reversed to match NaN propagation behavior of std::max. @@ -333,13 +338,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_pd, so we have to // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else Packet2d res = b; asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif return res; #else // Arguments are reversed to match NaN propagation behavior of std::max. From e19ece822de2a516f66712068a2e5fa4fe18150b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Nov 2018 17:56:24 +0100 Subject: [PATCH 036/295] Disable fma gcc's workaround for gcc >= 8 (based on GEMM benchmarks) --- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 4c1abe43f..abfeb4338 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -174,7 +174,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co #ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { -#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, // and gcc stupidly generates a vfmadd132ps instruction, // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate @@ -187,7 +187,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& #endif } template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) // 
see above Packet4d res = c; __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); From b477d60bc604dd8970380e252f8ed3a6021bc081 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 11:26:30 +0100 Subject: [PATCH 037/295] Extend the generic psin_float code to handle cosine and make SSE and AVX use it (-> this adds pcos for AVX) --- Eigen/src/Core/arch/AVX/MathFunctions.h | 10 +- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- .../arch/Default/GenericPacketMathFunctions.h | 38 +++++++- Eigen/src/Core/arch/SSE/MathFunctions.h | 96 +------------------ 4 files changed, 41 insertions(+), 105 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 4666ccc42..4b3a553da 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -29,16 +29,18 @@ inline Packet8i pshiftleft(Packet8i v, int n) #endif } -// Sine function -// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and -// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. 
The interpolants -// are (anti-)symmetric and thus have only odd/even coefficients template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psin(const Packet8f& _x) { return psin_float(_x); } +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +pcos(const Packet8f& _x) { + return pcos_float(_x); +} + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f plog(const Packet8f& _x) { diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index abfeb4338..284a32676 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -63,7 +63,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasSin = EIGEN_FAST_MATH, - HasCos = 0, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, HasSqrt = 1, diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 465f9bc3e..9481850c6 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -245,14 +245,23 @@ Packet pexp_double(const Packet _x) // Construct the result 2^n * exp(g) = e * x. The max is used to catch // non-finite values in the input. - //return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); return pmax(pldexp(x,fx), _x); } -template +/* The code is the rewriting of the cephes sinf/cosf functions. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. 
+*/ + +template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet psin_float(const Packet& _x) +Packet psincos_float(const Packet& _x) { typedef typename unpacket_traits::integer_packet PacketI; const Packet cst_1 = pset1(1.0f); @@ -261,6 +270,7 @@ Packet psin_float(const Packet& _x) const PacketI csti_1 = pset1(1); const PacketI csti_not1 = pset1(~1); const PacketI csti_2 = pset1(2); + const PacketI csti_3 = pset1(3); const Packet cst_sign_mask = pset1frombits(0x80000000u); @@ -290,7 +300,8 @@ Packet psin_float(const Packet& _x) // Compute the sign to apply to the polynomial. // sign = third_bit(y_int1) xor signbit(_x) - Packet sign_bit = pxor(_x, preinterpret(pshiftleft<29>(y_int1))); + Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<29>(y_int1))) + : preinterpret(pshiftleft<29>(padd(y_int1,csti_3))); sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit // Get the polynomial selection mask from the second bit of y_int1 @@ -323,11 +334,28 @@ Packet psin_float(const Packet& _x) y2 = pmadd(y2, x, x); // Select the correct result from the two polynoms. - y = pselect(poly_mask,y2,y1); + y = ComputeSine ? pselect(poly_mask,y2,y1) + : pselect(poly_mask,y1,y2); // Update the sign return pxor(y, sign_bit); } +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& x) +{ + return psincos_float(x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pcos_float(const Packet& x) +{ + return psincos_float(x); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 9e699244e..0d491ab88 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -39,110 +39,16 @@ Packet2d pexp(const Packet2d& x) return pexp_double(x); } -/* evaluation of 4 sines at once, using SSE2 intrinsics. 
- - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. -*/ - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& _x) { return psin_float(_x); } -/* almost the same as psin */ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, y; - Packet4i emm0, emm2; - - x = pabs(x); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* get the integer part of y */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = 
_mm_cvtepi32_ps(emm2); - - emm2 = _mm_sub_epi32(emm2, p4i_2); - - /* get the swap sign flag */ - emm0 = _mm_andnot_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask */ - emm2 = _mm_and_si128(emm2, p4i_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - - Packet4f sign_bit = _mm_castsi128_ps(emm0); - Packet4f poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = pmul(y, p4f_minus_cephes_DP1); - xmm2 = pmul(y, p4f_minus_cephes_DP2); - xmm3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, xmm1); - x = padd(x, xmm2); - x = padd(x, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = p4f_coscof_p0; - Packet4f z = pmul(x,x); - - y = pmadd(y,z,p4f_coscof_p1); - y = pmadd(y,z,p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - Packet4f tmp = _mm_mul_ps(z, p4f_half); - y = psub(y, tmp); - y = padd(y, p4f_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmadd(y2, x, x); - - /* select the correct result from the two polynoms */ - y2 = _mm_and_ps(poly_mask, y2); - y = _mm_andnot_ps(poly_mask, y); - y = _mm_or_ps(y,y2); - - /* update the sign */ - return _mm_xor_ps(y, sign_bit); + return pcos_float(_x); } #if EIGEN_FAST_MATH From f91500d3035fd34683210eea6064b95a7aad4306 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 14:32:06 +0100 Subject: [PATCH 038/295] Fix pandnot order in AVX512 --- Eigen/Core | 2 ++ Eigen/src/Core/arch/AVX512/PacketMath.h | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 41529bb63..bc6cf8a96 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -154,8 +154,10 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" #include 
"src/Core/arch/SSE/TypeCasting.h" + #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 86cefba92..9a053fb1a 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -393,24 +393,24 @@ template <> EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_ps(a, b); + return _mm512_andnot_ps(b, a); #else Packet16f res = _mm512_undefined_ps(); Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0); + res = _mm512_insertf32x4(res, pandnot(lane0_a, lane0_b), 0); Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1); + res = _mm512_insertf32x4(res, pandnot(lane1_a, lane1_b), 1); Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2); + res = _mm512_insertf32x4(res, pandnot(lane2_a, lane2_b), 2); Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3); + res = _mm512_insertf32x4(res, pandnot(lane3_a, lane3_b), 3); return res; #endif From c68bd2fa7a52bff1ecbe1a863548760132a11a49 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 14:32:31 +0100 Subject: [PATCH 039/295] Cleanup --- Eigen/src/Core/arch/AVX/MathFunctions.h | 11 ----------- 1 file changed, 11 
deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 4b3a553da..9f375ed98 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -18,17 +18,6 @@ namespace Eigen { namespace internal { -inline Packet8i pshiftleft(Packet8i v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(v, n); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} - template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psin(const Packet8f& _x) { From fa87f9d876f38e470e5070a451f92a3c19c9d0fe Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 14:33:13 +0100 Subject: [PATCH 040/295] Add psin/pcos on AVX512 -> almost for free, at last! --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 13 +++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 34 ++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 93c5ec43f..aac707596 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -373,6 +373,19 @@ EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { #endif #endif + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +psin(const Packet16f& _x) { + return psin_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +pcos(const Packet16f& _x) { + return pcos_float(_x); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 9a053fb1a..7d90ce4c1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ 
b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -55,6 +55,8 @@ template<> struct packet_traits : default_packet_traits size = 16, HasHalfPacket = 1, HasBlend = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, #if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, @@ -99,6 +101,7 @@ template <> struct unpacket_traits { typedef float type; typedef Packet8f half; + typedef Packet16i integer_packet; enum { size = 16, alignment=Aligned64 }; }; template <> @@ -127,6 +130,11 @@ EIGEN_STRONG_INLINE Packet16i pset1(const int& from) { return _mm512_set1_epi32(from); } +template <> +EIGEN_STRONG_INLINE Packet16f pset1frombits(unsigned int from) { + return _mm512_castsi512_ps(_mm512_set1_epi32(from)); +} + template <> EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { return _mm512_broadcastss_ps(_mm_load_ps1(from)); @@ -254,6 +262,12 @@ EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, return _mm512_max_pd(b, a); } +template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { + __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); + __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); + return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); +} + template <> EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) { @@ -434,6 +448,10 @@ EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a, #endif } +template EIGEN_STRONG_INLINE Packet16i pshiftleft(Packet16i a) { + return _mm512_slli_epi32(a, N); +} + template <> EIGEN_STRONG_INLINE Packet16f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from); @@ -1322,6 +1340,22 @@ template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b) return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b)); } +template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { + return 
_mm512_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { + return _mm512_cvtepi32_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet16i preinterpret(const Packet16f& a) { + return _mm512_castps_si512(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet16i& a) { + return _mm512_castsi512_ps(a); +} + } // end namespace internal } // end namespace Eigen From 69ace742be6f00f4280d312e046b0b1422fd112c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 15:56:08 +0100 Subject: [PATCH 041/295] Several improvements regarding packet-bitwise operations: - add unit tests - optimize their AVX512f implementation - add missing implementations (half, Packet4f, ...) --- Eigen/src/Core/GenericPacketMath.h | 20 ++- Eigen/src/Core/arch/AVX/PacketMath.h | 28 +++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 150 ++++++----------------- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 26 ++++ test/packetmath.cpp | 42 +++++++ 5 files changed, 140 insertions(+), 126 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index cc044de22..9c2a437bf 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -158,13 +158,11 @@ preinterpret(const Packet& a); /* { return reinterpret_cast(a); } /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -padd(const Packet& a, - const Packet& b) { return a+b; } +padd(const Packet& a, const Packet& b) { return a+b; } /** \internal \returns a - b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -psub(const Packet& a, - const Packet& b) { return a-b; } +psub(const Packet& a, const Packet& b) { return a-b; } /** \internal \returns -a (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet @@ -177,23 +175,19 @@ pconj(const Packet& a) { return numext::conj(a); } /** \internal \returns a * b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmul(const Packet& a, 
- const Packet& b) { return a*b; } +pmul(const Packet& a, const Packet& b) { return a*b; } /** \internal \returns a / b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pdiv(const Packet& a, - const Packet& b) { return a/b; } +pdiv(const Packet& a, const Packet& b) { return a/b; } /** \internal \returns the min of \a a and \a b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, - const Packet& b) { return numext::mini(a, b); } +pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); } /** \internal \returns the max of \a a and \a b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, - const Packet& b) { return numext::maxi(a, b); } +pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } /** \internal \returns the absolute value of \a a */ template EIGEN_DEVICE_FUNC inline Packet @@ -217,7 +211,7 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } /** \internal \returns the bitwise andnot of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (!b); } +pandnot(const Packet& a, const Packet& b) { return a & (~b); } /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 284a32676..b49bae0de 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -241,15 +241,43 @@ template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { re template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_and_si256(a,b); +#else + return 
_mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i por(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_or_si256(a,b); +#else + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pxor(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_xor_si256(a,b); +#else + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_andnot_si256(b,a); +#else + return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) { return _mm256_blendv_ps(b,a,mask); } diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 7d90ce4c1..1d38fb758 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -268,30 +268,20 @@ template<> EIGEN_STRONG_INLINE Packet16i 
pcmp_eq(const Packet16i& a, const Packe return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); } + +template <> +EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, + const Packet16i& b) { + return _mm512_and_si512(a,b); +} + template <> EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_and_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> @@ -312,30 +302,18 @@ EIGEN_STRONG_INLINE Packet8d pand(const Packet8d& a, return res; #endif } + template <> -EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i por(const Packet16i& a, const Packet16i& b) { + return _mm512_or_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); 
- res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } @@ -345,106 +323,52 @@ EIGEN_STRONG_INLINE Packet8d por(const Packet8d& a, #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pxor(const Packet16i& a, const Packet16i& b) { + return _mm512_xor_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = 
_mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } + template <> -EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pandnot(const Packet16i& a, const Packet16i& b) { + return _mm512_andnot_si512(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_andnot_ps(b, a); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, pandnot(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, pandnot(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, pandnot(lane2_a, lane2_b), 2); - - Packet4f 
lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, pandnot(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a,const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_pd(a, b); + return _mm512_andnot_pd(b, a); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 8787adcde..cdd2b001b 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -640,6 +640,19 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = por(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pxor(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pand(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pandnot(a.x,b.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { // FIXME we could 
do that with bit manipulation Packet16f af = half2float(a); @@ -1063,6 +1076,19 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = por(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pxor(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pand(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pandnot(a.x,b.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 43c33ba94..144083f1b 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -27,7 +27,44 @@ bool g_first_pass = true; namespace Eigen { namespace internal { + template T negate(const T& x) { return -x; } + +template +Map > +bits(const T& x) { + return Map >(reinterpret_cast(&x)); +} + +// The following implement bitwise operations on floating point types +template +T apply_bit_op(Bits a, Bits b, Func f) { + Array res; + for(Index i=0; i(&res); +} + +#define EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,T) \ + template<> T EIGEN_CAT(p,OP)(const T& a,const T& b) { \ + return apply_bit_op(bits(a),bits(b),FUNC); \ + } + +#define EIGEN_TEST_MAKE_BITWISE(OP,FUNC) \ + EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,float) \ + EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,double) \ + EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,half) \ + EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex) \ + EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex) + +EIGEN_TEST_MAKE_BITWISE(xor,std::bit_xor()) +EIGEN_TEST_MAKE_BITWISE(and,std::bit_and()) +EIGEN_TEST_MAKE_BITWISE(or, std::bit_or()) +struct bit_andnot{ + template T + operator()(T a, T b) const { 
return a & (~b); } +}; +EIGEN_TEST_MAKE_BITWISE(andnot, bit_andnot()) + } } @@ -304,6 +341,11 @@ template void packetmath() } } + CHECK_CWISE2_IF(true, internal::por, internal::por); + CHECK_CWISE2_IF(true, internal::pxor, internal::pxor); + CHECK_CWISE2_IF(true, internal::pand, internal::pand); + CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot); + if (PacketTraits::HasBlend) { Packet thenPacket = internal::pload(data1); Packet elsePacket = internal::pload(data2); From c785464430bfc697debe3f8d49e49064aa08e0a3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 16:21:33 +0100 Subject: [PATCH 042/295] Add packet sin and cos to Altivec/VSX and NEON --- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 12 ++++++++++++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 5 +++-- Eigen/src/Core/arch/NEON/MathFunctions.h | 20 ++++++++++++++++---- Eigen/src/Core/arch/NEON/PacketMath.h | 6 ++++-- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index acf665018..81097e668 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -30,6 +30,18 @@ Packet4f pexp(const Packet4f& _x) return pexp_float(_x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psin(const Packet4f& _x) +{ + return psin_float(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pcos(const Packet4f& _x) +{ + return pcos_float(_x); +} + #ifndef EIGEN_COMP_CLANG template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 446065fb7..d0ee93f4a 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -146,8 +146,8 @@ template<> struct packet_traits : 
default_packet_traits HasMin = 1, HasMax = 1, HasAbs = 1, - HasSin = 0, - HasCos = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, #ifdef __VSX__ @@ -437,6 +437,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f c = reinterpret_cast(vec_cmpge(a,b)); return vec_nor(c,c); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index addaacb9a..2e7d0e944 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -15,15 +15,27 @@ namespace Eigen { namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) +Packet4f pexp(const Packet4f& x) { - return pexp_float(_x); + return pexp_float(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) +Packet4f plog(const Packet4f& x) { - return plog_float(_x); + return plog_float(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psin(const Packet4f& x) +{ + return psin_float(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pcos(const Packet4f& x) +{ + return pcos_float(x); } } // end namespace internal diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 0a50153c7..ed3cec88a 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -111,8 +111,8 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, 
HasFloor = 1, // FIXME check the Has* - HasSin = 0, - HasCos = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, HasSqrt = 0 @@ -268,6 +268,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vceqq_f32(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return vreinterpretq_s32_u32(vceqq_s32(a,b)); } + template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { const Packet4f cst_1 = pset1(1.0f); From ab4df3e6ff530ab520884bdbf44d61b53b1ee05f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 21:25:51 +0100 Subject: [PATCH 043/295] bug #1634: remove double copy in move-ctor of non movable Matrix/Array --- Eigen/src/Core/Array.h | 2 -- Eigen/src/Core/Matrix.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index e10020d4f..16770fc7b 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -153,8 +153,6 @@ class Array : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 90c336d8c..7f4a7af93 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -274,8 +274,6 @@ class Matrix : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } EIGEN_DEVICE_FUNC Matrix& operator=(Matrix&& other) 
EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) From 0ea7ae72130cac7334823ec442f0a8a6772c9ab8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 21:52:25 +0100 Subject: [PATCH 044/295] Add missing padd for Packet8i (it was implicitly generated by clang and gcc) --- Eigen/src/Core/arch/AVX/PacketMath.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index b49bae0de..476de4fd4 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -144,6 +144,15 @@ template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { retur template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_add_epi32(a,b); +#else + __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } From 919414b9fe2ad7fdcb0f2b2cbdf6b5322d0f2034 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 3 Dec 2018 16:18:15 +0100 Subject: [PATCH 045/295] bug #785: Make Cholesky decomposition work for empty matrices --- Eigen/src/Cholesky/LDLT.h | 3 ++- Eigen/src/Core/ConditionEstimator.h | 2 +- test/cholesky.cpp | 16 +++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h 
b/Eigen/src/Cholesky/LDLT.h index 2dfeac333..6831eab3d 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -304,7 +304,8 @@ template<> struct ldlt_inplace if (size <= 1) { transpositions.setIdentity(); - if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; + if(size==0) sign = ZeroSign; + else if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; else if (numext::real(mat.coeff(0,0)) < static_cast(0)) sign = NegativeSemiDef; else sign = ZeroSign; return true; diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index aa7efdc76..8c1a89129 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) return RealScalar(1); + if (dec.rows() == 0) return RealScalar(1)/RealScalar(0); if (matrix_norm == RealScalar(0)) return RealScalar(0); if (dec.rows() == 1) return RealScalar(1); const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); diff --git a/test/cholesky.cpp b/test/cholesky.cpp index b871351e0..e1e8b7bf7 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -19,6 +19,7 @@ template typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { + if(m.cols()==0) return typename MatrixType::RealScalar(0); MatrixType symm = m.template selfadjointView(); return symm.cwiseAbs().colwise().sum().maxCoeff(); } @@ -96,7 +97,7 @@ template void cholesky(const MatrixType& m) RealScalar rcond_est = chollo.rcond(); // Verify that the estimated condition number is within a factor of 10 of the // truth. 
- VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10); // test the upper mode LLT cholup(symmUp); @@ -112,12 +113,12 @@ template void cholesky(const MatrixType& m) rcond = (RealScalar(1) / matrix_l1_norm(symmUp)) / matrix_l1_norm(symmUp_inverse); rcond_est = cholup.rcond(); - VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10); MatrixType neg = -symmLo; chollo.compute(neg); - VERIFY(chollo.info()==NumericalIssue); + VERIFY(neg.size()==0 || chollo.info()==NumericalIssue); VERIFY_IS_APPROX(MatrixType(chollo.matrixL().transpose().conjugate()), MatrixType(chollo.matrixU())); VERIFY_IS_APPROX(MatrixType(chollo.matrixU().transpose().conjugate()), MatrixType(chollo.matrixL())); @@ -166,7 +167,7 @@ template void cholesky(const MatrixType& m) RealScalar rcond_est = ldltlo.rcond(); // Verify that the estimated condition number is within a factor of 10 of the // truth. 
- VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10); LDLT ldltup(symmUp); @@ -183,7 +184,7 @@ template void cholesky(const MatrixType& m) rcond = (RealScalar(1) / matrix_l1_norm(symmUp)) / matrix_l1_norm(symmUp_inverse); rcond_est = ldltup.rcond(); - VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + VERIFY(rcond_est >= rcond / 10 && rcond_est <= rcond * 10); VERIFY_IS_APPROX(MatrixType(ldltlo.matrixL().transpose().conjugate()), MatrixType(ldltlo.matrixU())); VERIFY_IS_APPROX(MatrixType(ldltlo.matrixU().transpose().conjugate()), MatrixType(ldltlo.matrixL())); @@ -507,6 +508,11 @@ EIGEN_DECLARE_TEST(cholesky) CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) } + // empty matrix, regression test for Bug 785: + CALL_SUBTEST_2( cholesky(MatrixXd(0,0)) ); + + // This does not work yet: + // CALL_SUBTEST_2( cholesky(Matrix()) ); CALL_SUBTEST_4( cholesky_verify_assert() ); CALL_SUBTEST_7( cholesky_verify_assert() ); From fd0fbfa9b5301e5339c34846c76835cf347ef4cb Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 3 Dec 2018 15:54:10 -0800 Subject: [PATCH 046/295] Do not disable alignment with EIGEN_GPUCC --- Eigen/src/Core/util/ConfigureVectorization.h | 15 +++++++-------- Eigen/src/Core/util/Macros.h | 4 ---- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 263604597..c482a0b14 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -10,13 +10,6 @@ #ifndef EIGEN_CONFIGURE_VECTORIZATION_H #define EIGEN_CONFIGURE_VECTORIZATION_H -// FIXME: not sure why this is needed, perhaps it is not needed anymore. 
-#ifdef __NVCC__ - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif -#endif - //------------------------------------------------------------------------------------------ // Static and dynamic alignment control // @@ -183,7 +176,13 @@ //---------------------------------------------------------------------- - +// If we are compiling for GPU we should also disable vectorization because +// all the packet functions are not marked as __device__ functions. +#ifdef EIGEN_GPUCC +#ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif // if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into // account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 9d277e26f..c7dba1fc4 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -742,10 +742,6 @@ // All functions callable from CUDA/HIP code must be qualified with __device__ #ifdef EIGEN_GPUCC - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif - #define EIGEN_DEVICE_FUNC __host__ __device__ #else #define EIGEN_DEVICE_FUNC From 0bb15bb6d6a445edb9341c4970d854882347b6d7 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 3 Dec 2018 17:10:40 -0800 Subject: [PATCH 047/295] Update checks in ConfigureVectorization.h --- Eigen/src/Core/util/ConfigureVectorization.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index c482a0b14..c70ad894e 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -49,7 +49,13 @@ // If the user explicitly disable vectorization, then we also disable alignment #if defined(EIGEN_DONT_VECTORIZE) - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 + #if 
defined(EIGEN_GPUCC) + // GPU code is always vectorized and requires memory alignment for + // statically allocated buffers. + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 + #elif + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 + #endif #elif defined(__AVX512F__) // 64 bytes static alignment is preferred only if really required #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 @@ -176,14 +182,6 @@ //---------------------------------------------------------------------- -// If we are compiling for GPU we should also disable vectorization because -// all the packet functions are not marked as __device__ functions. -#ifdef EIGEN_GPUCC -#ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif -#endif - // if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into // account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks #if EIGEN_MAX_ALIGN_BYTES==0 @@ -210,7 +208,7 @@ #endif -#ifndef EIGEN_DONT_VECTORIZE +#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC)) #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) From 36f8f6d0be1543e12c87c6f33df46fe7bcecab87 Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Wed, 5 Dec 2018 12:29:03 +0100 Subject: [PATCH 048/295] Fix evalShardedByInnerDim for AVX512 builds evalShardedByInnerDim ensures that the values it passes for start_k and end_k to evalGemmPartialWithoutOutputKernel are multiples of 8 as the kernel does not work correctly when the values of k are not multiples of the packet_size. While this precaution works for AVX builds, it is insufficient for AVX512 builds where the maximum packet size is 16. The result is slightly incorrect float32 contractions on AVX512 builds. This commit fixes the problem by ensuring that k is always a multiple of the packet_size if the packet_size is > 8. 
--- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 24ba3e431..3946e2fc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -788,9 +788,11 @@ struct TensorEvaluatorm_i_size; const Index n = this->m_j_size; const Index k = this->m_k_size; - // The underlying GEMM kernel assumes that k is a multiple of 8 and - // subtle breakage occurs if this is violated. - Index block_size = 8 * divup(k, 8 * num_threads); + const Index packet_size = internal::packet_traits::size; + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. + Index block_size = kmultiple * divup(k, kmultiple * num_threads); Index num_blocks = divup(k, block_size); // we use 'result' for the first block's partial result. MaxSizeVector block_buffers(num_blocks - 1); @@ -805,9 +807,9 @@ struct TensorEvaluator 0; --blocks_left) { // The underlying GEMM kernel assumes that k is a multiple of packet size - // (currently largest packet size is 8) and subtle breakage occurs if + // (currently largest packet size is 16) and subtle breakage occurs if // this is violated. - block_size = 8 * divup(k - start, 8 * blocks_left); + block_size = kmultiple * divup(k - start, kmultiple * blocks_left); Scalar* buf; if (start == 0) { buf = result; From c1d356e8b4b84fa5c7172567d5529f52191ed85b Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 5 Dec 2018 15:01:04 +0100 Subject: [PATCH 049/295] bug #1635: Use infinity from Numtraits instead of creating it manually. 
--- Eigen/src/Core/ConditionEstimator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index 8c1a89129..51a2e5f1b 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) return RealScalar(1)/RealScalar(0); + if (dec.rows() == 0) return NumTraits::infinity(); if (matrix_norm == RealScalar(0)) return RealScalar(0); if (dec.rows() == 1) return RealScalar(1); const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); From e2e897298a670ea8024c8bb76894c45a9941cc1f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 5 Dec 2018 17:13:46 +0100 Subject: [PATCH 050/295] Fix page nesting --- doc/Manual.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/Manual.dox b/doc/Manual.dox index 0992cda9f..194164e97 100644 --- a/doc/Manual.dox +++ b/doc/Manual.dox @@ -108,7 +108,7 @@ namespace Eigen { /** \addtogroup CoeffwiseMathFunctions \ingroup DenseMatrixManipulation_chapter */ -/** \addtogroup SparseQuickRefPage +/** \addtogroup QuickRefPage \ingroup DenseMatrixManipulation_chapter */ From acc3459a49707c92ee96a710e05d7e18e144c780 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 5 Dec 2018 17:17:23 +0100 Subject: [PATCH 051/295] Add help messages in the quick ref/ascii docs regarding slicing, indexing, and reshaping. 
--- doc/AsciiQuickReference.txt | 11 +++++++++++ doc/QuickReference.dox | 11 +++++++++++ doc/eigendoxy.css | 7 +++++++ 3 files changed, 29 insertions(+) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index 0ca54cef3..18b4446c6 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -50,6 +50,12 @@ VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi // Matrix slicing and blocks. All expressions listed here are read/write. // Templated size versions are faster. Note that Matlab is 1-based (a size N // vector is x(1)...x(N)). +/******************************************************************************/ +/* PLEASE HELP US IMPROVING THIS SECTION */ +/* Eigen 3.4 supports a much improved API for sub-matrices, including, */ +/* slicing and indexing from arrays: */ +/* http://eigen.tuxfamily.org/dox-devel/group__TutorialSlicingIndexing.html */ +/******************************************************************************/ // Eigen // Matlab x.head(n) // x(1:n) x.head() // x(1:n) @@ -88,6 +94,11 @@ R.row(i) = P.col(j); // R(i, :) = P(:, j) R.col(j1).swap(mat1.col(j2)); // R(:, [j1 j2]) = R(:, [j2, j1]) // Views, transpose, etc; +/******************************************************************************/ +/* PLEASE HELP US IMPROVING THIS SECTION */ +/* Eigen 3.4 supports a new API for reshaping: */ +/* http://eigen.tuxfamily.org/dox-devel/group__TutorialReshape.html */ +/******************************************************************************/ // Eigen // Matlab R.adjoint() // R' R.transpose() // R.' or conj(R') // Read-write diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index 18c90a2a9..9c8e6fb4a 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -529,6 +529,12 @@ if((array1 < array2).any()) ... // if there exist a pair i,j such that array1(i, top\section QuickRef_Blocks Sub-matrices +
+PLEASE HELP US IMPROVE THIS SECTION. +%Eigen 3.4 supports a much improved API for sub-matrices, including +slicing and indexing from arrays: \ref TutorialSlicingIndexing +
+ Read-write access to a \link DenseBase::col(Index) column \endlink or a \link DenseBase::row(Index) row \endlink of a matrix (or array): \code @@ -584,6 +590,11 @@ Read-write access to sub-matrices: top\section QuickRef_Misc Miscellaneous operations +
+PLEASE HELP US IMPROVE THIS SECTION. +%Eigen 3.4 supports a new API for reshaping: \ref TutorialReshape +
+ \subsection QuickRef_Reverse Reverse Vectors, rows, and/or columns of a matrix can be reversed (see DenseBase::reverse(), DenseBase::reverseInPlace(), VectorwiseOp::reverse()). \code diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css index 6147c7154..6148655f3 100644 --- a/doc/eigendoxy.css +++ b/doc/eigendoxy.css @@ -188,6 +188,13 @@ span.cpp11,span.cpp14,span.cpp17 { font-weight: bold; } +div.warningbox { + max-width:60em; + border-style: solid solid solid solid; + border-color: red; + border-width: 3px; +} + /**** old Eigen's styles ****/ From 47d8b741b22739829d889c5519bd1fc60f39ef21 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 5 Dec 2018 13:19:31 -0800 Subject: [PATCH 052/295] #elif -> #else to fix GPU build. --- Eigen/src/Core/util/ConfigureVectorization.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index c70ad894e..2ba0bab98 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -53,7 +53,7 @@ // GPU code is always vectorized and requires memory alignment for // statically allocated buffers. #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 - #elif + #else #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 #endif #elif defined(__AVX512F__) From 1ac2695ef7e1fc8e147a37ad97391d7a2941c696 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 00:05:10 +0100 Subject: [PATCH 053/295] bug #1636: fix compilation with some ABI versions. 
--- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index cdd2b001b..f3d721dd7 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -641,16 +641,18 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { } template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = por(a.x,b.x); return r; + // in some cases Packet8i is a wrapper around __m256i, so we need to + // cast to Packet8i to call the correct overload. + Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pxor(a.x,b.x); return r; + Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pand(a.x,b.x); return r; + Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pandnot(a.x,b.x); return r; + Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { @@ -1077,16 +1079,18 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { } template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = por(a.x,b.x); return r; + // in some cases Packet4i is a wrapper around __m128i, so we either need to + // cast to Packet4i to directly call the intrinsics as below: + Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = pxor(a.x,b.x); return r; + Packet8h r; r.x = 
_mm_xor_si128(a.x,b.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = pand(a.x,b.x); return r; + Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = pandnot(a.x,b.x); return r; + Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } From 3fba59ea594eb26446352cb28813b38921439f23 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 00:13:26 +0100 Subject: [PATCH 054/295] temporarily re-disable SSE/AVX vectorization of complex<> on AVX512 -> this needs to be fixed though! --- Eigen/Core | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index bc6cf8a96..0e23247d3 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -154,10 +154,13 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/TypeCasting.h" - #include "src/Core/arch/SSE/Complex.h" + // FIXME: this needs to be fixed (compilation issue if included) + // there is no reason to disable SSE/AVX vectorization + // of complex<> while enabling AVX512. 
+ // #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/TypeCasting.h" - #include "src/Core/arch/AVX/Complex.h" + // #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" From c53eececb0415834b961cb61cd466907261b4b2f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 15:58:06 +0100 Subject: [PATCH 055/295] Implement AVX512 vectorization of std::complex --- Eigen/Core | 8 +- Eigen/src/Core/GenericPacketMath.h | 1 + Eigen/src/Core/arch/AVX/Complex.h | 4 + Eigen/src/Core/arch/AVX512/Complex.h | 442 ++++++++++++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 1 + Eigen/src/Core/arch/SSE/PacketMath.h | 12 +- .../Core/products/GeneralBlockPanelKernel.h | 167 +++++-- test/packetmath.cpp | 17 +- 8 files changed, 587 insertions(+), 65 deletions(-) create mode 100644 Eigen/src/Core/arch/AVX512/Complex.h diff --git a/Eigen/Core b/Eigen/Core index 0e23247d3..759b1bb80 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -154,14 +154,12 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/TypeCasting.h" - // FIXME: this needs to be fixed (compilation issue if included) - // there is no reason to disable SSE/AVX vectorization - // of complex<> while enabling AVX512. 
- // #include "src/Core/arch/SSE/Complex.h" + #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/TypeCasting.h" - // #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" + #include "src/Core/arch/AVX512/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/MathFunctions.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 9c2a437bf..2b2ee9e2c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -56,6 +56,7 @@ struct default_packet_traits HasConj = 1, HasSetLinear = 1, HasBlend = 0, + HasReduxp = 1, HasDiv = 0, HasSqrt = 0, diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 7fa61969d..2bb40fc79 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -22,6 +22,7 @@ struct Packet4cf __m256 v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet4cf type; @@ -44,6 +45,7 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; @@ -228,6 +230,7 @@ struct Packet2cd __m256d v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet2cd type; @@ -250,6 +253,7 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h new file mode 100644 index 000000000..2820abb49 --- /dev/null +++ 
b/Eigen/src/Core/arch/AVX512/Complex.h @@ -0,0 +1,442 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_AVX512_H +#define EIGEN_COMPLEX_AVX512_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet8cf +{ + EIGEN_STRONG_INLINE Packet8cf() {} + EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {} + __m512 v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet8cf type; + typedef Packet4cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasReduxp = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + enum { + size = 8, + alignment=unpacket_traits::alignment + }; + typedef Packet4cf half; +}; + +template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) +{ + return Packet8cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) +{ + const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000, + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet8cf(_mm512_xor_ps(a.v,mask)); +} + +template<> 
EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) +{ + __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) +{ + return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploaddup((const double*)(const void*)from )) ); +} +template<> EIGEN_STRONG_INLINE Packet8cf ploadquad(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploadquad((const double*)(const void*)from )) ); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather, 
Packet8cf>(const std::complex* from, Index stride) +{ + return Packet8cf(_mm512_castpd_ps(pgather((const double*)(const void*)from, stride))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet8cf>(std::complex* to, const Packet8cf& from, Index stride) +{ + pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet8cf& a) +{ + return pfirst(Packet2cf(_mm512_castps512_ps128(a.v))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { + return Packet8cf(_mm512_castsi512_ps( + _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), + _mm512_castps_si512(a.v)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) +{ + return predux(padd(Packet4cf(_mm512_extractf32x8_ps(a.v,0)), + Packet4cf(_mm512_extractf32x8_ps(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) +{ + return predux_mul(pmul(Packet4cf(_mm512_extractf32x8_ps(a.v, 0)), + Packet4cf(_mm512_extractf32x8_ps(a.v, 1)))); +} + +template <> +EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) { + __m256 lane0 = _mm512_extractf32x8_ps(a.v, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a.v, 1); + __m256 res = _mm256_add_ps(lane0, lane1); + return Packet4cf(res); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) 
const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) + +template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) +{ + Packet8cf num = pmul(a, pconj(b)); + __m512 tmp = _mm512_mul_ps(b.v, b.v); + __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1); + __m512 denom = _mm512_add_ps(tmp, tmp2); + return Packet8cf(_mm512_div_ps(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& x) +{ + return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} + +//---------- double ---------- +struct Packet4cd +{ + EIGEN_STRONG_INLINE Packet4cd() {} + EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {} + __m512d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet4cd type; + typedef Packet2cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasReduxp = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + enum { + size = 4, + alignment = unpacket_traits::alignment + }; + typedef Packet2cd half; +}; + +template<> EIGEN_STRONG_INLINE Packet4cd padd(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd psub(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); } 
+template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) +{ + const __m512d mask = _mm512_castsi512_pd( + _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0, + 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet4cd(_mm512_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) +{ + __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0); + __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF); + __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55); + __m512d odd = _mm512_mul_pd(tmp2, tmp3); + return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) +{ + #ifdef EIGEN_VECTORIZE_AVX512DQ + return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); + #else + return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); + #endif +} + +template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { + return Packet4cd(_mm512_insertf64x4( + 
_mm512_castpd256_pd512(ploaddup(from).v), ploaddup(from+1).v, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather, Packet4cd>(const std::complex* from, Index stride) +{ + return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512( + _mm256_insertf128_pd(_mm256_castpd128_pd256(pload(from+0*stride).v), pload(from+1*stride).v,1)), + _mm256_insertf128_pd(_mm256_castpd128_pd256(pload(from+2*stride).v), pload(from+3*stride).v,1), 1)); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cd>(std::complex* to, const Packet4cd& from, Index stride) +{ + __m512i fromi = _mm512_castpd_si512(from.v); + double* tod = (double*)(void*)to; + _mm_store_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); + _mm_store_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); + _mm_store_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); + _mm_store_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cd& a) +{ + __m128d low = _mm512_extractf64x2_pd(a.v, 0); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex(res[0],res[1]); +} + +template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { + return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cd& a) +{ + return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cd& a) +{ + return 
predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d) + +template<> EIGEN_STRONG_INLINE Packet4cd pdiv(const Packet4cd& a, const Packet4cd& b) +{ + Packet4cd num = pmul(a, pconj(b)); + __m512d tmp = _mm512_mul_pd(b.v, b.v); + __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp); + return Packet4cd(_mm512_div_pd(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x) +{ + return Packet4cd(_mm512_permute_pd(x.v,0x55)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranspose(reinterpret_cast&>(kernel)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranspose(reinterpret_cast&>(kernel)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m512d T0 = 
_mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1] + __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3] + __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1] + __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3] + + kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3] + kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2] + kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1] + kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0] +} + +template<> EIGEN_STRONG_INLINE Packet8cf pinsertfirst(const Packet8cf& a, std::complex b) +{ + Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,0)); + tmp = pinsertfirst(tmp, b); + return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 0) ); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pinsertfirst(const Packet4cd& a, std::complex b) +{ + return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1(b).v), 0) )); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pinsertlast(const Packet8cf& a, std::complex b) +{ + Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,3) ); + tmp = pinsertlast(tmp, b); + return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 3) ); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pinsertlast(const Packet4cd& a, std::complex b) +{ + return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1(b).v), 3) )); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX512_H diff --git 
a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 1d38fb758..ce453f019 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -417,6 +417,7 @@ EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { } #ifdef EIGEN_VECTORIZE_AVX512DQ +// FIXME: this does not look optimal, better load a Packet4d and shuffle... // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, // a3} template <> diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 0e8e0d2b3..b073fc8d4 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -61,20 +61,22 @@ template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; template<> struct is_arithmetic<__m128d> { enum { value = true }; }; +#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p)) + #define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) + (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))) #define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) + (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))) #define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) + (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1)))) #define vec4f_swizzle2(a,b,p,q,r,s) \ - (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) + (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))) #define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) + (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), 
_mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9ca865bd1..9475a6ecc 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -506,13 +506,28 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { + loadRhsQuad_impl(b,dest, typename conditional::type()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const + { + // FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const + { + eigen_internal_assert(RhsPacketSize<=8); dest = pset1(*b); } @@ -521,9 +536,10 @@ public: dest = pload(a); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu(a); + dest = ploadu(a); } EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -536,12 +552,14 @@ public: // pbroadcast2(b, b0, b1); // } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const { madd_impl(a, b, c, tmp, typename 
conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -556,13 +574,14 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + const conj_helper cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper cj; }; template @@ -581,13 +600,57 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + template -const DoublePacket& predux_half_dowto4(const DoublePacket &a) +const DoublePacket& +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size<=8>::type* = 0) { return a; } -template struct unpacket_traits > { typedef DoublePacket half; }; +template +DoublePacket::half> +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket::half> res; + typedef std::complex::type> Cplx; + typedef typename packet_traits::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size<=8>::type* = 0) +{ + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); +} + +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size==16>::type* = 0) +{ + // yes, 
that's pretty hackish too :( + typedef typename NumTraits::Real RealScalar; + RealScalar r[4] = {real(b[0]), real(b[0]), real(b[1]), real(b[1])}; + RealScalar i[4] = {imag(b[0]), imag(b[0]), imag(b[1]), imag(b[1])}; + dest.first = ploadquad(r); + dest.second = ploadquad(i); +} + + +template struct unpacket_traits > { + typedef DoublePacket::half> half; +}; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) // { @@ -611,10 +674,10 @@ public: ConjRhs = _ConjRhs, Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + RealPacketSize = Vectorizable ? 
packet_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -626,7 +689,7 @@ public: typedef typename packet_traits::type RealPacket; typedef typename packet_traits::type ScalarPacket; - typedef DoublePacket DoublePacketType; + typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; @@ -643,16 +706,17 @@ public: } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pset1(real(*b)); - dest.second = pset1(imag(*b)); + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const @@ -661,8 +725,7 @@ public: } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + loadQuadToDoublePacket(b,dest); } EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -696,12 +759,14 @@ public: dest = pload((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); + dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const 
RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -714,29 +779,30 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -789,9 +855,10 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -813,21 +880,23 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + dest = ploadquad(b); 
} - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup(a); + dest = ploaddup(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -843,13 +912,15 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + const conj_helper cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper cj; + }; @@ -1649,7 +1720,7 @@ void gebp_kernel::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress<=16) && - (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; @@ -1704,7 +1775,7 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type 
SRhsPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); @@ -1734,7 +1805,7 @@ void gebp_kernel p; - p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); } else { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 144083f1b..60c9dbc36 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -314,15 +314,18 @@ template void packetmath() ref[0] *= data1[i]; VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload(data1))) && "internal::predux_mul"); - for (int j=0; j(data1+j*PacketSize); + for (int j=0; j(data1+j*PacketSize); + } + internal::pstore(data2, internal::preduxp(packets)); + VERIFY(areApproxAbs(ref, data2, PacketSize, refvalue) && "internal::preduxp"); } - internal::pstore(data2, internal::preduxp(packets)); - VERIFY(areApproxAbs(ref, data2, PacketSize, refvalue) && "internal::preduxp"); for (int i=0; i Date: Thu, 6 Dec 2018 16:55:00 +0100 Subject: [PATCH 056/295] fix test regarding AVX512 vectorization of complexes. --- test/vectorization_logic.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index e2146eef3..4bf3b3db2 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -187,18 +187,19 @@ struct vectorization_logic VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector3(),Vector3()+Vector3(), - EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal), CompleteUnrolling)); + sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? 
LinearVectorizedTraversal : LinearTraversal), CompleteUnrolling)); VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), - EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? SliceVectorizedTraversal : LinearTraversal), - ((!EIGEN_UNALIGNED_VECTORIZE) && HalfPacketSize==1) ? NoUnrolling : CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal) + : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal), + ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling)); VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - HalfPacketSize==1 ? InnerVectorizedTraversal : + sizeof(Scalar)==16 ? InnerVectorizedTraversal : EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : - LinearTraversal, + LinearTraversal, NoUnrolling)); VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling)); @@ -290,10 +291,6 @@ struct vectorization_logic_half typedef Matrix Matrix57; typedef Matrix Matrix35; typedef Matrix Matrix57u; -// typedef Matrix Matrix44; -// typedef Matrix Matrix44u; -// typedef Matrix Matrix44c; -// typedef Matrix Matrix44r; typedef Matrix(),Matrix()+Matrix(), - EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal, + sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? 
LinearVectorizedTraversal : LinearTraversal), NoUnrolling)); VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), From 1d683ae2f5a340a6e2681c8cd0782f4db6b807ea Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 18:11:07 +0100 Subject: [PATCH 057/295] Fix compilation with avx512f only, i.e., no AVX512DQ --- Eigen/src/Core/arch/AVX512/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2820abb49..8750b07de 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -250,7 +250,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) const __m512d mask = _mm512_castsi512_pd( _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0, 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); - return Packet4cd(_mm512_xor_pd(a.v,mask)); + return Packet4cd(pxor(a.v,mask)); } template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) From cbf2f4b7a0da0719cfb4fac3e7fc9b2c6b5f52bd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 18:21:56 +0100 Subject: [PATCH 058/295] AVX512f includes FMA but GCC does not define __FMA__ with -mavx512f only --- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- Eigen/src/Core/arch/AVX512/PacketMath.h | 4 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 476de4fd4..0a1b63155 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,7 +22,7 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h 
b/Eigen/src/Core/arch/AVX512/PacketMath.h index ce453f019..cf30f783e 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -22,7 +22,7 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -223,7 +223,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, return _mm512_div_pd(a, b); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b073fc8d4..971a31a4d 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,7 +22,7 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif From 4e7746fe2234ccaa10cb0c9431d68ee72a1d1d40 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 7 Dec 2018 09:15:46 +0100 Subject: [PATCH 059/295] bug #1636: fix gemm performance issue with gcc>=6 and no FMA --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9475a6ecc..88ca9cc97 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1387,10 +1387,18 @@ void gebp_kernel=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND asm("" : [a0] "+x" (A0), [a1] "+x" (A1) ); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif + #define 
EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ From ae59a7652bae0727a983d586395bcaf2c48385cc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 7 Dec 2018 09:23:28 +0100 Subject: [PATCH 060/295] bug #1638: add a warning if avx512 is enabled without SSE/AVX FMA --- Eigen/src/Core/util/ConfigureVectorization.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 2ba0bab98..2584717b5 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -254,6 +254,13 @@ #define EIGEN_VECTORIZE_FMA #endif #if defined(__AVX512F__) + #ifndef __FMA__ + #if EIGEN_COMP_GNUC + #warning Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). + #else + #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). 
+ #endif + #endif #define EIGEN_VECTORIZE_AVX512 #define EIGEN_VECTORIZE_AVX2 #define EIGEN_VECTORIZE_AVX From f233c6194db032f31c14d06b5e962197b66ea296 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 7 Dec 2018 10:01:09 +0100 Subject: [PATCH 061/295] bug #1637: workaround register spilling in gebp with clang>=6.0+AVX+FMA --- Eigen/src/Core/arch/AVX/PacketMath.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 0a1b63155..35c821472 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -183,11 +183,12 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co #ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { -#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) - // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, - // and gcc stupidly generates a vfmadd132ps instruction, - // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate - // the result of the product. +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) + // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, + // and even register spilling with clang>=6.0 (bug 1637). + // Gcc stupidly generates a vfmadd132ps instruction. + // So let's enforce it to generate a vfmadd231ps instruction since the most common use + // case is to accumulate the result of the product. 
Packet8f res = c; __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); return res; @@ -196,7 +197,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& #endif } template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // see above Packet4d res = c; __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); From 7b6d0ff1f6afdb7f6137c92a76d20e7497e2fa2a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 7 Dec 2018 15:14:50 +0100 Subject: [PATCH 062/295] Enable FMA with MSVC (through /arch:AVX2). To make this possible, I also had to turn the #warning regarding AVX512-FMA to a #error. --- CMakeLists.txt | 15 ++++++++++++++- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++-- Eigen/src/Core/util/ConfigureVectorization.h | 8 +++++--- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45cbc75ee..e18428e50 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,7 +250,7 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DEIGEN_ENABLE_AVX512") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma -DEIGEN_ENABLE_AVX512") if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") endif() @@ -350,6 +350,19 @@ else(NOT MSVC) endif(NOT CMAKE_CL_64) message(STATUS "Enabling SSE2 in tests/examples") endif(EIGEN_TEST_SSE2) + + option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF) + if(EIGEN_TEST_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") + message(STATUS "Enabling AVX in
tests/examples") + endif() + + option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF) + if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + message(STATUS "Enabling FMA/AVX2 in tests/examples") + endif() + endif(NOT MSVC) option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 35c821472..9bc986575 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -181,7 +181,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 971a31a4d..b80b7f46a 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -266,7 +266,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif @@ -1013,7 +1013,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) } // Scalar path for pmadd with FMA to ensure 
consistency with vectorized path. -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { return ::fmaf(a,b,c); } diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 2584717b5..121476394 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -250,15 +250,17 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif - #ifdef __FMA__ + #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__)) + // MSVC does not expose a switch dedicated for FMA + // For MSVC, AVX2 => FMA #define EIGEN_VECTORIZE_FMA #endif #if defined(__AVX512F__) #ifndef __FMA__ #if EIGEN_COMP_GNUC - #warning Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). + #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). #else - #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). + #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). #endif #endif #define EIGEN_VECTORIZE_AVX512 From 956678a4eff96979ab2139e9ce14f98bb820aa9d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 7 Dec 2018 18:03:36 +0100 Subject: [PATCH 063/295] bug #1515: disable gebp's 3pX4 micro kernel for MSVC<=19.14 because of register spilling. 
--- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 88ca9cc97..bcb332cb9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -370,10 +370,12 @@ public: // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) // we assume 16 registers // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 
3*LhsPacketSize : default_mr, #else mr = default_mr, From efaf03bf96bdde4fc48ca7aacb1043bb5ae09b74 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 8 Dec 2018 00:05:03 +0100 Subject: [PATCH 064/295] Fix noise in lu unit test --- test/lu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lu.cpp b/test/lu.cpp index effde6060..24bea784a 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -90,7 +90,7 @@ template void lu_non_invertible() VERIFY(!lu.isInjective()); VERIFY(!lu.isInvertible()); VERIFY(!lu.isSurjective()); - VERIFY((m1 * m1kernel).isMuchSmallerThan(m1)); + VERIFY_IS_MUCH_SMALLER_THAN((m1 * m1kernel), m1); VERIFY(m1image.fullPivLu().rank() == rank); VERIFY_IS_APPROX(m1 * m1.adjoint() * m1image, m1image); From cd25b538abff7370100ef5613b8138919f89c7b0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 8 Dec 2018 00:13:37 +0100 Subject: [PATCH 065/295] Fix noise in sparse_basic_3 (numerical cancellation) --- test/sparse_basic.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 30d3f1bba..30bba3f07 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -658,7 +658,8 @@ void big_sparse_triplet(Index rows, Index cols, double density) { { Index r = internal::random(0,rows-1); Index c = internal::random(0,cols-1); - Scalar v = internal::random(); + // use positive values to prevent numerical cancellation errors in sum + Scalar v = numext::abs(internal::random()); triplets.push_back(TripletType(r,c,v)); sum += v; } From 426bce7529f148bbec3fd386ddf6d6c4880de347 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 8 Dec 2018 09:44:21 +0100 Subject: [PATCH 066/295] fix EIGEN_GEBP_2PX4_SPILLING_WORKAROUND for non vectorized type, and non x86/64 target --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h 
b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bcb332cb9..1197fec94 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1389,11 +1389,10 @@ void gebp_kernel=6 without FMA (bug 1637) #if EIGEN_GNUC_AT_LEAST(6,0) - #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND asm("" : [a0] "+x" (A0), [a1] "+x" (A1) ); + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+rm" (A0),[a1] "+rm" (A1)); #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND #endif From 81c27325ae3b5b8cbc72762f74ecb7b82cd031f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 8 Dec 2018 14:27:48 +0100 Subject: [PATCH 067/295] bug #1641: fix testing of pandnot and fix pandnot for complex on SSE/AVX/AVX512 --- Eigen/src/Core/arch/AVX/Complex.h | 4 ++-- Eigen/src/Core/arch/AVX512/Complex.h | 4 ++-- Eigen/src/Core/arch/SSE/Complex.h | 4 ++-- test/packetmath.cpp | 26 ++++++++++++++++++++------ 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 2bb40fc79..08d021b65 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,7 +72,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE 
Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } @@ -279,7 +279,7 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 8750b07de..247f89860 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -78,7 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, con template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return 
Packet8cf(_mm512_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } @@ -265,7 +265,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, con template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index d075043ce..0f8960328 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -85,7 +85,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const 
Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(&numext::real_ref(*from))); } @@ -308,7 +308,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); } // FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 60c9dbc36..916b37bef 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -64,6 +64,10 @@ struct bit_andnot{ operator()(T a, T b) const { return a & (~b); } }; EIGEN_TEST_MAKE_BITWISE(andnot, bit_andnot()) +template +bool biteq(T a, T b) { + return (bits(a) == bits(b)).all(); +} } } @@ -92,7 +96,7 @@ template bool areApprox(const Scalar* a, const Scalar* b, int s { for (int i=0; i 
>(a,size) << "]" << " != vec: [" << Map >(b,size) << "]\n"; return false; @@ -344,11 +348,6 @@ template void packetmath() } } - CHECK_CWISE2_IF(true, internal::por, internal::por); - CHECK_CWISE2_IF(true, internal::pxor, internal::pxor); - CHECK_CWISE2_IF(true, internal::pand, internal::pand); - CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot); - if (PacketTraits::HasBlend) { Packet thenPacket = internal::pload(data1); Packet elsePacket = internal::pload(data2); @@ -384,6 +383,21 @@ template void packetmath() internal::pstore(data2, internal::pinsertlast(internal::pload(data1),s)); VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertlast"); } + + { + for (int i=0; i(); + unsigned char v = internal::random() ? 0xff : 0; + char* bytes = (char*)(data1+PacketSize+i); + for(int k=0; k void packetmath_real() From bff90bf270b330612e4d7e4fdba96b3671826208 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 8 Dec 2018 18:58:28 +0100 Subject: [PATCH 068/295] workaround "may be used uninitialized" warning --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1197fec94..61521e2bb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1399,7 +1399,6 @@ void gebp_kernel Date: Sat, 8 Dec 2018 18:59:51 +0100 Subject: [PATCH 069/295] Enable "old" CMP0026 policy (not perfect, but better than dozens of warning) --- lapack/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 522ba8a2b..9b0d8638c 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -136,6 +136,7 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) add_subdirectory(testing/MATGEN) add_subdirectory(testing/LIN) add_subdirectory(testing/EIG) + cmake_policy(SET CMP0026 OLD) 
macro(add_lapack_test output input target) set(TEST_INPUT "${LAPACK_SOURCE_DIR}/testing/${input}") set(TEST_OUTPUT "${LAPACK_BINARY_DIR}/testing/${output}") From 450dc97c6b14cd738def377d8b04c12427c6449a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 9 Dec 2018 22:54:39 +0100 Subject: [PATCH 070/295] Various fixes in polynomial solver and its unit tests: - cleanup noise in imaginary part of real roots - take into account the magnitude of the derivative to check roots. - use <= instead of < at appropriate places --- unsupported/Eigen/src/Polynomials/Companion.h | 3 +- .../Eigen/src/Polynomials/PolynomialSolver.h | 30 ++++++++++++++++--- unsupported/test/polynomialsolver.cpp | 21 +++++++++++-- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h index 126be783b..6ab8f9714 100644 --- a/unsupported/Eigen/src/Polynomials/Companion.h +++ b/unsupported/Eigen/src/Polynomials/Companion.h @@ -75,8 +75,7 @@ class companion void setPolynomial( const VectorType& poly ) { const Index deg = poly.size()-1; - m_monic = Scalar(-1)/poly[deg] * poly.head(deg); - //m_bl_diag.setIdentity( deg-1 ); + m_monic = -poly.head(deg)/poly[deg]; m_bl_diag.setOnes(deg-1); } diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h index 788594247..d4f737134 100644 --- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h @@ -126,7 +126,7 @@ class PolynomialSolverBase for( Index i=0; i typedef typename internal::conditional::IsComplex, ComplexEigenSolver, EigenSolver >::type EigenSolverType; + typedef typename internal::conditional::IsComplex, Scalar, std::complex >::type ComplexScalar; public: /** Computes the complex roots of a new polynomial. 
*/ @@ -354,6 +355,27 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg> companion.balance(); m_eigenSolver.compute( companion.denseMatrix() ); m_roots = m_eigenSolver.eigenvalues(); + MatrixXcd A = companion.denseMatrix(); + // cleanup noise in imaginary part of real roots: + // if the imaginary part is rather small compared to the real part + // and that cancelling the imaginary part yield a smaller evaluation, + // then it's safe to keep the real part only. + RealScalar coarse_prec = std::pow(4,poly.size()+1)*NumTraits::epsilon(); + std::cout << coarse_prec << "\n"; + for(Index i = 0; i +PolynomialType polyder(const PolynomialType& p) +{ + typedef typename PolynomialType::Scalar Scalar; + PolynomialType res(p.size()); + for(Index i=1; i bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) @@ -44,10 +54,17 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) psolve.compute( pols ); const RootsType& roots( psolve.roots() ); EvalRootsType evr( deg ); + POLYNOMIAL pols_der = polyder(pols); + EvalRootsType der( deg ); for( int i=0; i() ); + // we need to divide by the magnitude of the derivative because + // with a high derivative is very small error in the value of the root + // yiels a very large error in the polynomial evaluation. + bool evalToZero = (evr.cwiseQuotient(der)).isZero( test_precision() ); if( !evalToZero ) { cerr << "WRONG root: " << endl; From cf697272e1326e5ceecfd876ea67943b88425521 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 9 Dec 2018 23:05:46 +0100 Subject: [PATCH 071/295] Remove debug code. 
--- unsupported/Eigen/src/Polynomials/PolynomialSolver.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h index d4f737134..5e0ecbb43 100644 --- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h @@ -355,13 +355,11 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg> companion.balance(); m_eigenSolver.compute( companion.denseMatrix() ); m_roots = m_eigenSolver.eigenvalues(); - MatrixXcd A = companion.denseMatrix(); // cleanup noise in imaginary part of real roots: // if the imaginary part is rather small compared to the real part // and that cancelling the imaginary part yield a smaller evaluation, // then it's safe to keep the real part only. - RealScalar coarse_prec = std::pow(4,poly.size()+1)*NumTraits::epsilon(); - std::cout << coarse_prec << "\n"; + RealScalar coarse_prec = RealScalar(std::pow(4,poly.size()+1))*NumTraits::epsilon(); for(Index i = 0; i Date: Mon, 10 Dec 2018 23:22:44 +0100 Subject: [PATCH 072/295] enable spilling workaround on architectures with SSE/AVX --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 61521e2bb..b1e98b6f9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1391,7 +1391,7 @@ void gebp_kernel=6 without FMA (bug 1637) - #if EIGEN_GNUC_AT_LEAST(6,0) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+rm" (A0),[a1] "+rm" (A1)); #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND From 7166496f7011e63ff90cbb8b1b41642aaa7dbcc3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 11 Dec 2018 13:24:42 +0100 
Subject: [PATCH 073/295] bug #1643: fix compilation issue with gcc and no optimizaion --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index b1e98b6f9..3aaa68c4c 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1392,7 +1392,7 @@ void gebp_kernel=6 without FMA (bug 1637) #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) - #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+rm" (A0),[a1] "+rm" (A1)); + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND #endif From 0a7e7af6fdd46aae6c56d1868f2cda5c9f4efa70 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 11 Dec 2018 15:33:17 +0100 Subject: [PATCH 074/295] Properly set the number of registers for AVX512 --- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/AVX512/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 9bc986575..e5aeb6375 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -18,8 +18,8 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif #ifdef EIGEN_VECTORIZE_FMA diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index cf30f783e..10284dd7c 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ 
b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -19,7 +19,7 @@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif #ifdef EIGEN_VECTORIZE_FMA diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b80b7f46a..3e7a75bc0 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -18,7 +18,9 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +// 32 bits => 8 registers +// 64 bits => 16 registers #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif From f159cf3d750a7930a29abf172d9436550cc8369f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 11 Dec 2018 15:36:27 +0100 Subject: [PATCH 075/295] Artificially increase l1-blocking size for AVX512. +10% speedup with current kernels. With a 6pX4 kernel (not committed yet), this provides a +20% speedup. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3aaa68c4c..968cec78b 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -101,6 +101,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. 
+ // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -372,7 +382,7 @@ public: default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) - // we assume 16 registers + // we assume 16 registers or more // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. // Bug 1515: MSVC prior to v19.14 yields to register spilling. 
From 37c91e18368e77a333afd2f5a1fd52026014fca5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 11 Dec 2018 22:07:20 +0100 Subject: [PATCH 076/295] bug #1644: fix warning --- Eigen/src/LU/arch/Inverse_SSE.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h index ebb64a62b..4dce2ef20 100644 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ b/Eigen/src/LU/arch/Inverse_SSE.h @@ -44,7 +44,7 @@ struct compute_inverse_size4 static void run(const MatrixType& mat, ResultType& result) { ActualMatrixType matrix(mat); - EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 }; + const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000)); // Load the full matrix into registers __m128 _L1 = matrix.template packet( 0); @@ -139,7 +139,7 @@ struct compute_inverse_size4 iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP)); + rd = _mm_xor_ps(rd, p4f_sign_PNNP); // iB = C*|B| - D*B#*A iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); From 72c0bbe2bd1c49c75b6efdb81d0558f8b62578d1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 12 Dec 2018 15:48:36 +0100 Subject: [PATCH 077/295] Simplify handling of tests that must fail to compile. Each test is now a normal ctest target, and build properties (compiler+flags) are preserved (instead of starting a new build-dir from scratch). 
--- CMakeLists.txt | 5 +---- cmake/EigenTesting.cmake | 45 ++++++++++++++++++---------------------- failtest/CMakeLists.txt | 10 --------- test/CMakeLists.txt | 5 ----- 4 files changed, 21 insertions(+), 44 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e18428e50..5255e9600 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -519,10 +519,7 @@ message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -option(EIGEN_FAILTEST "Enable failtests." OFF) -if(EIGEN_FAILTEST) - add_subdirectory(failtest) -endif() +add_subdirectory(failtest) string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) if(cmake_generator_tolower MATCHES "makefile") diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 35deed509..8cb2d5492 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -334,37 +334,32 @@ endmacro(ei_add_test_sycl) # note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON # so here we're just running CMake commands immediately, we're not adding any targets. 
macro(ei_add_failtest testname) - get_property(EIGEN_FAILTEST_FAILURE_COUNT GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT) - get_property(EIGEN_FAILTEST_COUNT GLOBAL PROPERTY EIGEN_FAILTEST_COUNT) - message(STATUS "Checking failtest: ${testname}") - set(filename "${testname}.cpp") - file(READ "${filename}" test_source) + set(test_target_ok ${testname}_ok) + set(test_target_ko ${testname}_ko) - try_compile(succeeds_when_it_should_fail - "${CMAKE_CURRENT_BINARY_DIR}" - "${CMAKE_CURRENT_SOURCE_DIR}/${filename}" - COMPILE_DEFINITIONS "-DEIGEN_SHOULD_FAIL_TO_BUILD") - if (succeeds_when_it_should_fail) - message(STATUS "FAILED: ${testname} build succeeded when it should have failed") - endif() + # Add executables + add_executable(${test_target_ok} ${testname}.cpp) + add_executable(${test_target_ko} ${testname}.cpp) - try_compile(succeeds_when_it_should_succeed - "${CMAKE_CURRENT_BINARY_DIR}" - "${CMAKE_CURRENT_SOURCE_DIR}/${filename}" - COMPILE_DEFINITIONS) - if (NOT succeeds_when_it_should_succeed) - message(STATUS "FAILED: ${testname} build failed when it should have succeeded") - endif() + # Remove them from the normal build process + set_target_properties(${test_target_ok} ${test_target_ko} PROPERTIES + EXCLUDE_FROM_ALL TRUE + EXCLUDE_FROM_DEFAULT_BUILD TRUE) - if (succeeds_when_it_should_fail OR NOT succeeds_when_it_should_succeed) - math(EXPR EIGEN_FAILTEST_FAILURE_COUNT ${EIGEN_FAILTEST_FAILURE_COUNT}+1) - endif() + # Configure the failing test + target_compile_definitions(${test_target_ko} PRIVATE EIGEN_SHOULD_FAIL_TO_BUILD) - math(EXPR EIGEN_FAILTEST_COUNT ${EIGEN_FAILTEST_COUNT}+1) + # Add the tests to ctest. + add_test(NAME ${test_target_ok} + COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $ + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_test(NAME ${test_target_ko} + COMMAND ${CMAKE_COMMAND} --build . 
--target ${test_target_ko} --config $ + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) - set_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT ${EIGEN_FAILTEST_FAILURE_COUNT}) - set_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT ${EIGEN_FAILTEST_COUNT}) + # Expect the second test to fail + set_tests_properties(${test_target_ko} PROPERTIES WILL_FAIL TRUE) endmacro(ei_add_failtest) # print a summary of the different options diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt index 1a73f05e6..f95503d7e 100644 --- a/failtest/CMakeLists.txt +++ b/failtest/CMakeLists.txt @@ -1,4 +1,3 @@ -message(STATUS "Running the failtests") ei_add_failtest("failtest_sanity_check") @@ -64,12 +63,3 @@ ei_add_failtest("bdcsvd_int") ei_add_failtest("eigensolver_int") ei_add_failtest("eigensolver_cplx") -if (EIGEN_FAILTEST_FAILURE_COUNT) - message(FATAL_ERROR - "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED. " - "To debug these failures, manually compile these programs in ${CMAKE_CURRENT_SOURCE_DIR}, " - "with and without #define EIGEN_SHOULD_FAIL_TO_BUILD.") -else() - message(STATUS "Failtest SUCCESS: all ${EIGEN_FAILTEST_COUNT} failtests passed.") - message(STATUS "") -endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f215d97cd..67b35a61e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -439,11 +439,6 @@ if (EIGEN_TEST_HIP) endif(EIGEN_TEST_HIP) - - -file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests) -add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON) - option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF) IF(EIGEN_TEST_BUILD_DOCUMENTATION) add_dependencies(buildtests doc) From 2de8da70fd0b35849845dc76b2741bb0689f0643 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 12 Dec 2018 17:30:08 +0100 Subject: [PATCH 078/295] bug #1557: fix RealSchur and 
EigenSolver for matrices with only zeros on the diagonal. --- Eigen/src/Eigenvalues/RealSchur.h | 15 +++++-- test/eigensolver_generic.cpp | 74 +++++++++++++++++++++++++++---- 2 files changed, 77 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index aca8a8279..8dbd9e314 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -236,7 +236,7 @@ template class RealSchur typedef Matrix Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu); + Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -307,12 +307,16 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Index totalIter = 0; // iteration count for whole matrix Scalar exshift(0); // sum of exceptional shifts Scalar norm = computeNormOfT(); + // sub-diagonal entries smaller than considerAsZero will be treated as zero. + // We use eps^2 to enable more precision in small eigenvalues. 
+ Scalar considerAsZero = numext::maxi( norm * numext::abs2(NumTraits::epsilon()), + (std::numeric_limits::min)() ); if(norm!=Scalar(0)) { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu); + Index il = findSmallSubdiagEntry(iu,considerAsZero); // Check for convergence if (il == iu) // One root found @@ -369,14 +373,17 @@ inline typename MatrixType::Scalar RealSchur::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template -inline Index RealSchur::findSmallSubdiagEntry(Index iu) +inline Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (abs(m_matT.coeff(res,res-1)) <= NumTraits::epsilon() * s) + + s = numext::maxi(s * NumTraits::epsilon(), considerAsZero); + + if (abs(m_matT.coeff(res,res-1)) <= s) break; res--; } diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index e0e435151..086ecdf5e 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -12,6 +12,21 @@ #include #include +template +void check_eigensolver_for_given_mat(const EigType &eig, const MatType& a) +{ + typedef typename NumTraits::Real RealScalar; + typedef Matrix RealVectorType; + typedef typename std::complex Complex; + Index n = a.rows(); + VERIFY_IS_EQUAL(eig.info(), Success); + VERIFY_IS_APPROX(a * eig.pseudoEigenvectors(), eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()); + VERIFY_IS_APPROX(a.template cast() * eig.eigenvectors(), + eig.eigenvectors() * eig.eigenvalues().asDiagonal()); + VERIFY_IS_APPROX(eig.eigenvectors().colwise().norm(), RealVectorType::Ones(n).transpose()); + VERIFY_IS_APPROX(a.eigenvalues(), eig.eigenvalues()); +} + template void eigensolver(const MatrixType& m) { /* this test covers the following files: @@ -22,8 +37,7 @@ template void eigensolver(const MatrixType& m) typedef typename MatrixType::Scalar 
Scalar; typedef typename NumTraits::Real RealScalar; - typedef Matrix RealVectorType; - typedef typename std::complex::Real> Complex; + typedef typename std::complex Complex; MatrixType a = MatrixType::Random(rows,cols); MatrixType a1 = MatrixType::Random(rows,cols); @@ -36,12 +50,7 @@ template void eigensolver(const MatrixType& m) (ei0.pseudoEigenvectors().template cast()) * (ei0.eigenvalues().asDiagonal())); EigenSolver ei1(a); - VERIFY_IS_EQUAL(ei1.info(), Success); - VERIFY_IS_APPROX(a * ei1.pseudoEigenvectors(), ei1.pseudoEigenvectors() * ei1.pseudoEigenvalueMatrix()); - VERIFY_IS_APPROX(a.template cast() * ei1.eigenvectors(), - ei1.eigenvectors() * ei1.eigenvalues().asDiagonal()); - VERIFY_IS_APPROX(ei1.eigenvectors().colwise().norm(), RealVectorType::Ones(rows).transpose()); - VERIFY_IS_APPROX(a.eigenvalues(), ei1.eigenvalues()); + CALL_SUBTEST( check_eigensolver_for_given_mat(ei1,a) ); EigenSolver ei2; ei2.setMaxIterations(RealSchur::m_maxIterationsPerRow * rows).compute(a); @@ -100,6 +109,19 @@ template void eigensolver_verify_assert(const MatrixType& m VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors()); } + +template +Matrix +make_companion(const CoeffType& coeffs) +{ + Index n = coeffs.size()-1; + Matrix res(n,n); + res.setZero(); + res.row(0) = -coeffs.tail(n) / coeffs(0); + res.diagonal(-1).setOnes(); + return res; +} + template void eigensolver_generic_extra() { @@ -126,6 +148,42 @@ void eigensolver_generic_extra() VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.); VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.); } + + // regression test for bug 933 + { + { + VectorXd coeffs(5); coeffs << 1, -3, -175, -225, 2250; + MatrixXd C = make_companion(coeffs); + EigenSolver eig(C); + CALL_SUBTEST( check_eigensolver_for_given_mat(eig,C) ); + } + { + // this test is tricky because it requires high accuracy in smallest eigenvalues + VectorXd coeffs(5); coeffs << 6.154671e-15, -1.003870e-10, -9.819570e-01, 3.995715e+03, 
2.211511e+08; + MatrixXd C = make_companion(coeffs); + EigenSolver eig(C); + CALL_SUBTEST( check_eigensolver_for_given_mat(eig,C) ); + Index n = C.rows(); + for(Index i=0;i Complex; + MatrixXcd ac = C.cast(); + ac.diagonal().array() -= eig.eigenvalues()(i); + VectorXd sv = ac.jacobiSvd().singularValues(); + // comparing to sv(0) is not enough here to catch the "bug", + // the hard-coded 1.0 is important! + VERIFY_IS_MUCH_SMALLER_THAN(sv(n-1), 1.0); + } + } + } + // regression test for bug 1557 + { + // this test is interesting because it contains zeros on the diagonal. + MatrixXd A_bug1557(3,3); + A_bug1557 << 0, 0, 0, 1, 0, 0.5887907064808635127, 0, 1, 0; + EigenSolver eig(A_bug1557); + CALL_SUBTEST( check_eigensolver_for_given_mat(eig,A_bug1557) ); + } } EIGEN_DECLARE_TEST(eigensolver_generic) From cfc70dc13f5842d0a20c464427af6776f0a4031e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 12 Dec 2018 18:03:31 +0100 Subject: [PATCH 079/295] Add regression test for bug #1174 --- test/eigensolver_generic.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index 086ecdf5e..cfb31e49e 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -184,6 +184,26 @@ void eigensolver_generic_extra() EigenSolver eig(A_bug1557); CALL_SUBTEST( check_eigensolver_for_given_mat(eig,A_bug1557) ); } + + // regression test for bug 1174 + { + Index n = 12; + MatrixXf A_bug1174(n,n); + A_bug1174 << 262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432, + 262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432, + 262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432, + 262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432, + 0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0, + 0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0, + 0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0, + 0, 262144, 262144, 0, 
0, 262144, 262144, 262144, 262144, 262144, 262144, 0, + 0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0, + 0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0, + 0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0, + 0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0; + EigenSolver eig(A_bug1174); + CALL_SUBTEST( check_eigensolver_for_given_mat(eig,A_bug1174) ); + } } EIGEN_DECLARE_TEST(eigensolver_generic) From f582ea357997cf9a18927f479de7e21fb96b3b5a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 12 Dec 2018 22:47:00 +0100 Subject: [PATCH 080/295] Fix compilation with expression template scalar type. --- Eigen/src/Eigenvalues/RealSchur.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 8dbd9e314..7304ef344 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -309,8 +309,8 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Scalar norm = computeNormOfT(); // sub-diagonal entries smaller than considerAsZero will be treated as zero. // We use eps^2 to enable more precision in small eigenvalues. 
- Scalar considerAsZero = numext::maxi( norm * numext::abs2(NumTraits::epsilon()), - (std::numeric_limits::min)() ); + Scalar considerAsZero = numext::maxi( norm * numext::abs2(NumTraits::epsilon()), + (std::numeric_limits::min)() ); if(norm!=Scalar(0)) { @@ -381,7 +381,7 @@ inline Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - s = numext::maxi(s * NumTraits::epsilon(), considerAsZero); + s = numext::maxi(s * NumTraits::epsilon(), considerAsZero); if (abs(m_matT.coeff(res,res-1)) <= s) break; From dd6d65898a9826bb07556f2c788e6c0757d27603 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 12 Dec 2018 14:45:31 -0800 Subject: [PATCH 081/295] Fix shorten-64-to-32 warning. Use regular memcpy if num_threads==0. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 3b87b114d..e03735611 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -87,13 +87,13 @@ struct ThreadPoolDevice { const size_t kMinBlockSize = 32768; typedef TensorCostModel CostModel; const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); - if (n <= kMinBlockSize || num_threads == 1) { + if (n <= kMinBlockSize || num_threads < 2) { ::memcpy(dst, src, n); } else { const char* src_ptr = static_cast(src); char* dst_ptr = static_cast(dst); const size_t blocksize = (n + (num_threads - 1)) / num_threads; - Barrier barrier(num_threads - 1); + Barrier barrier(static_cast(num_threads - 1)); // Launch the last 3 blocks on worker threads. 
for (size_t i = 1; i < num_threads; ++i) { enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { From f20c991679a6860d9a91ced2f40b3c591da284a6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 13 Dec 2018 10:33:29 +0100 Subject: [PATCH 082/295] add changesets related to matrix product perf. --- bench/perf_monitoring/changesets.txt | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/bench/perf_monitoring/changesets.txt b/bench/perf_monitoring/changesets.txt index 960699c04..c970386c3 100644 --- a/bench/perf_monitoring/changesets.txt +++ b/bench/perf_monitoring/changesets.txt @@ -57,15 +57,31 @@ before-evaluators 8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores. 9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators 9174:d228bc282ac9 # merge +9175:abc7a3600098 # Include the cost of stores in unrolling 9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955 9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775 9361:69d418c06999 # 3.3-beta2 +9445:f27ff0ad77a3 # Optimize expression matching "d?=a-b*c" as "d?=a; d?=b*c;" 9583:bef509908b9d # 3.3-rc1 +9593:2f24280cf59a # Bug 1311: fix alignment logic in some cases of (scalar*small).lazyProduct(small) +9722:040d861b88b5 # Disabled part of the matrix matrix peeling code that's incompatible with 512 bit registers 9792:26667be4f70b # 3.3.0 +9891:41260bdfc23b # Fix a performance regression in (mat*mat)*vec for which mat*mat was evaluated multiple times. 9942:b1d3eba60130 # Operators += and -= do not resize! 9943:79bb9887afd4 # Ease compiler job to generate clean and efficient code in mat*vec 9946:2213991340ea # Complete rewrite of column-major-matrix * vector product to deliver higher performance of modern CPU. 9955:630471c3298c # Improve performance of row-major-dense-matrix * vector products for recent CPUs. 
(this is the next changeset fixing a typo) 9975:2eeed9de710c # Revert vec/y to vec*(1/y) in row-major TRSM - +10442:e3f17da72a40 # Bug 1435: fix aliasing issue in exressions like: A = C - B*A; +10735:6913f0cf7d06 # Adds missing EIGEN_STRONG_INLINE to support MSVC properly inlining small vector calculations +10943:4db388d946bd # Bug 1562: optimize evaluation of small products of the form s*A*B by rewriting them as: s*(A.lazyProduct(B)) to save a costly temporary. Measured speedup from 2x to 5x. +10961:5007ff66c9f6 # Introduce the macro ei_declare_local_nested_eval to help allocating on the stack local temporaries via alloca, and let outer-products makes a good use of it. +11083:30a528a984bb # Bug 1578: Improve prefetching in matrix multiplication on MIPS. +11533:71609c41e9f8 # PR 526: Speed up multiplication of small, dynamically sized matrices +11535:6d348dc9b092 # Vectorize row-by-row gebp loop iterations on 16 packets as well +11568:efda481cbd7a # Bug 1624: improve matrix-matrix product on ARM 64, 20% speedup +11596:b8d3f548a9d9 # do not read buffers out of bounds +11628:22f9cc0079bd # Implement AVX512 vectorization of std::complex +11638:81172653b67b # Bug 1515: disable gebp's 3pX4 micro kernel for MSVC<=19.14 because of register spilling. +11659:b500fef42ced # Artificially increase l1-blocking size for AVX512. +10% speedup with current kernels. From efa4c9c40fbed9506aaef6a5393d27713f161984 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 13 Dec 2018 10:42:39 +0100 Subject: [PATCH 083/295] bug #1615: slightly increase the default unrolling limit to compensate for changeset 101ea26f5e18919972b321b5f7e3ef4e07be3fd6 . This solves a performance regression with clang and 3x3 matrix products. 
--- Eigen/src/Core/arch/Default/Settings.h | 2 +- doc/PreprocessorDirectives.dox | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h index 097373c84..a5c3ada4c 100644 --- a/Eigen/src/Core/arch/Default/Settings.h +++ b/Eigen/src/Core/arch/Default/Settings.h @@ -21,7 +21,7 @@ * it does not correspond to the number of iterations or the number of instructions */ #ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 100 +#define EIGEN_UNROLLING_LIMIT 110 #endif /** Defines the threshold between a "small" and a "large" matrix. diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index 7e9e30c4b..ffd2c660c 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -117,7 +117,7 @@ run time. However, these assertions do cost time and can thus be turned off. Define it to 0 to disable. - \b \c EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not - correspond to the number of iterations or the number of instructions. The default is value 100. + correspond to the number of iterations or the number of instructions. The default is value 110. - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB. 
From e763fcd09e620300226ca22d152b94867123b603 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Chaves Date: Wed, 19 Dec 2018 14:24:44 -0800 Subject: [PATCH 084/295] Introducing "vectorized" byte on unpacket_traits structs This is a preparation to a change on gebp_traits, where a new template argument will be introduced to dictate the packet size, so it won't be bound to the current/max packet size only anymore. By having packet types defined early on gebp_traits, one has now to act on packet types, not scalars anymore, for the enum values defined on that class. One approach for reaching the vectorizable/size properties one needs there could be getting the packet's scalar again with unpacket_traits<>, then the size/Vectorizable enum entries from packet_traits<>. It turns out guards like "#ifndef EIGEN_VECTORIZE_AVX512" at AVX/PacketMath.h will hide smaller packet variations of packet_traits<> for some types (and it makes sense to keep that). In other words, one can't go back to the scalar and create a new PacketType, as this will always lead to the maximum packet type for the architecture. The less costly/invasive solution for that, thus, is to add the vectorizable info on every unpacket_traits struct as well. 
--- Eigen/src/Core/arch/AVX/Complex.h | 4 ++-- Eigen/src/Core/arch/AVX/PacketMath.h | 6 +++--- Eigen/src/Core/arch/AVX512/Complex.h | 6 ++++-- Eigen/src/Core/arch/AVX512/PacketMath.h | 6 +++--- Eigen/src/Core/arch/AltiVec/Complex.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 6 +++--- Eigen/src/Core/arch/GPU/PacketMath.h | 4 ++-- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 8 ++++---- Eigen/src/Core/arch/MSA/Complex.h | 4 ++-- Eigen/src/Core/arch/MSA/PacketMath.h | 6 +++--- Eigen/src/Core/arch/NEON/Complex.h | 4 ++-- Eigen/src/Core/arch/NEON/PacketMath.h | 6 +++--- Eigen/src/Core/arch/SSE/Complex.h | 4 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +++--- Eigen/src/Core/arch/SYCL/InteropHeaders.h | 2 +- Eigen/src/Core/arch/ZVector/Complex.h | 4 ++-- Eigen/src/Core/arch/ZVector/PacketMath.h | 6 +++--- Eigen/src/Core/util/XprHelper.h | 3 ++- 18 files changed, 46 insertions(+), 43 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 08d021b65..e7e2a1033 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -47,7 +47,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -255,7 +255,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef 
std::complex type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e5aeb6375..e771c0f25 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -117,14 +117,14 @@ template<> struct unpacket_traits { typedef float type; typedef Packet4f half; typedef Packet8i integer_packet; - enum {size=8, alignment=Aligned32}; + enum {size=8, alignment=Aligned32, vectorizable=true}; }; template<> struct unpacket_traits { typedef double type; typedef Packet2d half; - enum {size=4, alignment=Aligned32}; + enum {size=4, alignment=Aligned32, vectorizable=true}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 247f89860..569ee01ff 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -50,7 +50,8 @@ template<> struct unpacket_traits { typedef std::complex type; enum { size = 8, - alignment=unpacket_traits::alignment + alignment=unpacket_traits::alignment, + vectorizable=true }; typedef Packet4cf half; }; @@ -237,7 +238,8 @@ template<> struct unpacket_traits { typedef std::complex type; enum { size = 4, - alignment = 
unpacket_traits::alignment + alignment = unpacket_traits::alignment, + vectorizable=true }; typedef Packet2cd half; }; diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 10284dd7c..9c3121062 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -102,19 +102,19 @@ struct unpacket_traits { typedef float type; typedef Packet8f half; typedef Packet16i integer_packet; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=true }; }; template <> struct unpacket_traits { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64 }; + enum { size = 8, alignment=Aligned64, vectorizable=true }; }; template <> struct unpacket_traits { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=false }; }; template <> diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 3e665730c..5404a624e 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -60,7 +60,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -286,7 +286,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd 
pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index d0ee93f4a..2c06003ed 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -192,13 +192,13 @@ template<> struct unpacket_traits typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=false}; }; inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) @@ -916,7 +916,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index ddf37b9c1..eaba60e26 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -90,8 +90,8 @@ template<> struct packet_traits : default_packet_traits }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; }; +template<> 
struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { return make_float4(from, from, from, from); diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index f3d721dd7..cc5c484b6 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -41,7 +41,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { @@ -517,7 +517,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; }; template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { Packet16h result; @@ -984,7 +984,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; }; template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { Packet8h result; @@ -1329,7 +1329,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; +template<> struct 
unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; }; template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { Packet4h result; diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 9a45cf51e..fa64d3564 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -127,7 +127,7 @@ struct packet_traits > : default_packet_traits { template <> struct unpacket_traits { typedef std::complex type; - enum { size = 2, alignment = Aligned16 }; + enum { size = 2, alignment = Aligned16, vectorizable=true }; typedef Packet2cf half; }; @@ -500,7 +500,7 @@ struct packet_traits > : default_packet_traits { template <> struct unpacket_traits { typedef std::complex type; - enum { size = 1, alignment = Aligned16 }; + enum { size = 1, alignment = Aligned16, vectorizable=true }; typedef Packet1cd half; }; diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index 094c874ee..a97156a84 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -117,14 +117,14 @@ struct packet_traits : default_packet_traits { template <> struct unpacket_traits { typedef float type; - enum { size = 4, alignment = Aligned16 }; + enum { size = 4, alignment = Aligned16, vectorizable=true }; typedef Packet4f half; }; template <> struct unpacket_traits { typedef int32_t type; - enum { size = 4, alignment = Aligned16 }; + enum { size = 4, alignment = Aligned16, vectorizable=true }; typedef Packet4i half; }; @@ -925,7 +925,7 @@ struct packet_traits : default_packet_traits { template <> struct unpacket_traits { typedef double type; - enum { size = 2, alignment = Aligned16 }; + enum { size = 2, alignment = Aligned16, vectorizable=true }; typedef Packet2d half; }; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 306a309be..5e6de1f40 100644 --- 
a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -62,7 +62,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -328,7 +328,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ed3cec88a..ca4f2bf94 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -145,13 +145,13 @@ template<> struct unpacket_traits typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef int32_t type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } @@ -650,7 +650,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits 
{ typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 0f8960328..911fe066e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -50,7 +50,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -280,7 +280,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3e7a75bc0..4c7dc5b64 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -166,17 +166,17 @@ template<> 
struct unpacket_traits { typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef double type; typedef Packet2d half; - enum {size=2, alignment=Aligned16}; + enum {size=2, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=false}; }; #ifndef EIGEN_VECTORIZE_AVX diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h index c1da40d14..294cb101a 100644 --- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2) #define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\ template<> struct unpacket_traits {\ typedef unpacket_type type;\ - enum {size=lengths, alignment=Aligned16};\ + enum {size=lengths, alignment=Aligned16, vectorizable=true};\ typedef packet_type half;\ }; SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 95aba428f..167c3ee4c 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,8 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, 
vectorizable=true}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 0b37f4992..c8e90f1a8 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -239,9 +239,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 836ff4711..91c2e42e4 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -184,7 +184,8 @@ template struct unpacket_traits enum { size = 1, - alignment = 1 + alignment = 1, + vectorizable = false }; }; From 1024a70e82c0301d9f699fd344613e9cd417ab95 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Chaves Date: Fri, 21 Dec 2018 11:03:18 -0800 Subject: [PATCH 085/295] =?UTF-8?q?gebp:=20Add=20new=20=C2=BD=20and=20?= =?UTF-8?q?=C2=BC=20packet=20rows=20per=20(peeling)=20round=20on=20the=20l?= =?UTF-8?q?hs=20MIME-Version:=201.0=20Content-Type:=20text/plain;=20charse?= 
=?UTF-8?q?t=3DUTF-8=20Content-Transfer-Encoding:=208bit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch works by altering the gebp lhs packing routines to also consider ½ and ¼ packet length rows when packing, besides the original whole packet and row-by-row attempts. Finally, gebp itself will try to fit a fraction of a packet at a time if: i) ½ and/or ¼ packets are available for the current context (e.g. AVX2 and SSE-sized SIMD registers for x86) ii) The matrix's height is favorable to it (it may be it's too small in that dimension to take full advantage of the current/maximum packet width or it may be the case that last rows may take advantage of smaller packets before gebp goes row-by-row) This helps mitigate huge slowdowns one had on AVX512 builds when compared to AVX2 ones, for some dimensions. Gains top out at an extra 1x in throughput. This patch is a complement to changeset 4ad359237aeb519dbd4b55eba43057b37988838c. Since packing is changed, Eigen users who rely on very low-level API usage, like TensorFlow, will have to be adapted to work fine with the changes. 
--- .../Core/products/GeneralBlockPanelKernel.h | 689 ++++++++++++------ .../Core/products/SelfadjointMatrixMatrix.h | 23 +- 2 files changed, 468 insertions(+), 244 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 968cec78b..afbd83eda 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template +enum PacketSizeType { + PacketFull = 0, + PacketHalf, + PacketQuarter +}; + +template class gebp_traits; @@ -347,6 +353,43 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif +template +struct packet_conditional { typedef T3 type; }; + +template +struct packet_conditional { typedef T1 type; }; + +template +struct packet_conditional { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + ScalarPacket + /* Vectorization logic * real*real: unpack rhs to constant packets, ... 
* @@ -357,7 +400,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -365,13 +408,17 @@ public: typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -395,9 +442,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; @@ -473,21 +517,25 @@ public: }; -template -class gebp_traits, RealScalar, _ConjLhs, false, Arch> +template +class gebp_traits, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -502,10 +550,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; @@ -672,8 +716,8 @@ template struct unpacket_traits > { // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs,Arch> +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -681,15 +725,21 @@ public: typedef std::complex RhsScalar; typedef std::complex ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - RealPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits::vectorizable + && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? 
unpacket_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -699,8 +749,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type RealPacket; - typedef typename packet_traits::type ScalarPacket; typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket4Packing; @@ -824,8 +872,8 @@ protected: conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs, Arch> +template +class gebp_traits, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -833,14 +881,25 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -851,10 +910,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; @@ -980,26 +1035,44 @@ struct gebp_traits template struct gebp_kernel { - typedef gebp_traits Traits; + typedef gebp_traits Traits; + typedef gebp_traits HalfTraits; + typedef gebp_traits QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - typedef gebp_traits SwappedTraits; + typedef gebp_traits SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter = QuarterTraits::LhsProgress, RhsProgress = 
Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -1010,11 +1083,11 @@ struct gebp_kernel }; template::LhsProgress> +int SwappedLhsProgress = gebp_traits::LhsProgress> struct last_row_process_16_packets { - typedef gebp_traits Traits; - typedef gebp_traits SwappedTraits; + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -1042,8 +1115,8 @@ struct last_row_process_16_packets template struct last_row_process_16_packets { - typedef gebp_traits Traits; - typedef gebp_traits SwappedTraits; + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -1089,6 +1162,195 @@ struct last_row_process_16_packets +struct lhs_process_one_packet +{ + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, Index prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, 
Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i(alpha); + + R0 = r0.template loadPacket(0); + R1 = r1.template loadPacket(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket(0); + R1 = r3.template loadPacket(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2(alpha); + R0 = r0.template loadPacket(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template +struct lhs_process_fraction_of_packet : lhs_process_one_packet +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template EIGEN_DONT_INLINE void gebp_kernel @@ -1105,7 +1367,9 @@ void gebp_kernel=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? 
(rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); const Index prefetch_res_offset = 32/sizeof(ResScalar); @@ -1559,174 +1823,29 @@ void gebp_kernel=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i(alpha); - - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - R1 = r1.template loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.template loadPacket(0 * Traits::ResPacketSize); - R1 = r3.template loadPacket(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2(alpha); - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, 
alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1 ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1928,9 +2053,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? 
(rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -1989,20 +2117,60 @@ EIGEN_DONT_INLINE void gemm_pack_lhs1) + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) { - for(; i(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); } } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). 
+ if(Pack21) + { + for(; i ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -2031,37 +2205,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? 
(rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). 
+ if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i::size }; + typedef typename unpacket_traits::type>::half HalfPacket; + typedef typename unpacket_traits::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i Date: Sat, 22 Dec 2018 13:09:07 +0100 Subject: [PATCH 086/295] Make code compile again for older compilers. 
See https://stackoverflow.com/questions/7411515/ --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 968cec78b..d6dd9dc17 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -589,7 +589,7 @@ public: template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper cj; + conj_helper cj; r = cj.pmadd(c,alpha,r); } @@ -927,7 +927,7 @@ public: template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper cj; + conj_helper cj; r = cj.pmadd(alpha,c,r); } From 5713fb7febf24140bfe748d8b868391f01828992 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 23 Dec 2018 15:40:52 +0100 Subject: [PATCH 087/295] Fix plog(+INF): it returned ~87 instead of +INF --- .../arch/Default/GenericPacketMathFunctions.h | 15 ++++++++++----- test/packetmath.cpp | 7 ++++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 9481850c6..83fed95de 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -54,6 +54,7 @@ Packet plog_float(const Packet _x) // The smallest non denormalized float number. const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); const Packet cst_minus_inf = pset1frombits( 0xff800000u); + const Packet cst_pos_inf = pset1frombits( 0x7f800000u); // Polynomial coefficients. 
const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); @@ -69,9 +70,6 @@ Packet plog_float(const Packet _x) const Packet cst_cephes_log_q1 = pset1(-2.12194440e-4f); const Packet cst_cephes_log_q2 = pset1(0.693359375f); - Packet invalid_mask = pcmp_lt_or_nan(x, pzero(x)); - Packet iszero_mask = pcmp_eq(x,pzero(x)); - // Truncate input values to the minimum positive normal. x = pmax(x, cst_min_norm_pos); @@ -117,8 +115,15 @@ Packet plog_float(const Packet _x) x = padd(x, y); x = padd(x, y2); - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask)); + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); } // Exponential function. 
Works by writing "x = m*log(2) + r" where diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 916b37bef..7e46b01de 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -520,10 +520,11 @@ template void packetmath_real() CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); #endif - if(PacketTraits::HasLog && PacketSize>=2) + if(PacketSize>=2) { data1[0] = std::numeric_limits::quiet_NaN(); data1[1] = std::numeric_limits::epsilon(); + if(PacketTraits::HasLog) { packet_helper h; h.store(data2, internal::plog(h.load(data1))); @@ -551,6 +552,10 @@ template void packetmath_real() data1[0] = Scalar(-1.0f); h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isnan)(data2[0])); + + data1[0] = std::numeric_limits::infinity(); + h.store(data2, internal::plog(h.load(data1))); + VERIFY((numext::isinf)(data2[0])); } { packet_helper h; From 38d704def8b6799b14c319d6a67c671374daccc3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 23 Dec 2018 16:13:24 +0100 Subject: [PATCH 088/295] Make sure that psin/pcos return number in [-1,1] for large inputs (though sin/cos on large entries is quite useless because it's inaccurate) --- .../arch/Default/GenericPacketMathFunctions.h | 6 ++++++ test/packetmath.cpp | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 83fed95de..80bcc077d 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -289,9 +289,15 @@ Packet psincos_float(const Packet& _x) const Packet cst_coscof_p1 = pset1(-1.388731625493765E-003f); const Packet cst_coscof_p2 = pset1( 4.166664568298827E-002f); const Packet cst_cephes_FOPI = pset1( 1.27323954473516f); // 4 / M_PI + const Packet cst_sincos_max_arg = pset1( 13176795.0f); // Approx. (2**24) / (4/Pi). 
Packet x = pabs(_x); + // Prevent sin/cos from generating values larger than 1.0 in magnitude + // for very large arguments by setting x to 0.0. + Packet small_or_nan_mask = pcmp_lt_or_nan(x, cst_sincos_max_arg); + x = pand(x, small_or_nan_mask); + // Scale x by 4/Pi to find x's octant. Packet y = pmul(x, cst_cephes_FOPI); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 7e46b01de..bf0312a73 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -564,6 +564,22 @@ template void packetmath_real() VERIFY((numext::isnan)(data2[0])); VERIFY((numext::isnan)(data2[1])); } + if(PacketTraits::HasCos) + { + packet_helper h; + for(Scalar k = 1; k::epsilon(); k*=2) { + data1[0] = k*Scalar(EIGEN_PI) * internal::random(0.8,1.2); + data1[1] = (k+1)*Scalar(EIGEN_PI) * internal::random(0.8,1.2); + h.store(data2, internal::pcos(h.load(data1))); + VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); + VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.)); + data1[0] = (2*k+1)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); + data1[1] = (2*k+3)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); + h.store(data2, internal::psin(h.load(data1))); + VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); + VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.)); + } + } } } From 0f6f75bd8a0445edc3361659e065f15a29e2743c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 23 Dec 2018 17:26:21 +0100 Subject: [PATCH 089/295] Implement a faster fix for sin/cos of large entries that also correctly handle INF input. 
--- .../arch/Default/GenericPacketMathFunctions.h | 13 +++++++------ test/packetmath.cpp | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 80bcc077d..7ceaea894 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -289,15 +289,9 @@ Packet psincos_float(const Packet& _x) const Packet cst_coscof_p1 = pset1(-1.388731625493765E-003f); const Packet cst_coscof_p2 = pset1( 4.166664568298827E-002f); const Packet cst_cephes_FOPI = pset1( 1.27323954473516f); // 4 / M_PI - const Packet cst_sincos_max_arg = pset1( 13176795.0f); // Approx. (2**24) / (4/Pi). Packet x = pabs(_x); - // Prevent sin/cos from generating values larger than 1.0 in magnitude - // for very large arguments by setting x to 0.0. - Packet small_or_nan_mask = pcmp_lt_or_nan(x, cst_sincos_max_arg); - x = pand(x, small_or_nan_mask); - // Scale x by 4/Pi to find x's octant. Packet y = pmul(x, cst_cephes_FOPI); @@ -348,6 +342,13 @@ Packet psincos_float(const Packet& _x) y = ComputeSine ? pselect(poly_mask,y2,y1) : pselect(poly_mask,y1,y2); + // For very large arguments the the reduction to the [-Pi/4,+Pi/4] range + // does not work thus leading to sine/cosine out of the [-1:1] range. + // Since computing the sine/cosine for very large entry entries makes little + // sense in term of accuracy, we simply clamp to [-1,1]: + y = pmin(y,pset1( 1.f)); + y = pmax(y,pset1(-1.f)); + // Update the sign return pxor(y, sign_bit); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index bf0312a73..9f647530b 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -579,6 +579,22 @@ template void packetmath_real() VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); VERIFY(data2[1]<=Scalar(1.) 
&& data2[1]>=Scalar(-1.)); } + + data1[0] = std::numeric_limits::infinity(); + data1[1] = -std::numeric_limits::infinity(); + h.store(data2, internal::psin(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + VERIFY((numext::isnan)(data2[1])); + + h.store(data2, internal::pcos(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + VERIFY((numext::isnan)(data2[1])); + + data1[0] = std::numeric_limits::quiet_NaN(); + h.store(data2, internal::psin(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + h.store(data2, internal::pcos(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); } } } From 961ff567e8679357068550ebc6a5b8cda004c319 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 23 Dec 2018 22:13:29 +0100 Subject: [PATCH 090/295] Add missing pcmp_lt_or_nan for AVX512 --- Eigen/src/Core/arch/AVX512/PacketMath.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 10284dd7c..3c673477f 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -262,6 +262,26 @@ EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, return _mm512_max_pd(b, a); } +#ifdef EIGEN_VECTORIZE_AVX512DQ +template Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I); } +Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } +#else +// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 +template Packet8f extract256(Packet16f x) { + return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I)); +} +Packet16f cat256(Packet8f a, Packet8f b) { + return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), + _mm256_castps_si256(b),1)); +} +#endif + +Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { + __m256 lo = _mm256_cmp_ps(extract256<0>(a), extract256<0>(b), _CMP_NGE_UQ); + __m256 hi 
= _mm256_cmp_ps(extract256<1>(a), extract256<1>(b), _CMP_NGE_UQ); + return cat256(lo, hi); +} + template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); From 4aa667b510f06e2e209323e2419318b7bc1354db Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 24 Dec 2018 10:45:01 +0100 Subject: [PATCH 091/295] Add EIGEN_STRONG_INLINE where required --- Eigen/src/Core/arch/AVX512/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 3c673477f..13cb108eb 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -263,14 +263,14 @@ EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, } #ifdef EIGEN_VECTORIZE_AVX512DQ -template Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I); } -Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I); } +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } #else // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 -template Packet8f extract256(Packet16f x) { +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I)); } -Packet16f cat256(Packet8f a, Packet8f b) { +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b),1)); } From 60d3fe9a89f5eeaf1e118fdeb0c6281d63bf86a4 Mon Sep 17 
00:00:00 2001 From: Gael Guennebaud Date: Mon, 24 Dec 2018 13:05:03 +0100 Subject: [PATCH 092/295] One more stupid AVX 512 fix (I don't have direct access to AVX512 machines) --- Eigen/src/Core/arch/AVX512/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 13cb108eb..b1cbef9f1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -276,7 +276,7 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { } #endif -Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { __m256 lo = _mm256_cmp_ps(extract256<0>(a), extract256<0>(b), _CMP_NGE_UQ); __m256 hi = _mm256_cmp_ps(extract256<1>(a), extract256<1>(b), _CMP_NGE_UQ); return cat256(lo, hi); From 697fba3bb0f09ecef77a23703df68956880ec7dd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 27 Dec 2018 11:20:47 +0100 Subject: [PATCH 093/295] Fix unit test --- test/packetmath.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9f647530b..1158c4f9a 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -560,6 +560,7 @@ template void packetmath_real() { packet_helper h; data1[0] = Scalar(-1.0f); + data1[1] = -std::numeric_limits::denorm_min(); h.store(data2, internal::psqrt(h.load(data1))); VERIFY((numext::isnan)(data2[0])); VERIFY((numext::isnan)(data2[1])); From bc5dd4cafd5c4e29b6bf1cd3bf532bac407248bb Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Thu, 3 Jan 2019 14:33:04 +0100 Subject: [PATCH 094/295] PR560: Fix the AVX512f only builds Commit c53eececb0415834b961cb61cd466907261b4b2f introduced AVX512 support for complex numbers but required avx512dq to build. Commit 1d683ae2f5a340a6e2681c8cd0782f4db6b807ea fixed some but not, it would seem all, of the hard avx512dq dependencies. 
Build failures are still evident on Eigen and TensorFlow when compiling with just avx512f and no avx512dq using gcc 7.3. Looking at the code there does indeed seem to be a problem. Commit c53eececb0415834b961cb61cd466907261b4b2f calls avx512dq intrinsics directly, e.g, _mm512_extractf32x8_ps and _mm512_and_ps. This commit fixes the issue by replacing the direct intrinsic calls with the various wrapper functions that are safe to use on avx512f only builds. --- Eigen/src/Core/arch/AVX512/Complex.h | 32 ++++++++++++------------- Eigen/src/Core/arch/AVX512/PacketMath.h | 7 ++++++ 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 247f89860..42cdfcd25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -66,7 +66,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32( 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000, 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet8cf(_mm512_xor_ps(a.v,mask)); + return Packet8cf(pxor(a.v,mask)); } template<> EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) @@ -75,10 +75,10 @@ template<> EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, con return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2)); } -template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_and_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_or_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { 
return Packet8cf(_mm512_andnot_ps(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } @@ -124,20 +124,20 @@ template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { template<> EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) { - return predux(padd(Packet4cf(_mm512_extractf32x8_ps(a.v,0)), - Packet4cf(_mm512_extractf32x8_ps(a.v,1)))); + return predux(padd(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) { - return predux_mul(pmul(Packet4cf(_mm512_extractf32x8_ps(a.v, 0)), - Packet4cf(_mm512_extractf32x8_ps(a.v, 1)))); + return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); } template <> EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) { - __m256 lane0 = _mm512_extractf32x8_ps(a.v, 0); - __m256 lane1 = _mm512_extractf32x8_ps(a.v, 1); + __m256 lane0 = extract256<0>(a.v); + __m256 lane1 = extract256<1>(a.v); __m256 res = _mm256_add_ps(lane0, lane1); return Packet4cf(res); } @@ -262,10 +262,10 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, con return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); } -template<> 
EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_and_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_or_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } @@ -308,7 +308,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cd& a) { - __m128d low = _mm512_extractf64x2_pd(a.v, 0); + __m128d low = extract128<0>(a.v); EIGEN_ALIGN16 double res[2]; _mm_store_pd(res, low); return std::complex(res[0],res[1]); diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index b1cbef9f1..72b09d998 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -264,12 +264,19 @@ EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, #ifdef EIGEN_VECTORIZE_AVX512DQ template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I); } +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { 
return _mm512_extractf64x2_pd(x,I); } EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } #else // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I)); } + +// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512 +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { + return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I)); +} + EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b),1)); From 190d053e41ef8cb77e08e42a37b7e72f9c1d6d43 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 3 Jan 2019 14:55:28 -0800 Subject: [PATCH 095/295] Explicitly set fill character when printing aligned data to ostream --- Eigen/src/Core/IO.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index da7fd6cce..063511f24 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& * - \b rowSuffix string printed at the end of each row * - \b matPrefix string printed at the beginning of the matrix * - \b matSuffix string printed at the end of the matrix + * - \b fill character printed to fill the empty space in aligned columns * * Example: \include IOFormat.cpp * Output: \verbinclude IOFormat.out @@ -53,9 +54,9 @@ struct IOFormat IOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _coeffSeparator = " ", const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="", - const std::string& _matPrefix="", const std::string& 
_matSuffix="") + const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ') : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator), - rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags) + rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags) { // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline // don't add rowSpacer if columns are not to be aligned @@ -71,6 +72,7 @@ struct IOFormat std::string matPrefix, matSuffix; std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer; std::string coeffSeparator; + char fill; int precision; int flags; }; @@ -176,18 +178,26 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& width = std::max(width, Index(sstr.str().length())); } } + std::streamsize old_width = s.width(); + char old_fill_character = s.fill(); s << fmt.matPrefix; for(Index i = 0; i < m.rows(); ++i) { if (i) s << fmt.rowSpacer; s << fmt.rowPrefix; - if(width) s.width(width); + if(width) { + s.fill(fmt.fill); + s.width(width); + } s << m.coeff(i, 0); for(Index j = 1; j < m.cols(); ++j) { s << fmt.coeffSeparator; - if (width) s.width(width); + if(width) { + s.fill(fmt.fill); + s.width(width); + } s << m.coeff(i, j); } s << fmt.rowSuffix; @@ -196,6 +206,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& } s << fmt.matSuffix; if(explicit_precision) s.precision(old_precision); + if(width) { + s.fill(old_fill_character); + s.width(old_width); + } return s; } From 055f0b73dbdd3b6e32ab10f8c3538b360124627f Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 7 Jan 2019 16:53:36 -0800 Subject: [PATCH 096/295] Add support for pcmp_eq and pnot, including for complex types. 
--- Eigen/src/Core/GenericPacketMath.h | 23 ++++++++++++++++++----- Eigen/src/Core/arch/AVX/Complex.h | 14 ++++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 1 + Eigen/src/Core/arch/AVX512/Complex.h | 18 ++++++++++++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 13 +++++++++++++ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 14 ++++++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 14 ++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 3 ++- test/packetmath.cpp | 9 +++++++++ 9 files changed, 103 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 2b2ee9e2c..883c35d2c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,6 +214,18 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + Packet ones = pset1(Scalar(1)); + return pandnot(ones, a); +} + /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int pshiftright(const int& a) { return a >> N; } @@ -258,7 +270,12 @@ pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_eq(const Packet& a, const Packet& b) +{ + typedef typename unpacket_traits::type Scalar; + Packet zeros = pset1(Scalar(0)); + return a==b ? 
pnot(zeros) : zeros; +} /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet @@ -272,10 +289,6 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ -template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } - /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 08d021b65..23687c624 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con return Packet4cf(result); } +template <> +EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { + __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); + __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); + return Packet4cf(real_and_imag_equal); +} + template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -276,6 +283,13 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con return Packet2cd(_mm256_addsub_pd(even, odd)); } +template <> +EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { + __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); + __m256d real_and_imag_equal = _mm256_and_pd(eq, 
_mm256_permute_pd(eq, 0x5)); + return Packet2cd(real_and_imag_equal); +} + template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e5aeb6375..27c35fbd9 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -228,6 +228,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 42cdfcd25..2c613f870 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -80,6 +80,15 @@ template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, 
const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); } +template <> +EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { + __m512 eq = pcmp_eq(a.v, b.v); + __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); + __m512i real_and_imag_equal = _mm512_and_si512( + _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag)); + return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); +} + template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } @@ -267,6 +276,15 @@ template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); } +template <> +EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { + __m512d eq = pcmp_eq(a.v, b.v); + __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); + __m512i real_and_imag_equal = _mm512_and_si512( + _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); + return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); +} + template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 72b09d998..710351ed0 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -295,6 +295,19 @@ template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packe 
return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); } +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); +} template <> EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index f3d721dd7..3e35f96cc 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -655,6 +655,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; } +template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pcmp_eq(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { // FIXME we could do that with bit manipulation Packet16f af = half2float(a); @@ -1093,6 +1100,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; } +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pcmp_eq(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { diff --git a/Eigen/src/Core/arch/SSE/Complex.h 
b/Eigen/src/Core/arch/SSE/Complex.h index 0f8960328..a7304193b 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -439,6 +439,20 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + __m128 eq = _mm_cmpeq_ps(a.v, b.v); + __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); + return Packet2cf(real_and_imag_equal); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) +{ + __m128d eq = _mm_cmpeq_pd(a.v, b.v); + __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); + return Packet1cd(real_and_imag_equal); +} + template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); return Packet2cf(_mm_castpd_ps(result)); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3e7a75bc0..71cf6b3bb 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -374,9 +374,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return 
_mm_cmpnge_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 1158c4f9a..3b700fdd9 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -238,6 +238,7 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul); CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); + CHECK_CWISE1(internal::pnot, internal::pnot); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); @@ -398,6 +399,14 @@ template void packetmath() CHECK_CWISE2_IF(true, internal::pand, internal::pand); CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot); } + + { + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data2[i] = (i % 2) ? 
data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq); + } } template void packetmath_real() From e70ffef9678f86ef465e93b89351e812ab47311d Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 8 Jan 2019 16:26:31 -0800 Subject: [PATCH 097/295] Optimize evalShardedByInnerDim --- .../src/Tensor/TensorContractionThreadPool.h | 197 ++++++++++++++---- 1 file changed, 161 insertions(+), 36 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 3946e2fc4..9666bf167 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -756,6 +756,36 @@ struct TensorEvaluator + EIGEN_STRONG_INLINE void addAllToBuffer(size_t n, const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) const { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + // Decide whether we want to shard m x k x n contraction over the inner // (contraction) dimension (k). 
static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, @@ -788,50 +818,145 @@ struct TensorEvaluatorm_i_size; const Index n = this->m_j_size; const Index k = this->m_k_size; - const Index packet_size = internal::packet_traits::size; - const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + + // We will compute partial results into the buffers of this size. + const Index buffer_size_bytes = m * n * sizeof(Scalar); + // The underlying GEMM kernel assumes that k is a multiple of // the packet size and subtle breakage occurs if this is violated. - Index block_size = kmultiple * divup(k, kmultiple * num_threads); - Index num_blocks = divup(k, block_size); - // we use 'result' for the first block's partial result. - MaxSizeVector block_buffers(num_blocks - 1); - Barrier barrier(internal::convert_index(num_blocks)); - auto process_block = [=, &barrier](Scalar* buf, Index begin, Index end) { - ::memset(buf, 0, m * n * sizeof(Scalar)); + const Index packet_size = internal::packet_traits::size; + + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + return divup(index, kmultiple) * kmultiple; + }; + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs and + // gemm_pack_rhs, so we specify minimum desired block size. + const Index target_block_size = round_up(divup(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + const Index block_size = numext::mini( + k, numext::maxi(desired_min_block_size, target_block_size)); + const Index num_blocks = divup(k, block_size); + + // Compute block size with accounting for potentially incomplete last block. + const auto actual_block_size = [=](Index block_idx) -> Index { + return block_idx + 1 < num_blocks + ? 
block_size + : k + block_size - block_size * num_blocks; + }; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks finihes, we need to add only these pre-aggregated blocks. + + // Compute range size with accounting for potentially incomplete last range. + const auto actual_range_size = [=](Index num_ranges, Index range_size, + Index range_idx) -> Index { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + const Index l0_ranges = divup(num_blocks, l0_size); + + // Keep count of pending gemm tasks for each l0 range. 
+ MaxSizeVector> l0_state(l0_ranges); + for (int i = 0; i < l0_ranges; ++i) { + l0_state.emplace_back(actual_range_size(l0_ranges, l0_size, i)); + } + + MaxSizeVector block_buffers(num_blocks); + + auto process_block = [&, this](Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + ::memset(buf, 0, buffer_size_bytes); + TENSOR_CONTRACTION_DISPATCH( this->template evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, this->m_device.numThreads())); - barrier.Notify(); + (buf, begin, end, /*num_threads=*/num_blocks)); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actual_range_size(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } }; - Index start = 0; - for (Index blocks_left = num_blocks; blocks_left > 0; --blocks_left) { - // The underlying GEMM kernel assumes that k is a multiple of packet size - // (currently largest packet size is 16) and subtle breakage occurs if - // this is violated. 
- block_size = kmultiple * divup(k - start, kmultiple * blocks_left); - Scalar* buf; - if (start == 0) { - buf = result; - } else { - buf = static_cast( - this->m_device.allocate(m * n * sizeof(Scalar))); - block_buffers.push_back(buf); - } - Index end = start + block_size; - if (end > k) { - end = k; - } - this->m_device.enqueueNoNotification( - [=, &process_block]() { process_block(buf, start, end); }); - start = end; + + Barrier barrier(internal::convert_index(num_blocks)); + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast( + this->m_device.allocate(buffer_size_bytes)); + block_buffers.push_back(buf); + + Index block_start = block_idx * block_size; + Index block_end = block_start + actual_block_size(block_idx); + + this->m_device.enqueueNoNotification([=, &barrier, &process_block]() { + process_block(block_idx, block_start, block_end); + barrier.Notify(); + }); } barrier.Wait(); - // Add other partial results into first partial result. - for (const auto& buf : block_buffers) { - addToBuffer(m * n, buf, result); - this->m_device.deallocate(buf); + // Aggregate partial sums from l0 ranges. + Index l0_index = 1; + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */block_buffers[0]); + } + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + + // Don't forget to deallocate ALL temporary buffers. + for (Index i = 1; i < num_blocks; ++i) { + this->m_device.deallocate(block_buffers[i]); } // Finally call output kernel with finalized output buffer. 
From e6b217b8ddf533de9bacc46aae2db6de78581056 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 15:25:17 +0100 Subject: [PATCH 098/295] bug #1652: implements a much more accurate version of vectorized sin/cos. This new version achieves the same speed for SSE/AVX, and is slightly faster with FMA. Guarantees are as follows: - no FMA: 1ULP up to 3pi, 2ULP up to sin(25966) and cos(18838), fallback to std::sin/cos for larger inputs - FMA: 1ULP up to sin(117435.992) and cos(71476.0625), fallback to std::sin/cos for larger inputs --- Eigen/src/Core/GenericPacketMath.h | 27 +++- Eigen/src/Core/arch/AVX/PacketMath.h | 10 ++ .../arch/Default/GenericPacketMathFunctions.h | 152 +++++++++++------- Eigen/src/Core/arch/SSE/PacketMath.h | 11 ++ test/packetmath.cpp | 56 +++++-- 5 files changed, 181 insertions(+), 75 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 2b2ee9e2c..9fdd4a2ed 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -393,18 +393,39 @@ typename conditional<(unpacket_traits::size%8)==0,typename unpacket_trai predux_half_dowto4(const Packet& a) { return a; } -/** \internal \returns the product of the elements of \a a*/ +/** \internal \returns the product of the elements of \a a */ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) { return a; } -/** \internal \returns the min of the elements of \a a*/ +/** \internal \returns the min of the elements of \a a */ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) { return a; } -/** \internal \returns the max of the elements of \a a*/ +/** \internal \returns the max of the elements of \a a */ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) { return a; } +/** \internal \returns true if all coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*.
+ */ +// not needed yet +// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) +// { return bool(a); } + +/** \internal \returns true if any coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) +{ + // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. + // It is expected that "true" is either: + // - Scalar(1) + // - bits full of ones (NaN for floats), + // - or first bit equals to 1 (1 for ints, smallest denormal for floats). + // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. + return bool(predux(a)); +} + /** \internal \returns the reversed elements of \a a*/ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e5aeb6375..ebea63757 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -575,6 +575,16 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) +// { +// return _mm256_movemask_ps(x)==0xFF; +// } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) +{ + return _mm256_movemask_ps(x)!=0; +} template struct palign_impl diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 7ceaea894..3c167247e 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -269,88 +269,118 @@ EIGEN_UNUSED Packet psincos_float(const Packet& _x) { typedef typename unpacket_traits::integer_packet PacketI; - const Packet cst_1 = pset1(1.0f); - const Packet cst_half = 
pset1(0.5f); - const PacketI csti_1 = pset1(1); - const PacketI csti_not1 = pset1(~1); - const PacketI csti_2 = pset1(2); - const PacketI csti_3 = pset1(3); - - const Packet cst_sign_mask = pset1frombits(0x80000000u); - - const Packet cst_minus_cephes_DP1 = pset1(-0.78515625f); - const Packet cst_minus_cephes_DP2 = pset1(-2.4187564849853515625e-4f); - const Packet cst_minus_cephes_DP3 = pset1(-3.77489497744594108e-8f); - const Packet cst_sincof_p0 = pset1(-1.9515295891E-4f); - const Packet cst_sincof_p1 = pset1( 8.3321608736E-3f); - const Packet cst_sincof_p2 = pset1(-1.6666654611E-1f); - const Packet cst_coscof_p0 = pset1( 2.443315711809948E-005f); - const Packet cst_coscof_p1 = pset1(-1.388731625493765E-003f); - const Packet cst_coscof_p2 = pset1( 4.166664568298827E-002f); - const Packet cst_cephes_FOPI = pset1( 1.27323954473516f); // 4 / M_PI + const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI + const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding + const PacketI csti_1 = pset1(1); + const Packet cst_sign_mask = pset1frombits(0x80000000u); Packet x = pabs(_x); - // Scale x by 4/Pi to find x's octant. - Packet y = pmul(x, cst_cephes_FOPI); + // Scale x by 2/Pi to find x's octant. + Packet y = pmul(x, cst_2oPI); - // Get the octant. We'll reduce x by this number of octants or by one more than it. - PacketI y_int = pcast(y); - // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. - // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. - // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). - PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...) - y = pcast(y_int1); + // Rounding trick: + Packet y_round = padd(y, cst_rounding_magic); + PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) + y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi // Compute the sign to apply to the polynomial. 
- // sign = third_bit(y_int1) xor signbit(_x) - Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<29>(y_int1))) - : preinterpret(pshiftleft<29>(padd(y_int1,csti_3))); + // sin: sign = second_bit(y_int) xor signbit(_x) + // cos: sign = second_bit(y_int+1) + Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<30>(y_int))) + : preinterpret(pshiftleft<30>(padd(y_int,csti_1))); sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit - // Get the polynomial selection mask from the second bit of y_int1 + // Get the polynomial selection mask from the second bit of y_int // We'll calculate both (sin and cos) polynomials and then select from the two. - Packet poly_mask = preinterpret(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1))); + Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. - // The magic pass: "Extended precision modular arithmetic" - // x = ((x - y * DP1) - y * DP2) - y * DP3 - x = pmadd(y, cst_minus_cephes_DP1, x); - x = pmadd(y, cst_minus_cephes_DP2, x); - x = pmadd(y, cst_minus_cephes_DP3, x); + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // using "Extended precision modular arithmetic" + #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) + // This version requires true FMA for high accuracy + // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): + const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; + x = pmadd(y, pset1(-1.57079601287841796875f), x); + x = pmadd(y, pset1(-3.1391647326017846353352069854736328125e-07f), x); + x = pmadd(y, pset1(-5.390302529957764765544681040410068817436695098876953125e-15f), x); + #else + // Without true FMA, the previous set of coefficients maintain 1ULP accuracy + // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7. + // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs. 
+ + // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively. + // and 2 ULP up to: + const float huge_th = ComputeSine ? 25966.f : 18838.f; + x = pmadd(y, pset1(-1.5703125), x); // = 0xbfc90000 + x = pmadd(y, pset1(-0.000483989715576171875), x); // = 0xb9fdc000 + x = pmadd(y, pset1(1.62865035235881805419921875e-07), x); // = 0x342ee000 + x = pmadd(y, pset1(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee + + // For the record, the following set of coefficients maintain 2ULP up + // to a slightly larger range: + // const float huge_th = ComputeSine ? 51981.f : 39086.125f; + // but it slightly fails to maintain 1ULP for two values of sin below pi. + // x = pmadd(y, pset1(-3.140625/2.), x); + // x = pmadd(y, pset1(-0.00048351287841796875), x); + // x = pmadd(y, pset1(-3.13855707645416259765625e-07), x); + // x = pmadd(y, pset1(-6.0771006282767103812147979624569416046142578125e-11), x); + + // For the record, with only 3 iterations it is possible to maintain + // 1 ULP up to 3PI (maybe more) and 2ULP up to 255. + // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee + #endif + + Packet huge_mask = pcmp_le(pset1(huge_th),pabs(_x)); + Packet huge_vals; + if(predux_any(huge_mask)) + { + const int PacketSize = unpacket_traits::size; + #if EIGEN_HAS_CXX11 + alignas(Packet) float vals[PacketSize]; + #else + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; + #endif + pstoreu(vals, _x); + for(int k=0; k=huge_th) { + vals[k] = ComputeSine ? std::sin(val) : std::cos(val); + } + } + huge_vals = ploadu(vals); + } Packet x2 = pmul(x,x); - // Evaluate the cos(x) polynomial. (0 <= x <= Pi/4) - Packet y1 = cst_coscof_p0; - y1 = pmadd(y1, x2, cst_coscof_p1); - y1 = pmadd(y1, x2, cst_coscof_p2); - y1 = pmul(y1, x2); - y1 = pmul(y1, x2); - y1 = psub(y1, pmul(x2, cst_half)); - y1 = padd(y1, cst_1); + // Evaluate the cos(x) polynomial. 
(-Pi/4 <= x <= Pi/4) + Packet y1 = pset1(2.4372266125283204019069671630859375e-05f); + y1 = pmadd(y1, x2, pset1(-0.00138865201734006404876708984375f )); + y1 = pmadd(y1, x2, pset1(0.041666619479656219482421875f )); + y1 = pmadd(y1, x2, pset1(-0.5f)); + y1 = pmadd(y1, x2, pset1(1.f)); - // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0) - Packet y2 = cst_sincof_p0; - y2 = pmadd(y2, x2, cst_sincof_p1); - y2 = pmadd(y2, x2, cst_sincof_p2); + // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) + // octave/matlab code to compute those coefficients: + // x = (0:0.0001:pi/4)'; + // A = [x.^3 x.^5 x.^7]; + // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy + // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 + // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) + // + Packet y2 = pset1(-0.0001959234114083702898469196984621021329076029360294342041015625f); + y2 = pmadd(y2, x2, pset1( 0.0083326873655616851693794799871284340042620897293090820312500000f)); + y2 = pmadd(y2, x2, pset1(-0.1666666203982298255503735617821803316473960876464843750000000000f)); y2 = pmul(y2, x2); y2 = pmadd(y2, x, x); - // Select the correct result from the two polynoms. + // Select the correct result from the two polynomials. y = ComputeSine ? pselect(poly_mask,y2,y1) : pselect(poly_mask,y1,y2); - // For very large arguments the the reduction to the [-Pi/4,+Pi/4] range - // does not work thus leading to sine/cosine out of the [-1:1] range. 
- // Since computing the sine/cosine for very large entry entries makes little - // sense in term of accuracy, we simply clamp to [-1,1]: - y = pmin(y,pset1( 1.f)); - y = pmax(y,pset1(-1.f)); - - // Update the sign - return pxor(y, sign_bit); + // Update the sign and filter huge inputs + return pselect(huge_mask, huge_vals, pxor(y, sign_bit)); } template diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3e7a75bc0..0003be43b 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -812,6 +812,17 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) +// { +// return _mm_movemask_ps(x) == 0xF; +// } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + return _mm_movemask_ps(x) != 0x0; +} + #if EIGEN_COMP_GNUC // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) // { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 1158c4f9a..22a24039a 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -568,17 +568,22 @@ template void packetmath_real() if(PacketTraits::HasCos) { packet_helper h; - for(Scalar k = 1; k::epsilon(); k*=2) { - data1[0] = k*Scalar(EIGEN_PI) * internal::random(0.8,1.2); - data1[1] = (k+1)*Scalar(EIGEN_PI) * internal::random(0.8,1.2); - h.store(data2, internal::pcos(h.load(data1))); - VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); - VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.)); - data1[0] = (2*k+1)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); - data1[1] = (2*k+3)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); - h.store(data2, internal::psin(h.load(data1))); - VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); - VERIFY(data2[1]<=Scalar(1.) 
&& data2[1]>=Scalar(-1.)); + for(Scalar k = 1; k::epsilon(); k*=2) + { + for(int k1=0;k1<=1; ++k1) + { + data1[0] = (2*k+k1 )*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); + data1[1] = (2*k+2+k1)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); + h.store(data2, internal::pcos(h.load(data1))); + h.store(data2+PacketSize, internal::psin(h.load(data1))); + VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); + VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.)); + VERIFY(data2[PacketSize+0]<=Scalar(1.) && data2[PacketSize+0]>=Scalar(-1.)); + VERIFY(data2[PacketSize+1]<=Scalar(1.) && data2[PacketSize+1]>=Scalar(-1.)); + + VERIFY_IS_APPROX(numext::abs2(data2[0])+numext::abs2(data2[PacketSize+0]), Scalar(1)); + VERIFY_IS_APPROX(numext::abs2(data2[1])+numext::abs2(data2[PacketSize+1]), Scalar(1)); + } } data1[0] = std::numeric_limits::infinity(); @@ -596,6 +601,12 @@ template void packetmath_real() VERIFY((numext::isnan)(data2[0])); h.store(data2, internal::pcos(h.load(data1))); VERIFY((numext::isnan)(data2[0])); + + data1[0] = -Scalar(0.); + h.store(data2, internal::psin(h.load(data1))); + VERIFY( internal::biteq(data2[0], data1[0]) ); + h.store(data2, internal::pcos(h.load(data1))); + VERIFY_IS_EQUAL(data2[0], Scalar(1)); } } } @@ -633,6 +644,29 @@ template void packetmath_notcomplex() ref[i] = data1[0]+Scalar(i); internal::pstore(data2, internal::plset(data1[0])); VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset"); + + { + unsigned char* data1_bits = reinterpret_cast(data1); + // predux_all - not needed yet + // for (unsigned int i=0; i(data1)) && "internal::predux_all(1111)"); + // for(int k=0; k(data1))) && "internal::predux_all(0101)"); + // for (unsigned int i=0; i(data1))) && "internal::predux_any(0000)"); + for(int k=0; k(data1)) && "internal::predux_any(0101)"); + for (unsigned int i=0; i void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) From aeec68f77b61c2d9fb8323ee7951bff3458d5f3f Mon Sep 17 00:00:00 2001 From: 
Gael Guennebaud Date: Wed, 9 Jan 2019 15:36:41 +0100 Subject: [PATCH 099/295] Add missing pcmp_lt and others for AVX512 --- Eigen/src/Core/arch/AVX512/PacketMath.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 72b09d998..564eb97dc 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -283,9 +283,27 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { } #endif +template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { + __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); + return cat256(lo, hi); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { + __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); + return cat256(lo, hi); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); + return cat256(lo, hi); +} + template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = _mm256_cmp_ps(extract256<0>(a), extract256<0>(b), _CMP_NGE_UQ); - __m256 hi = _mm256_cmp_ps(extract256<1>(a), extract256<1>(b), _CMP_NGE_UQ); + __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); return cat256(lo, hi); } From 3f14e0d19e44d882b21b7c6b2370a22d2b15c7b9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 15:45:21 +0100 Subject: [PATCH 100/295] fix warning --- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 3c167247e..8c6e4f5c7 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -332,8 +332,11 @@ Packet psincos_float(const Packet& _x) // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee #endif - Packet huge_mask = pcmp_le(pset1(huge_th),pabs(_x)); - Packet huge_vals; + // We use huge_vals as a temporary for abs(_x) to ensure huge_vals + // is fully initialized for the last pselect(). (prevent compiler warning) + Packet huge_vals = pabs(_x); + Packet huge_mask = pcmp_le(pset1(huge_th),huge_vals); + if(predux_any(huge_mask)) { const int PacketSize = unpacket_traits::size; From 47810cf5b7286b03084b6ec2fb488c2f3eeddbcc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 16:40:42 +0100 Subject: [PATCH 101/295] Add dedicated implementations of predux_any for AVX512, NEON, and Altivec/VSE --- Eigen/src/Core/arch/AVX512/PacketMath.h | 7 +++++++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 5 +++++ Eigen/src/Core/arch/NEON/PacketMath.h | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 564eb97dc..95eb9d42f 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -969,6 +969,13 @@ EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) +{ + Packet16i xi = _mm512_castps_si512(x); + __mmask16 tmp = _mm512_test_epi32_mask(xi,xi); + return !_mm512_kortestz(tmp,tmp); +} + template struct palign_impl { static EIGEN_STRONG_INLINE void run(Packet16f& first, diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index d0ee93f4a..9464264a8 100755 
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -720,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) return pfirst(res); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + return vec_any_ne(x, pzero(x)); +} + template struct palign_impl { diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ed3cec88a..8c3637258 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -551,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) return vget_lane_s32(max, 0); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), + vget_high_u32(vreinterpretq_u32_f32(x))); + return vget_lane_u32(vpmax_u32(tmp,tmp),0); +} + // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 #define PALIGN_NEON(Offset,Type,Command) \ From 3492a1ca74552ebfc4e4ed368ebdf2597f9b8452 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 16:53:37 +0100 Subject: [PATCH 102/295] fix plog(+inf) with AVX512 --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index aac707596..c2158c538 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -47,6 +47,7 @@ plog(const Packet16f& _x) { // The smallest non denormalized float number. 
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); // Polynomial coefficients. @@ -116,10 +117,16 @@ plog(const Packet16f& _x) { x = padd(x, y); x = padd(x, y2); - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN, + // - 0 will be -INF. + // - +INF will be +INF return _mm512_mask_blend_ps(iszero_mask, - _mm512_mask_blend_ps(invalid_mask, x, p16f_nan), - p16f_minus_inf); + _mm512_mask_blend_ps(invalid_mask, + _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf), + p16f_nan), + p16f_minus_inf); } #endif From d812f411c3f99e93a774b80ed3772603303c6c59 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 18:00:05 +0100 Subject: [PATCH 103/295] bug #1654: fix compilation with cuda and no c++11 --- Eigen/src/Core/util/StaticAssert.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 9 +++++---- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_gpu.cu | 8 +++++++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index b2f95153e..67714e444 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -104,7 +104,8 @@ STORAGE_INDEX_MUST_MATCH=1, CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, - INVALID_TEMPLATE_PARAMETER=1 + INVALID_TEMPLATE_PARAMETER=1, + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1 }; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index 056665749..5d19652e6 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -1219,9 +1219,6 @@ template, GpuDevice> : public TensorContractionEvaluatorBase, GpuDevice> > { - static_assert(std::is_same::value, - "GPU tensor contraction does not support output kernels."); - typedef GpuDevice Device; typedef TensorEvaluator, Device> Self; @@ -1274,7 +1271,11 @@ struct TensorEvaluator::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } // We need to redefine this method to make nvcc happy EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cda658e0e..e8e1dc832 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -258,7 +258,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") endif() - if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) + if(( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) AND EIGEN_TEST_CXX11) set(EIGEN_CUDA_CXX11_FLAG "-std=c++11") else() # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11) diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu index 14fc0bd04..94625e6a3 100644 --- a/unsupported/test/cxx11_tensor_gpu.cu +++ b/unsupported/test/cxx11_tensor_gpu.cu @@ -17,6 +17,8 @@ #include +#define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11 + using Eigen::Tensor; void test_gpu_nullary() { @@ -617,6 +619,7 @@ void test_gpu_convolution_3d() } +#if EIGEN_GPU_TEST_C99_MATH template void test_gpu_lgamma(const Scalar stddev) { @@ -655,6 +658,7 @@ void test_gpu_lgamma(const Scalar stddev) gpuFree(d_in); gpuFree(d_out); } +#endif template void test_gpu_digamma() @@ -986,6 +990,7 @@ void test_gpu_igammac() gpuFree(d_out); } +#if EIGEN_GPU_TEST_C99_MATH template void test_gpu_erf(const Scalar stddev) { @@ -1063,6 
+1068,7 @@ void test_gpu_erfc(const Scalar stddev) gpuFree(d_in); gpuFree(d_out); } +#endif template void test_gpu_betainc() @@ -1494,7 +1500,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_gpu) CALL_SUBTEST_3(test_gpu_convolution_3d()); #endif -#if __cplusplus > 199711L +#if EIGEN_GPU_TEST_C99_MATH // std::erf, std::erfc, and so on where only added in c++11. We use them // as a golden reference to validate the results produced by Eigen. Therefore // we can only run these tests if we use a c++11 compiler. From cb955df9a6fd5cb2673a7a15172609ce2dafdde8 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:17:08 -0800 Subject: [PATCH 104/295] Add packet up "pones". Write pnot(a) as pxor(pones(a), a). --- Eigen/src/Core/GenericPacketMath.h | 47 ++++++++++-------------- Eigen/src/Core/arch/AVX/Complex.h | 4 ++ Eigen/src/Core/arch/AVX/PacketMath.h | 19 ++++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 26 +++++++++---- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 ++++++++ Eigen/src/Core/arch/SSE/Complex.h | 5 +++ Eigen/src/Core/arch/SSE/PacketMath.h | 11 ++++++ test/packetmath.cpp | 2 + 8 files changed, 95 insertions(+), 35 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..777c74f57 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,21 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + +/** \internal \returns zeros */ template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } +pzero(const Packet& a) { return pxor(a,a); } + +/** \internal \returns ones */ 
+template EIGEN_DEVICE_FUNC inline Packet +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} /** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = pset1(Scalar(1)); - return pandnot(ones, a); -} +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -250,36 +254,25 @@ pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); } template EIGEN_DEVICE_FUNC inline Packet pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); } -/** \internal \returns zeros */ -template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& a) { return pxor(a,a); } - /** \internal \returns bits of \a or \b according to the input bit mask \a mask */ template EIGEN_DEVICE_FUNC inline Packet -pselect(const Packet& mask, const Packet& a, const Packet& b) { - return por(pand(a,mask),pandnot(b,mask)); -} +pselect(const Packet& mask, const Packet& a, const Packet& b) { return por(pand(a,mask),pandnot(b,mask)); } /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? 
pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,9 +282,9 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } -/** \internal \returns a packet with constant coefficients set from bits */ -template EIGEN_DEVICE_FUNC inline Packet -pset1frombits(BitsType a); +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..d880ef593 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -76,6 +76,8 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(real_and_imag_equal); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return 
Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -290,6 +292,8 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(real_and_imag_equal); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE 
Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..d258fd07b 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -295,12 +295,6 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packe return cat256(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); -} - template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); @@ -317,14 +311,30 @@ template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> 
+EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. 
@@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..8372cedfb 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return 
Packet1cd(_mm_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); From f2767112c88762ddc62e8c066dd3377a3d89da31 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:29:18 -0800 Subject: [PATCH 105/295] Simplify a bit. 
--- Eigen/src/Core/arch/AVX/Complex.h | 6 ++---- Eigen/src/Core/arch/AVX512/Complex.h | 10 ++-------- Eigen/src/Core/arch/SSE/Complex.h | 6 ++---- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index d880ef593..9f1bb969e 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,8 +72,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template <> EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); - return Packet4cf(real_and_imag_equal); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } @@ -288,8 +287,7 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template <> EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); - return Packet2cd(real_and_imag_equal); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2c613f870..154fedc25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, template <> EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { __m512 eq = pcmp_eq(a.v, b.v); - __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castps_si512(eq), 
_mm512_castps_si512(eq_swap_real_imag)); - return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } @@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, template <> EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { __m512d eq = pcmp_eq(a.v, b.v); - __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); - return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 8372cedfb..875cb09e0 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -447,15 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); - return Packet2cf(real_and_imag_equal); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); - return Packet1cd(real_and_imag_equal); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { From e00521b5149b8752c499b6b36df4ddce31246f43 Mon Sep 17 00:00:00 2001 From: Rasmus Munk 
Larsen Date: Wed, 9 Jan 2019 16:32:53 -0800 Subject: [PATCH 106/295] Undo useless diffs. --- Eigen/src/Core/GenericPacketMath.h | 30 ++++++++++++++++-------------- Eigen/src/Core/arch/SSE/Complex.h | 2 +- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 777c74f57..7692bafac 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,18 +214,6 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients set from bits */ -template EIGEN_DEVICE_FUNC inline Packet -pset1frombits(BitsType a); - -/** \internal \returns zeros */ -template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& a) { return pxor(a,a); } - -/** \internal \returns ones */ -template EIGEN_DEVICE_FUNC inline Packet -pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} - /** \internal \returns the bitwise not of \a a */ template EIGEN_DEVICE_FUNC inline Packet pnot(const Packet& a) { return pxor(pones(a), a);} @@ -254,9 +242,19 @@ pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); } template EIGEN_DEVICE_FUNC inline Packet pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); } +/** \internal \returns zeros */ +template EIGEN_DEVICE_FUNC inline Packet +pzero(const Packet& a) { return pxor(a,a); } + +/** \internal \returns ones */ +template EIGEN_DEVICE_FUNC inline Packet +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} + /** \internal \returns bits of \a or \b according to the input bit mask \a mask */ template EIGEN_DEVICE_FUNC inline Packet -pselect(const Packet& mask, const Packet& a, const Packet& b) { return por(pand(a,mask),pandnot(b,mask)); } +pselect(const Packet& mask, const Packet& a, const 
Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); +} /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet @@ -286,7 +284,11 @@ ploadu(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } -/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ +/** \/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + +internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 875cb09e0..fa84097ac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,7 +82,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } From 1119c73d22dbf67404f097bc10a03b261c72f408 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: [PATCH 107/295] Collapsed revision * Add packet up "pones". Write pnot(a) as pxor(pones(a), a). * Collapsed revision * Simplify a bit. * Undo useless diffs. * Fix typo. 
--- Eigen/src/Core/GenericPacketMath.h | 33 ++++++++++-------------- Eigen/src/Core/arch/AVX/Complex.h | 10 ++++--- Eigen/src/Core/arch/AVX/PacketMath.h | 19 ++++++++++++++ Eigen/src/Core/arch/AVX512/Complex.h | 10 ++----- Eigen/src/Core/arch/AVX512/PacketMath.h | 26 +++++++++++++------ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 ++++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 11 +++++--- Eigen/src/Core/arch/SSE/PacketMath.h | 11 ++++++++ test/packetmath.cpp | 2 ++ 9 files changed, 95 insertions(+), 43 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..048fc5157 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,9 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ -template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } - /** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = pset1(Scalar(1)); - return pandnot(ones, a); -} +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -254,6 +246,10 @@ pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); template EIGEN_DEVICE_FUNC inline Packet pzero(const Packet& a) { return pxor(a,a); } +/** \internal \returns ones */ +template EIGEN_DEVICE_FUNC inline Packet +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} + /** \internal \returns bits of \a or \b according to the input bit mask \a mask */ template EIGEN_DEVICE_FUNC inline Packet 
pselect(const Packet& mask, const Packet& a, const Packet& b) { @@ -262,24 +258,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,6 +280,10 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..9f1bb969e 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,10 +72,11 @@ 
template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template <> EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); - return Packet4cf(real_and_imag_equal); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -286,10 +287,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template <> EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); - return Packet2cd(real_and_imag_equal); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return 
Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2c613f870..154fedc25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, template <> EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { __m512 eq = pcmp_eq(a.v, b.v); - __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castps_si512(eq), 
_mm512_castps_si512(eq_swap_real_imag)); - return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } @@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, template <> EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { __m512d eq = pcmp_eq(a.v, b.v); - __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); - return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..d258fd07b 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -295,12 +295,6 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packe return cat256(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); -} - template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); @@ -317,14 +311,30 @@ template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + 
_mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct 
overload. @@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..fa84097ac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return 
Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -442,15 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); - return Packet2cf(real_and_imag_equal); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); - return Packet1cd(real_and_imag_equal); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) 
{ return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); From 8f178429b9d0517a5a63da6dd73adbd84a15b375 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: [PATCH 108/295] Collapsed revision * Collapsed revision * Add packet up "pones". Write pnot(a) as pxor(pones(a), a). * Collapsed revision * Simplify a bit. * Undo useless diffs. * Fix typo. --- Eigen/src/Core/GenericPacketMath.h | 8 ++++---- Eigen/src/Core/arch/AVX512/PacketMath.h | 23 +++++++++++------------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 048fc5157..8bcceaa7b 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,6 +214,10 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } +/** \internal \returns ones */ +template EIGEN_DEVICE_FUNC inline Packet +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} + /** \internal \returns the bitwise not of \a a */ template EIGEN_DEVICE_FUNC inline Packet pnot(const Packet& a) { return pxor(pones(a), a);} @@ -246,10 +250,6 @@ pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); template EIGEN_DEVICE_FUNC inline Packet pzero(const Packet& a) { return pxor(a,a); } -/** \internal \returns ones */ -template EIGEN_DEVICE_FUNC inline Packet -pones(const Packet& /*a*/) { 
Packet b; memset(&b, 0xff, sizeof(b)); return b;} - /** \internal \returns bits of \a or \b according to the input bit mask \a mask */ template EIGEN_DEVICE_FUNC inline Packet pselect(const Packet& mask, const Packet& a, const Packet& b) { diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index d258fd07b..9666c4e22 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -284,27 +284,26 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { #endif template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); - __m256i hi = 
_mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); - return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } template <> From 8f044425263e876236030f62461507325edfdf44 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: [PATCH 109/295] Collapsed revision * Collapsed revision * Add packet up "pones". Write pnot(a) as pxor(pones(a), a). * Collapsed revision * Simplify a bit. * Undo useless diffs. * Fix typo. --- Eigen/src/Core/GenericPacketMath.h | 29 ++++++-------- Eigen/src/Core/arch/AVX/Complex.h | 10 +++-- Eigen/src/Core/arch/AVX/PacketMath.h | 19 +++++++++ Eigen/src/Core/arch/AVX512/Complex.h | 10 +---- Eigen/src/Core/arch/AVX512/PacketMath.h | 49 ++++++++++++++---------- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 ++++++++ Eigen/src/Core/arch/SSE/Complex.h | 11 ++++-- Eigen/src/Core/arch/SSE/PacketMath.h | 11 ++++++ test/packetmath.cpp | 2 + 9 files changed, 104 insertions(+), 53 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..8bcceaa7b 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,13 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} /** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = 
pset1(Scalar(1)); - return pandnot(ones, a); -} +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -262,24 +258,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,6 +280,10 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git 
a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..9f1bb969e 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,10 +72,11 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template <> EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); - return Packet4cf(real_and_imag_equal); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -286,10 +287,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template <> EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); - return Packet2cd(real_and_imag_equal); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por 
(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2c613f870..154fedc25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, template <> EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { 
__m512 eq = pcmp_eq(a.v, b.v); - __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag)); - return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } @@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, template <> EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { __m512d eq = pcmp_eq(a.v, b.v); - __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); - return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..9666c4e22 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -284,47 +284,56 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { #endif template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); - return 
cat256(lo, hi); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); - __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); - return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE 
Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. 
@@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..fa84097ac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return 
Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -442,15 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); - return Packet2cf(real_and_imag_equal); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); - return Packet1cd(real_and_imag_equal); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) 
{ return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); From f6ba6071c583ae45cb379603e5a57cf65f01f44a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: [PATCH 110/295] Fix typo. --- Eigen/src/Core/GenericPacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 7692bafac..048fc5157 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -284,11 +284,11 @@ ploadu(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } -/** \/** \internal \returns a packet with constant coefficients set from bits */ +/** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); -internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ +/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } From e15bb785adf756f3e48410ee681ca97ad5bb3e76 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: [PATCH 111/295] Collapsed revision * Add packet up "pones". Write pnot(a) as pxor(pones(a), a). * Collapsed revision * Simplify a bit. 
* Undo useless diffs. * Fix typo. --- Eigen/src/Core/GenericPacketMath.h | 29 ++++++-------- Eigen/src/Core/arch/AVX/Complex.h | 10 +++-- Eigen/src/Core/arch/AVX/PacketMath.h | 19 +++++++++ Eigen/src/Core/arch/AVX512/Complex.h | 10 +---- Eigen/src/Core/arch/AVX512/PacketMath.h | 49 ++++++++++++++---------- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 ++++++++ Eigen/src/Core/arch/SSE/Complex.h | 11 ++++-- Eigen/src/Core/arch/SSE/PacketMath.h | 11 ++++++ test/packetmath.cpp | 2 + 9 files changed, 104 insertions(+), 53 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..8bcceaa7b 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,13 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} /** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = pset1(Scalar(1)); - return pandnot(ones, a); -} +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -262,24 +258,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? 
pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,6 +280,10 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..9f1bb969e 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,10 +72,11 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template <> EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); - return Packet4cf(real_and_imag_equal); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 
0xb1))); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -286,10 +287,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template <> EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); - return Packet2cd(real_and_imag_equal); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return 
_mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2c613f870..154fedc25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, template <> EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { __m512 eq = pcmp_eq(a.v, b.v); - __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag)); - return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } @@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, template <> 
EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { __m512d eq = pcmp_eq(a.v, b.v); - __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); - return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..9666c4e22 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -284,47 +284,56 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { #endif template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); - __m256 hi = 
pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); - __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); - return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- 
a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. @@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..fa84097ac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const 
Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -442,15 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); - return Packet2cf(real_and_imag_equal); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); - return Packet1cd(real_and_imag_equal); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h 
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); From 0abe03764c697ed8da37ce4421dd1918aa7a9b5f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 10 Jan 2019 10:27:55 -0800 Subject: [PATCH 112/295] Fix shorten-64-to-32 warning in TensorContractionThreadPool --- .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 
9666bf167..d68409e26 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -890,7 +890,8 @@ struct TensorEvaluatortemplate evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, /*num_threads=*/num_blocks)); + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); // Check if it was the last task in l0 range. const Index l0_index = block_idx / l0_size; From 0522460a0d01d4253183349a49144b5ad8ba2f9f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 11 Jan 2019 11:07:56 +0100 Subject: [PATCH 113/295] bug #1656: Enable failtests only if BUILD_TESTING is enabled --- CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5255e9600..48c0a6367 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -469,6 +469,8 @@ if(BUILD_TESTING) else() add_subdirectory(test EXCLUDE_FROM_ALL) endif() + + add_subdirectory(failtest) endif() if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -519,8 +521,6 @@ message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -add_subdirectory(failtest) - string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) if(cmake_generator_tolower MATCHES "makefile") message(STATUS "Some things you can do now:") @@ -537,8 +537,10 @@ if(cmake_generator_tolower MATCHES "makefile") message(STATUS " | Or:") message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") message(STATUS "make doc | Generate the API documentation, requires Doxygen & LaTeX") - message(STATUS "make check | Build and run the unit-tests. Read this page:") - message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + if(BUILD_TESTING) + message(STATUS "make check | Build and run the unit-tests. 
Read this page:") + message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + endif() message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)") message(STATUS "make uninstall| Removes files installed by make install") message(STATUS "--------------+--------------------------------------------------------------") From 3c9add6598cc35e5317788627dfa81f517e89e07 Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Fri, 11 Jan 2019 14:02:09 +0100 Subject: [PATCH 114/295] Remove reinterpret_cast from AVX512 complex implementation The reinterpret_casts used in ptranspose(PacketBlock&) ptranspose(PacketBlock&) don't appear to be working correctly. They're used to convert the kernel parameters to PacketBlock& so that the complex number versions of ptranspose can be written using the existing double implementations. Unfortunately, they don't seem to work and are responsible for 9 unit test failures in the AVX512 build of tensorflow master. This commit fixes the issue by manually initialising PacketBlock variables with the contents of the kernel parameter before calling the double version of ptranspose, and then copying the resulting values back into the kernel parameter before returning. 
--- Eigen/src/Core/arch/AVX512/Complex.h | 32 ++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 42cdfcd25..9c4ee1235 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -390,12 +390,40 @@ template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - ptranspose(reinterpret_cast&>(kernel)); + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); } EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - ptranspose(reinterpret_cast&>(kernel)); + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v); + pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v); + pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v); + pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); + kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]); + kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]); + kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]); + 
kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]); } EIGEN_DEVICE_FUNC inline void From 9005f0111f3b5f4d29939ee67a5b516b0585455f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 11 Jan 2019 17:10:54 +0100 Subject: [PATCH 115/295] Replace compiler's alignas/alignof extension by respective c++11 keywords when available. This also fix a compilation issue with gcc-4.7. --- .../arch/Default/GenericPacketMathFunctions.h | 4 -- Eigen/src/Core/util/ConfigureVectorization.h | 7 ++- Eigen/src/Core/util/Macros.h | 43 ++++++++++++++++--- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 8c6e4f5c7..9a902c82d 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -340,11 +340,7 @@ Packet psincos_float(const Packet& _x) if(predux_any(huge_mask)) { const int PacketSize = unpacket_traits::size; - #if EIGEN_HAS_CXX11 - alignas(Packet) float vals[PacketSize]; - #else EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; - #endif pstoreu(vals, _x); for(int k=0; k=11 && EIGEN_HAS_CXX11 && \ + ( __has_feature(cxx_alignas) \ + || EIGEN_HAS_CXX14 \ + || (EIGEN_COMP_MSVC >= 1800) \ + || (EIGEN_GNUC_AT_LEAST(4,8)) \ + || (EIGEN_COMP_CLANG>=305) \ + || (EIGEN_COMP_ICC>=1500) \ + || (EIGEN_COMP_PGI>=1500) \ + || (EIGEN_COMP_SUN>=0x5130)) +#define EIGEN_HAS_ALIGNAS 1 +#else +#define EIGEN_HAS_ALIGNAS 0 +#endif +#endif + // Does the compiler support type_traits? // - full support of type traits was added only to GCC 5.1.0. // - 20150626 corresponds to the last release of 4.x libstdc++ From df29511ac0486639e23fe65c7edafecc2d9f1579 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 11 Jan 2019 10:36:36 -0800 Subject: [PATCH 116/295] Fix merge. 
--- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 85a32a18d..00e40d40b 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,11 +143,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } -<<<<<<< working copy template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { -======= -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { ->>>>>>> merge rev half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; } @@ -652,13 +648,8 @@ template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; } -<<<<<<< working copy template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; -======= -template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { - Packet16h r; r.x = Packet8i(pones(a.x)); return r; ->>>>>>> merge rev } template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { @@ -1106,11 +1097,7 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } -<<<<<<< working copy template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { -======= -template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { ->>>>>>> merge rev Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; } From 1e6d15b55b67d6170d4548fa7f20acbb327814bf Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 11 Jan 2019 11:41:53 -0800 Subject: [PATCH 117/295] Fix shorten-64-to-32 warning in TensorContractionThreadPool --- .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index d68409e26..e06099957 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -879,7 +879,8 @@ struct TensorEvaluator> l0_state(l0_ranges); for (int i = 0; i < l0_ranges; ++i) { - l0_state.emplace_back(actual_range_size(l0_ranges, l0_size, i)); + const Index num_pending_tasks = actual_range_size(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index(num_pending_tasks)); } MaxSizeVector block_buffers(num_blocks); From a49d01edbaf992516e4dfd821f27eacd18a8fd38 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 11 Jan 2019 13:18:17 -0800 Subject: [PATCH 118/295] Fix warnings in ptrue for complex and half types. --- Eigen/src/Core/GenericPacketMath.h | 7 +++++++ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 95c4e4027..bb3275fe8 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -218,6 +218,13 @@ pandnot(const Packet& a, const Packet& b) { return a & (~b); } template EIGEN_DEVICE_FUNC inline Packet ptrue(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} +template +EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { + RealScalar b; + b = ptrue(b); + return std::complex(b, b); +} + /** \internal \returns the bitwise not of \a a */ template EIGEN_DEVICE_FUNC inline Packet pnot(const Packet& a) { return pxor(ptrue(a), a);} diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 00e40d40b..80d6d4de7 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half ptrue(const half& a) { + return __half_raw(0xffffu); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; From 89c4001d6f5136fd2702258e4fa754be31d682a1 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 11 Jan 2019 14:10:57 -0800 Subject: [PATCH 119/295] Fix warnings in ptrue for complex and half types. --- Eigen/src/Core/GenericPacketMath.h | 7 +++++++ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 95c4e4027..bb3275fe8 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -218,6 +218,13 @@ pandnot(const Packet& a, const Packet& b) { return a & (~b); } template EIGEN_DEVICE_FUNC inline Packet ptrue(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} +template +EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { + RealScalar b; + b = ptrue(b); + return std::complex(b, b); +} + /** \internal \returns the bitwise not of \a a */ template EIGEN_DEVICE_FUNC inline Packet pnot(const Packet& a) { return pxor(ptrue(a), a);} diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 00e40d40b..eab7be14c 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,10 +143,14 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half ptrue(const half& a) { + return __half_raw(0xffffu); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; -} +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { From 
28ba1b2c3209ea44956011dd5efd8d2b49a6f263 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 11 Jan 2019 17:45:37 -0800 Subject: [PATCH 120/295] Add support for inverse hyperbolic functions. Fix cost of division. --- Eigen/src/Core/GlobalFunctions.h | 5 ++ Eigen/src/Core/MatrixBase.h | 5 ++ Eigen/src/Core/functors/UnaryFunctors.h | 61 +++++++++++++++++++++++-- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 42 ++++++++++++++++- test/array_cwise.cpp | 5 ++ 5 files changed, 114 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 563df6e84..71377cee5 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -66,6 +66,11 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) +#if EIGEN_HAS_CXX11_MATH + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh) +#endif EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 596cdd133..4744e5cc4 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -468,6 +468,11 @@ template class MatrixBase const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; 
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) +#if EIGEN_HAS_CXX11_MATH + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine) +#endif EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 0c2d2cfca..55994047e 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -548,6 +548,23 @@ struct functor_traits > { }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the atanh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atanh() + */ +template +struct scalar_atanh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the sinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::sinh() @@ -567,6 +584,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the asinh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::asinh() + */ +template +struct scalar_asinh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * 
\brief Template functor to compute the cosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::cosh() @@ -586,6 +620,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the acosh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::acosh() + */ +template +struct scalar_acosh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() @@ -598,9 +649,13 @@ struct scalar_inverse_op { EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1(Scalar(1)),a); } }; -template -struct functor_traits > -{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasDiv }; }; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasDiv, + Cost = scalar_div_cost::value + }; +}; /** \internal * \brief Template functor to compute the square of a scalar diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index e928db467..2f99ee0b2 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -23,6 +23,11 @@ typedef CwiseUnaryOp, const Derived> AtanReturn typedef CwiseUnaryOp, const Derived> TanhReturnType; typedef CwiseUnaryOp, const Derived> LogisticReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; +#if EIGEN_HAS_CXX11_MATH +typedef CwiseUnaryOp, const Derived> AtanhReturnType; +typedef CwiseUnaryOp, const Derived> AsinhReturnType; +typedef CwiseUnaryOp, const Derived> AcoshReturnType; +#endif typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; typedef CwiseUnaryOp, const 
Derived> CubeReturnType; @@ -327,7 +332,7 @@ sinh() const * Example: \include Cwise_cosh.cpp * Output: \verbinclude Cwise_cosh.out * - * \sa Math functions, tan(), sinh(), cosh() + * \sa Math functions, tanh(), sinh(), cosh() */ EIGEN_DEVICE_FUNC inline const CoshReturnType @@ -336,6 +341,41 @@ cosh() const return CoshReturnType(derived()); } +#if EIGEN_HAS_CXX11_MATH +/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AtanhReturnType +atanh() const +{ + return AtanhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AsinhReturnType +asinh() const +{ + return AsinhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AcoshReturnType +acosh() const +{ + return AcoshReturnType(derived()); +} +#endif + /** \returns an expression of the coefficient-wise logistic of *this. 
*/ EIGEN_DEVICE_FUNC diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 84e46665b..fbc63a81d 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -231,6 +231,11 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.sinh(), sinh(m1)); VERIFY_IS_APPROX(m1.cosh(), cosh(m1)); VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); +#if EIGEN_HAS_CXX11_MATH + VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1))); + VERIFY_IS_APPROX(m1.sinh().asinh(), asinh(sinh(m1))); + VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1))); +#endif VERIFY_IS_APPROX(m1.logistic(), logistic(m1)); VERIFY_IS_APPROX(m1.arg(), arg(m1)); From f566724023e1a82be7fecfe0639e908772d3cea6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 13 Jan 2019 17:54:30 +0100 Subject: [PATCH 121/295] Fix StorageIndex FIXME in dense LU solvers --- Eigen/src/LU/FullPivLU.h | 10 +++++----- Eigen/src/LU/PartialPivLU.h | 2 +- test/lu.cpp | 4 ++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 344ec8926..b4f4bc6ee 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -18,6 +18,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -64,7 +65,6 @@ template class FullPivLU typedef SolverBase Base; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -529,8 +529,8 @@ void FullPivLU::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; ++i) { - m_rowsTranspositions.coeffRef(i) = i; - m_colsTranspositions.coeffRef(i) = i; + m_rowsTranspositions.coeffRef(i) = internal::convert_index(i); + m_colsTranspositions.coeffRef(i) = internal::convert_index(i); } break; } @@ -541,8 +541,8 @@ void FullPivLU::computeInPlace() // 
Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). - m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner; - m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner; + m_rowsTranspositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_colsTranspositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner)); ++number_of_transpositions; diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index bfcd2c95b..ecc0e748f 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -19,6 +19,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -80,7 +81,6 @@ template class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase Base; EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime diff --git a/test/lu.cpp b/test/lu.cpp index 24bea784a..46fd60555 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -18,6 +18,8 @@ typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { template void lu_non_invertible() { + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + typedef typename MatrixType::RealScalar RealScalar; /* this test covers the following files: LU.h @@ -191,6 +193,8 @@ template void lu_partial_piv() m1.setRandom(); PartialPivLU plu(m1); + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + VERIFY_IS_APPROX(m1, plu.reconstructedMatrix()); m3 = MatrixType::Random(size,size); From 4356a55a61c99faec681b20c5477b7e7012ca128 Mon Sep 17 00:00:00 2001 From: Gael 
Guennebaud Date: Mon, 14 Jan 2019 13:54:01 +0100 Subject: [PATCH 122/295] PR 571: Implements an accurate argument reduction algorithm for huge inputs of sin/cos and call it instead of falling back to std::sin/std::cos. This makes both the small and huge argument cases faster because: - for small inputs this removes the last pselect - for large inputs only the reduction part follows a scalar path, the rest use the same SIMD path as the small-argument case. --- .../arch/Default/GenericPacketMathFunctions.h | 119 +++++++++++++----- Eigen/src/Core/util/Meta.h | 33 +++++ 2 files changed, 119 insertions(+), 33 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 9a902c82d..ce3f0fc68 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -3,7 +3,7 @@ // // Copyright (C) 2007 Julien Pommier // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) -// Copyright (C) 2009-2018 Gael Guennebaud +// Copyright (C) 2009-2019 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -253,15 +253,68 @@ Packet pexp_double(const Packet _x) return pmax(pldexp(x,fx), _x); } -/* The code is the rewriting of the cephes sinf/cosf functions. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). +// The following code is inspired by the following stack-overflow answer: +// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 +// It has been largely optimized: +// - By-pass calls to frexp. +// - Aligned loads of required 96 bits of 2/pi. 
This is accomplished by +// (1) balancing the mantissa and exponent so that the required bits of 2/pi are +// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. +// - Avoid a branch in rounding and extraction of the remaining fractional part. +// Overall, I measured a speed up higher than x2 on x86-64. +inline float trig_reduce_huge (float xf, int *quadrant) +{ + using Eigen::numext::int32_t; + using Eigen::numext::uint32_t; + using Eigen::numext::int64_t; + using Eigen::numext::uint64_t; - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. -*/ + const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62 + const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format + + // 192 bits of 2/pi for Payne-Hanek reduction + // Bits are introduced by packet of 8 to enable aligned reads. + static const uint32_t two_over_pi [] = + { + 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, + 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, + 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, + 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, + 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566, + 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, + 0x10e41000, 0xe4100000 + }; + + uint32_t xi = numext::as_uint(xf); + // Below, -118 = -126 + 8. + // -126 is to get the exponent, + // +8 is to enable alignment of 2/pi's bits on 8 bits. + // This is possible because the fractional part of x has only 24 meaningful bits. + uint32_t e = (xi >> 23) - 118; + // Extract the mantissa and shift it to align it wrt the exponent + xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7); + + uint32_t i = e >> 3; + uint32_t twoopi_1 = two_over_pi[i-1]; + uint32_t twoopi_2 = two_over_pi[i+3]; + uint32_t twoopi_3 = two_over_pi[i+7]; + + // Compute x * 2/pi in 2.62-bit fixed-point format.
+ uint64_t p; + p = uint64_t(xi) * twoopi_3; + p = uint64_t(xi) * twoopi_2 + (p >> 32); + p = (uint64_t(xi * twoopi_1) << 32) + p; + + // Round to nearest: add 0.5 and extract integral part. + uint64_t q = (p + zero_dot_five) >> 62; + *quadrant = int(q); + // Now it remains to compute "r = x - q*pi/2" with high accuracy, + // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as: + // r = (p-q)*pi/2, + // where the product can be carried out with sufficient accuracy using double precision. + p -= q<<62; + return float(double(int64_t(p)) * pio2_62); +} template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -285,17 +338,6 @@ Packet psincos_float(const Packet& _x) PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi - // Compute the sign to apply to the polynomial. - // sin: sign = second_bit(y_int) xor signbit(_x) - // cos: sign = second_bit(y_int+1) - Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<30>(y_int))) - : preinterpret(pshiftleft<30>(padd(y_int,csti_1))); - sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit - - // Get the polynomial selection mask from the second bit of y_int - // We'll calculate both (sin and cos) polynomials and then select from the two. - Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 // using "Extended precision modular arithmetic" #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) @@ -332,25 +374,36 @@ Packet psincos_float(const Packet& _x) // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee #endif - // We use huge_vals as a temporary for abs(_x) to ensure huge_vals - // is fully initialized for the last pselect().
(prevent compiler warning) - Packet huge_vals = pabs(_x); - Packet huge_mask = pcmp_le(pset1(huge_th),huge_vals); - - if(predux_any(huge_mask)) + if(predux_any(pcmp_le(pset1(huge_th),pabs(_x)))) { const int PacketSize = unpacket_traits::size; EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; - pstoreu(vals, _x); - for(int k=0; k=huge_th) { - vals[k] = ComputeSine ? std::sin(val) : std::cos(val); - } + if(val>=huge_th && (numext::isfinite)(val)) + x_cpy[k] = trig_reduce_huge(val,&y_int2[k]); } - huge_vals = ploadu(vals); + x = ploadu(x_cpy); + y_int = ploadu(y_int2); } + // Compute the sign to apply to the polynomial. + // sin: sign = second_bit(y_int) xor signbit(_x) + // cos: sign = second_bit(y_int+1) + Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<30>(y_int))) + : preinterpret(pshiftleft<30>(padd(y_int,csti_1))); + sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit + + // Get the polynomial selection mask from the second bit of y_int + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); + Packet x2 = pmul(x,x); // Evaluate the cos(x) polynomial. 
(-Pi/4 <= x <= Pi/4) @@ -379,7 +432,7 @@ Packet psincos_float(const Packet& _x) : pselect(poly_mask,y1,y2); // Update the sign and filter huge inputs - return pselect(huge_mask, huge_vals, pxor(y, sign_bit)); + return pxor(y, sign_bit); } template diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 1415b3fc1..8fcb18a94 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -636,8 +636,41 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } #endif +/** \internal extract the bits of the float \a x */ +inline unsigned int as_uint(float x) +{ + unsigned int ret; + std::memcpy(&ret, &x, sizeof(float)); + return ret; +} + } // end namespace numext } // end namespace Eigen +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 +#include +namespace Eigen { +namespace numext { +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provides the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} +#endif + #endif // EIGEN_META_H From 9d988a1e1a83c51422d96030fdad7267e4e946ee Mon Sep 17 00:00:00 2001 From: Greg Coombe Date: Fri, 11 Jan 2019 23:14:35 -0800 Subject: [PATCH 123/295] Initialize isometric transforms like affine transforms. The isometric transform, like the affine transform, has an implicit last row of [0, 0, 0, 1]. This was not being properly initialized, as verified by a new test function. 
--- Eigen/src/Geometry/Transform.h | 4 +-- test/geo_transformations.cpp | 61 +++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 75991aaed..4429a9738 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -252,11 +252,11 @@ protected: public: /** Default constructor without initialization of the meaningful coefficients. - * If Mode==Affine, then the last row is set to [0 ... 0 1] */ + * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */ EIGEN_DEVICE_FUNC inline Transform() { check_template_params(); - internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix); + internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix); } EIGEN_DEVICE_FUNC inline Transform(const Transform& other) diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index bf920696b..25f1d9aa0 100755 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -612,6 +612,62 @@ template void transform_products() VERIFY_IS_APPROX((ac*p).matrix(), a_m*p_m); } +template void transformations_no_scale() +{ + /* this test covers the following files: + Cross.h Quaternion.h, Transform.h + */ + typedef Matrix Vector3; + typedef Matrix Vector4; + typedef Quaternion Quaternionx; + typedef AngleAxis AngleAxisx; + typedef Transform Transform3; + typedef Translation Translation3; + typedef Matrix Matrix4; + + Vector3 v0 = Vector3::Random(), + v1 = Vector3::Random(); + + Transform3 t0, t1, t2; + + Scalar a = internal::random(-Scalar(EIGEN_PI), Scalar(EIGEN_PI)); + + Quaternionx q1, q2; + + q1 = AngleAxisx(a, v0.normalized()); + + t0 = Transform3::Identity(); + VERIFY_IS_APPROX(t0.matrix(), Transform3::MatrixType::Identity()); + + t0.setIdentity(); + t1.setIdentity(); + v1 = Vector3::Ones(); + t0.linear() = 
q1.toRotationMatrix(); + t0.pretranslate(v0); + t1.linear() = q1.conjugate().toRotationMatrix(); + t1.translate(-v0); + + VERIFY((t0 * t1).matrix().isIdentity(test_precision())); + + t1.fromPositionOrientationScale(v0, q1, v1); + VERIFY_IS_APPROX(t1.matrix(), t0.matrix()); + VERIFY_IS_APPROX(t1*v1, t0*v1); + + // translation * vector + t0.setIdentity(); + t0.translate(v0); + VERIFY_IS_APPROX((t0 * v1).template head<3>(), Translation3(v0) * v1); + + // Conversion to matrix. + Transform3 t3; + t3.linear() = q1.toRotationMatrix(); + t3.translation() = v1; + Matrix4 m3 = t3.matrix(); + VERIFY((m3 * m3.inverse()).isIdentity(test_precision())); + // Verify implicit last row is initialized. + VERIFY_IS_APPROX(Vector4(m3.row(3)), Vector4(0.0, 0.0, 0.0, 1.0)); +} + EIGEN_DECLARE_TEST(geo_transformations) { for(int i = 0; i < g_repeat; i++) { @@ -625,7 +681,7 @@ EIGEN_DECLARE_TEST(geo_transformations) CALL_SUBTEST_3(( transformations() )); CALL_SUBTEST_3(( transformations() )); CALL_SUBTEST_3(( transform_alignment() )); - + CALL_SUBTEST_4(( transformations() )); CALL_SUBTEST_4(( non_projective_only() )); @@ -641,5 +697,8 @@ EIGEN_DECLARE_TEST(geo_transformations) CALL_SUBTEST_8(( transform_associativity(Rotation2D(internal::random()*double(EIGEN_PI))) )); CALL_SUBTEST_8(( transform_associativity(Quaterniond::UnitRandom()) )); + + CALL_SUBTEST_9(( transformations_no_scale() )); + CALL_SUBTEST_9(( transformations_no_scale() )); } } From d4881751d3afe3e7b7efcf16f91e7237bba3e664 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Jan 2019 16:38:26 +0100 Subject: [PATCH 124/295] Doc: add Isometry in the list of supported Mode of Transform<> --- Eigen/src/Geometry/Transform.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 4429a9738..3670767aa 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -97,6 +97,9 @@ template struct transform_make_affine; * - #AffineCompact: 
the transformation is stored as a (Dim)x(Dim+1) matrix. * - #Projective: the transformation is stored as a (Dim+1)^2 matrix * without any assumption. + * - #Isometry: same as #Affine with the additional assumption that + * the linear part represents a rotation. This assumption is exploited + * to speed up some functions such as inverse() and rotation(). * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor. * These Options are passed directly to the underlying matrix type. * From ccddeaad904e0a4a344912deedddc7d5c2dfb623 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Jan 2019 16:51:16 +0100 Subject: [PATCH 125/295] fix warning --- Eigen/src/Core/GenericPacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index bb3275fe8..04a321b9f 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -216,7 +216,7 @@ pandnot(const Packet& a, const Packet& b) { return a & (~b); } /** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} +ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} template EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { From 61b6eb05fe150909a88273d205f392e1ec3307ff Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Jan 2019 17:28:47 +0100 Subject: [PATCH 126/295] AVX512 (r)sqrt(double) was mistakenly disabled with clang and others --- Eigen/src/Core/arch/AVX512/PacketMath.h | 4 ++-- test/packetmath.cpp | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 1164f24b1..4832f2a3b 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -57,7 +57,7 @@ 
template<> struct packet_traits : default_packet_traits HasBlend = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, #endif @@ -77,7 +77,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 04f93108f..4906f6eb0 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -568,6 +568,7 @@ template void packetmath_real() h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isinf)(data2[0])); } + if(PacketTraits::HasSqrt) { packet_helper h; data1[0] = Scalar(-1.0f); From 3c9e6d206d6ba0fe5146a634d594469b1d76780f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Jan 2019 17:57:28 +0100 Subject: [PATCH 127/295] AVX512: fix pgather/pscatter for Packet4cd and unaligned pointers --- Eigen/src/Core/arch/AVX512/Complex.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index f2034a713..7bb2fd630 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -308,18 +308,18 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex< template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather, Packet4cd>(const std::complex* from, Index stride) { return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512( - _mm256_insertf128_pd(_mm256_castpd128_pd256(pload(from+0*stride).v), pload(from+1*stride).v,1)), - _mm256_insertf128_pd(_mm256_castpd128_pd256(pload(from+2*stride).v), pload(from+3*stride).v,1), 1)); + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+0*stride).v), 
ploadu(from+1*stride).v,1)), + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+2*stride).v), ploadu(from+3*stride).v,1), 1)); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cd>(std::complex* to, const Packet4cd& from, Index stride) { __m512i fromi = _mm512_castpd_si512(from.v); double* tod = (double*)(void*)to; - _mm_store_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); - _mm_store_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); - _mm_store_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); - _mm_store_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); + _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); + _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); + _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); + _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cd& a) From 250dcd1fdbd3c52a53e57cd8f5d5591fd1e61b56 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 14 Jan 2019 21:45:56 +0100 Subject: [PATCH 128/295] bug #1652: fix position of EIGEN_ALIGN16 attributes in Neon and Altivec --- Eigen/src/Core/arch/AltiVec/Complex.h | 12 ++++++------ Eigen/src/Core/arch/AltiVec/PacketMath.h | 18 +++++++++--------- Eigen/src/Core/arch/NEON/Complex.h | 4 ++-- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 3e665730c..23dd016f9 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex 
EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore((float *)&res, a.v); return res[0]; @@ -298,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -345,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 9464264a8..6b347cedb 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -324,7 +324,7 @@ pbroadcast4(const int *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = from[2*stride]; @@ 
-333,7 +333,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa } template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -342,7 +342,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f } template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -351,7 +351,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, co } template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; pstore((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -565,8 +565,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -985,14 +985,14 @@ pbroadcast4(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } 
template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -1064,7 +1064,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 306a309be..62d107645 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -146,7 +146,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 x[2]; + EIGEN_ALIGN16 std::complex x[2]; vst1q_f32((float *)x, a.v); return x[0]; } @@ -401,7 +401,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); return res; diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 8c3637258..a8a7b63c9 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -375,8 +375,8 @@ template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EI template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? 
-template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { EIGEN_ALIGN16 int32_t x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; From 2c5843dbbbcb5b925e06d75f58cc3bc09f19c3bb Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 14 Jan 2019 13:26:34 -0800 Subject: [PATCH 129/295] Update documentation. --- doc/CoeffwiseMathFunctionsTable.dox | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox index af1251226..e14eaf615 100644 --- a/doc/CoeffwiseMathFunctionsTable.dox +++ b/doc/CoeffwiseMathFunctionsTable.dox @@ -321,6 +321,43 @@ This also means that, unless specified, if the function \c std::foo is available + + + \anchor cwisetable_asinh + a.\link ArrayBase::asinh asinh\endlink(); \n + \link Eigen::asinh asinh\endlink(a); + + computes inverse hyperbolic sine + + using std::asinh; \n + asinh(a[i]); + + + + + \anchor cwisetable_acosh + a.\link ArrayBase::acosh acosh\endlink(); \n + \link Eigen::acosh acosh\endlink(a); + + computes inverse hyperbolic cosine + + using std::acosh; \n + acosh(a[i]); + + + + + \anchor cwisetable_atanh + a.\link ArrayBase::atanh atanh\endlink(); \n + \link Eigen::atanh atanh\endlink(a); + + computes inverse hyperbolic tangent + + using std::atanh; \n + atanh(a[i]); + + + Nearest integer floating point operations From e7d4d4f192fb77bf3bf4875b2e56dfbe7ca9b24a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 15 Jan 2019 10:51:03 +0100 Subject: [PATCH 130/295] cleanup --- test/main.h
| 7 +++++++ test/symbolic_index.cpp | 38 -------------------------------------- 2 files changed, 7 insertions(+), 38 deletions(-) diff --git a/test/main.h b/test/main.h index 36784b1f4..9c1465e9a 100644 --- a/test/main.h +++ b/test/main.h @@ -389,6 +389,13 @@ inline void verify_impl(bool condition, const char *testname, const char *file, namespace Eigen { +template +typename internal::enable_if::value,bool>::type +is_same_type(const T1&, const T2&) +{ + return true; +} + template inline typename NumTraits::Real test_precision() { return NumTraits::dummy_precision(); } template<> inline float test_precision() { return 1e-3f; } template<> inline double test_precision() { return 1e-6; } diff --git a/test/symbolic_index.cpp b/test/symbolic_index.cpp index ea73e99e9..b114cbb95 100644 --- a/test/symbolic_index.cpp +++ b/test/symbolic_index.cpp @@ -19,44 +19,6 @@ #include "main.h" -template -bool match(const T& xpr, std::string ref, std::string str_xpr = "") { - EIGEN_UNUSED_VARIABLE(str_xpr); - std::stringstream str; - str << xpr; - if(!(str.str() == ref)) - std::cout << str_xpr << "\n" << xpr << "\n\n"; - return str.str() == ref; -} - -#define MATCH(X,R) match(X, R, #X) - -template -typename internal::enable_if::value,bool>::type -is_same_fixed(const T1& a, const T2& b) -{ - return (Index(a) == Index(b)); -} - -template -bool is_same_seq(const T1& a, const T2& b) -{ - bool ok = a.first()==b.first() && a.size() == b.size() && Index(a.incrObject())==Index(b.incrObject());; - if(!ok) - { - std::cerr << "seqN(" << a.first() << ", " << a.size() << ", " << Index(a.incrObject()) << ") != "; - std::cerr << "seqN(" << b.first() << ", " << b.size() << ", " << Index(b.incrObject()) << ")\n"; - } - return ok; -} - -template -typename internal::enable_if::value,bool>::type -is_same_type(const T1&, const T2&) -{ - return true; -} - template bool is_same_symb(const T1& a, const T2& b, Index size) { From 6cf7afa3d93b9e51e2a56b2448999a3a71271d06 Mon Sep 17 00:00:00 2001 From: Gael 
Guennebaud Date: Tue, 15 Jan 2019 11:04:37 +0100 Subject: [PATCH 131/295] Typo --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 982b98b50..3a8001e8f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -568,7 +568,7 @@ || (EIGEN_COMP_CLANG>=305) \ || (EIGEN_COMP_ICC>=1500) \ || (EIGEN_COMP_PGI>=1500) \ - || (EIGEN_COMP_SUN>=0x5130)) + || (EIGEN_COMP_SUNCC>=0x5130)) #define EIGEN_HAS_ALIGNAS 1 #else #define EIGEN_HAS_ALIGNAS 0 From 32d7232aec1b5c78061548a00f0583ddd693e3e3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 15 Jan 2019 11:18:48 +0100 Subject: [PATCH 132/295] fix always true warning with gcc 4.7 --- test/numext.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/numext.cpp b/test/numext.cpp index 6307f5979..8c6447d40 100644 --- a/test/numext.cpp +++ b/test/numext.cpp @@ -12,6 +12,7 @@ template void check_abs() { typedef typename NumTraits::Real Real; + Real zero(0); if(NumTraits::IsSigned) VERIFY_IS_EQUAL(numext::abs(-T(1)), T(1)); @@ -26,9 +27,9 @@ void check_abs() { if(NumTraits::IsSigned) { VERIFY_IS_EQUAL(numext::abs(x), numext::abs(-x)); - VERIFY( numext::abs(-x) >= Real(0)); + VERIFY( numext::abs(-x) >= zero ); } - VERIFY( numext::abs(x) >= Real(0)); + VERIFY( numext::abs(x) >= zero ); VERIFY_IS_APPROX( numext::abs2(x), numext::abs2(numext::abs(x)) ); } } From f8bc5cb39e2814d171901e45c1d0ebfeaec49e65 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 15 Jan 2019 15:09:49 +0100 Subject: [PATCH 133/295] Fix detection of vector-at-time: use Rows/Cols instead of MaxRow/MaxCols. This fix VectorXd(n).middleCol(0,0).outerSize() which was equal to 1. 
--- Eigen/src/Core/DenseBase.h | 4 ++-- test/block.cpp | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index f8feefa27..65ec1f54b 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -150,8 +150,8 @@ template class DenseBase * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime */ - IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1, + IsVectorAtCompileTime = internal::traits::RowsAtCompileTime == 1 + || internal::traits::ColsAtCompileTime == 1, /**< This is set to true if either the number of rows or the number of * columns is known at compile-time to be equal to 1. Indeed, in that case, * we are dealing with a column-vector (if there is only one column) or with diff --git a/test/block.cpp b/test/block.cpp index 27b60d778..84124aba6 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -227,6 +227,16 @@ template void block(const MatrixType& m) VERIFY_IS_APPROX( (m1+m1).template subVector(c1), (m1+m1).col(c1) ); VERIFY_IS_EQUAL( m1.template subVectors(), m1.rows() ); VERIFY_IS_EQUAL( m1.template subVectors(), m1.cols() ); + + if (rows>=2 || cols>=2) { + VERIFY_IS_EQUAL( int(m1.middleCols(0,0).IsRowMajor), int(m1.IsRowMajor) ); + VERIFY_IS_EQUAL( m1.middleCols(0,0).outerSize(), m1.IsRowMajor ? rows : 0); + VERIFY_IS_EQUAL( m1.middleCols(0,0).innerSize(), m1.IsRowMajor ? 0 : rows); + + VERIFY_IS_EQUAL( int(m1.middleRows(0,0).IsRowMajor), int(m1.IsRowMajor) ); + VERIFY_IS_EQUAL( m1.middleRows(0,0).outerSize(), m1.IsRowMajor ? 0 : cols); + VERIFY_IS_EQUAL( m1.middleRows(0,0).innerSize(), m1.IsRowMajor ? 
cols : 0); + } } @@ -287,11 +297,14 @@ EIGEN_DECLARE_TEST(block) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( block(Matrix()) ); + CALL_SUBTEST_1( block(Matrix(internal::random(2,50))) ); + CALL_SUBTEST_1( block(Matrix(internal::random(2,50))) ); CALL_SUBTEST_2( block(Matrix4d()) ); - CALL_SUBTEST_3( block(MatrixXcf(3, 3)) ); - CALL_SUBTEST_4( block(MatrixXi(8, 12)) ); - CALL_SUBTEST_5( block(MatrixXcd(20, 20)) ); - CALL_SUBTEST_6( block(MatrixXf(20, 20)) ); + CALL_SUBTEST_3( block(MatrixXcf(internal::random(2,50), internal::random(2,50))) ); + CALL_SUBTEST_4( block(MatrixXi(internal::random(2,50), internal::random(2,50))) ); + CALL_SUBTEST_5( block(MatrixXcd(internal::random(2,50), internal::random(2,50))) ); + CALL_SUBTEST_6( block(MatrixXf(internal::random(2,50), internal::random(2,50))) ); + CALL_SUBTEST_7( block(Matrix(internal::random(2,50), internal::random(2,50))) ); CALL_SUBTEST_8( block(Matrix(3, 4)) ); From 027e44ed24f39697263263dfc7193d8fd9feeba8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 15 Jan 2019 15:13:24 +0100 Subject: [PATCH 134/295] bug #1592: makes partial min/max reductions trigger an assertion on inputs with a zero reduction length (+doc and tests) --- Eigen/src/Core/VectorwiseOp.h | 36 ++++++++++++++++++++++++++++++++--- test/vectorwiseop.cpp | 22 +++++++++++++++++++++ test/zerosized.cpp | 4 ++++ 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index a88b6e736..ea0a092a5 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -173,6 +173,14 @@ struct member_redux { * Example: \include MatrixBase_colwise_iterator_cxx11.cpp * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out * + * For a partial reduction on an empty input, some rules apply. + * For the sake of clarity, let's consider a vertical reduction: + * - If the number of columns is zero, then a 1x0 row-major vector expression is returned. 
+ * - Otherwise, if the number of rows is zero, then + * - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.) + * - a row vector of ones is returned for a product reduction (e.g., MatrixXd(n,0).colwise().prod()) + * - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op)) + * * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr */ template class VectorwiseOp @@ -294,13 +302,19 @@ template class VectorwiseOp * The template parameter \a BinaryOp is the type of the functor * of the custom redux operator. Note that func must be an associative operator. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise() */ template EIGEN_DEVICE_FUNC const typename ReduxReturnType::Type redux(const BinaryOp& func = BinaryOp()) const - { return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); + } typedef typename ReturnType::Type MinCoeffReturnType; typedef typename ReturnType::Type MaxCoeffReturnType; @@ -325,6 +339,9 @@ template class VectorwiseOp /** \returns a row (or column) vector expression of the smallest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. 
* * Example: \include PartialRedux_minCoeff.cpp @@ -333,11 +350,17 @@ template class VectorwiseOp * \sa DenseBase::minCoeff() */ EIGEN_DEVICE_FUNC const MinCoeffReturnType minCoeff() const - { return MinCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MinCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the largest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_maxCoeff.cpp @@ -346,7 +369,10 @@ template class VectorwiseOp * \sa DenseBase::maxCoeff() */ EIGEN_DEVICE_FUNC const MaxCoeffReturnType maxCoeff() const - { return MaxCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MaxCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the squared norm * of each column (or row) of the referenced expression. @@ -690,6 +716,10 @@ template class VectorwiseOp const HNormalizedReturnType hnormalized() const; protected: + Index redux_length() const + { + return Direction==Vertical ? 
m_matrix.rows() : m_matrix.cols(); + } ExpressionTypeNested m_matrix; }; diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 37dbcf970..4b9d2d570 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -134,6 +134,7 @@ template void vectorwiseop_matrix(const MatrixType& m) typedef Matrix RowVectorType; typedef Matrix RealColVectorType; typedef Matrix RealRowVectorType; + typedef Matrix MatrixX; Index rows = m.rows(); Index cols = m.cols(); @@ -247,6 +248,26 @@ template void vectorwiseop_matrix(const MatrixType& m) m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())); VERIFY_IS_APPROX( m1, m2 ); VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) ); + + // test empty expressions + VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().sum().eval(), MatrixX::Zero(rows,1)); + VERIFY_IS_APPROX(m1.matrix().middleRows(0,0).colwise().sum().eval(), MatrixX::Zero(1,cols)); + VERIFY_IS_APPROX(m1.matrix().middleCols(0,fix<0>).rowwise().sum().eval(), MatrixX::Zero(rows,1)); + VERIFY_IS_APPROX(m1.matrix().middleRows(0,fix<0>).colwise().sum().eval(), MatrixX::Zero(1,cols)); + + VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().prod().eval(), MatrixX::Ones(rows,1)); + VERIFY_IS_APPROX(m1.matrix().middleRows(0,0).colwise().prod().eval(), MatrixX::Ones(1,cols)); + VERIFY_IS_APPROX(m1.matrix().middleCols(0,fix<0>).rowwise().prod().eval(), MatrixX::Ones(rows,1)); + VERIFY_IS_APPROX(m1.matrix().middleRows(0,fix<0>).colwise().prod().eval(), MatrixX::Ones(1,cols)); + + VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().squaredNorm().eval(), MatrixX::Zero(rows,1)); + + VERIFY_RAISES_ASSERT(m1.real().middleCols(0,0).rowwise().minCoeff().eval()); + VERIFY_RAISES_ASSERT(m1.real().middleRows(0,0).colwise().maxCoeff().eval()); + VERIFY_IS_EQUAL(m1.real().middleRows(0,0).rowwise().maxCoeff().eval().rows(),0); + 
VERIFY_IS_EQUAL(m1.real().middleCols(0,0).colwise().maxCoeff().eval().cols(),0); + VERIFY_IS_EQUAL(m1.real().middleRows(0,fix<0>).rowwise().maxCoeff().eval().rows(),0); + VERIFY_IS_EQUAL(m1.real().middleCols(0,fix<0>).colwise().maxCoeff().eval().cols(),0); } EIGEN_DECLARE_TEST(vectorwiseop) @@ -256,6 +277,7 @@ EIGEN_DECLARE_TEST(vectorwiseop) CALL_SUBTEST_3( vectorwiseop_array(ArrayXXf(3, 4)) ); CALL_SUBTEST_4( vectorwiseop_matrix(Matrix4cf()) ); CALL_SUBTEST_5( vectorwiseop_matrix(Matrix4f()) ); + CALL_SUBTEST_5( vectorwiseop_matrix(Vector4f()) ); CALL_SUBTEST_5( vectorwiseop_matrix(Matrix()) ); CALL_SUBTEST_6( vectorwiseop_matrix(MatrixXd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_7( vectorwiseop_matrix(VectorXd(internal::random(1,EIGEN_TEST_MAX_SIZE))) ); diff --git a/test/zerosized.cpp b/test/zerosized.cpp index edd1f6925..6be136e25 100644 --- a/test/zerosized.cpp +++ b/test/zerosized.cpp @@ -16,9 +16,13 @@ template void zeroReduction(const MatrixType& m) { VERIFY(!m.any()); VERIFY(m.prod()==1); VERIFY(m.sum()==0); + VERIFY(m.norm()==0); + VERIFY(m.squaredNorm()==0); VERIFY(m.count()==0); VERIFY(m.allFinite()); VERIFY(!m.hasNaN()); + VERIFY_RAISES_ASSERT( m.minCoeff() ); + VERIFY_RAISES_ASSERT( m.maxCoeff() ); } From 6ec6bf0b0d405ec8c597368d089a292d12f9b39e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 15 Jan 2019 15:21:14 +0100 Subject: [PATCH 135/295] Enable visitor on empty matrices (the visitor is left unchanged), and protect min/maxCoeff(Index*,Index*) on empty matrices by an assertion (+ doc & unit tests) --- Eigen/src/Core/Redux.h | 4 ++++ Eigen/src/Core/Visitor.h | 33 +++++++++++++++++++++++++++++++++ test/zerosized.cpp | 5 +++++ 3 files changed, 42 insertions(+) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 720b6030c..e231a7d7d 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -397,6 +397,8 @@ public: * The template parameter \a BinaryOp is the 
type of the functor \a func which must be * an associative operator. Both current C++98 and C++11 functor styles are handled. * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() */ template @@ -415,6 +417,7 @@ DenseBase::redux(const Func& func) const } /** \returns the minimum of all coefficients of \c *this. + * \warning the matrix must be not empty, otherwise an assertion is triggered. * \warning the result is undefined if \c *this contains NaN. */ template @@ -425,6 +428,7 @@ DenseBase::minCoeff() const } /** \returns the maximum of all coefficients of \c *this. + * \warning the matrix must be not empty, otherwise an assertion is triggered. * \warning the result is undefined if \c *this contains NaN. */ template diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 54c1883d9..b11d87e9f 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -40,6 +40,14 @@ struct visitor_impl } }; +// This specialization enables visitors on empty matrices at compile-time +template +struct visitor_impl { + EIGEN_DEVICE_FUNC + static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/) + {} +}; + template struct visitor_impl { @@ -98,6 +106,8 @@ protected: * * \note compared to one or two \em for \em loops, visitors offer automatic * unrolling for small fixed size matrix. + * + * \note if the matrix is empty, then the visitor is left unchanged. 
* * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux() */ @@ -106,6 +116,9 @@ template EIGEN_DEVICE_FUNC void DenseBase::visit(Visitor& visitor) const { + if(size()==0) + return; + typedef typename internal::visitor_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); @@ -196,6 +209,9 @@ struct functor_traits > { /** \fn DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const * \returns the minimum of all coefficients of *this and puts in *row and *col its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() @@ -206,6 +222,8 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *rowId = minVisitor.row; @@ -214,6 +232,9 @@ DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const } /** \returns the minimum of all coefficients of *this and puts in *index its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. 
* * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() @@ -224,6 +245,8 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) internal::min_coeff_visitor minVisitor; this->visit(minVisitor); @@ -233,6 +256,9 @@ DenseBase::minCoeff(IndexType* index) const /** \fn DenseBase::maxCoeff(IndexType* rowId, IndexType* colId) const * \returns the maximum of all coefficients of *this and puts in *row and *col its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff() @@ -243,6 +269,8 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *rowPtr = maxVisitor.row; @@ -251,6 +279,9 @@ DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const } /** \returns the maximum of all coefficients of *this and puts in *index its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. 
* * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff() @@ -261,6 +292,8 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); diff --git a/test/zerosized.cpp b/test/zerosized.cpp index 6be136e25..07afd0f86 100644 --- a/test/zerosized.cpp +++ b/test/zerosized.cpp @@ -23,6 +23,11 @@ template void zeroReduction(const MatrixType& m) { VERIFY(!m.hasNaN()); VERIFY_RAISES_ASSERT( m.minCoeff() ); VERIFY_RAISES_ASSERT( m.maxCoeff() ); + Index i,j; + VERIFY_RAISES_ASSERT( m.minCoeff(&i,&j) ); + VERIFY_RAISES_ASSERT( m.maxCoeff(&i,&j) ); + VERIFY_RAISES_ASSERT( m.reshaped().minCoeff(&i) ); + VERIFY_RAISES_ASSERT( m.reshaped().maxCoeff(&i) ); } From 2c2c114995a1783883a882b83343a0533d2ebaf5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 15 Jan 2019 16:53:15 +0100 Subject: [PATCH 136/295] Silent maybe-uninitialized warnings by gcc --- Eigen/src/Core/Visitor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index b11d87e9f..f67d83bd1 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -137,6 +137,8 @@ namespace internal { template struct coeff_visitor { + // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc + coeff_visitor() : row(-1), col(-1), res(0) {} typedef typename Derived::Scalar Scalar; Index row, col; Scalar res; From 2b70b2f5708fdedf24c5d47768c2b24019c48311 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 15 Jan 2019 22:50:42 +0100 Subject: [PATCH 137/295] Make Transform::rotation() an alias to Transform::linear() in the case of an Isometry --- Eigen/src/Geometry/Transform.h | 35 +++++++++++++++++++++++++++++----- 
test/geo_transformations.cpp | 4 ++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 3670767aa..3090351a0 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -605,7 +605,9 @@ public: template EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase& r) const; - EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const; + typedef typename internal::conditional::type RotationReturnType; + EIGEN_DEVICE_FUNC RotationReturnType rotation() const; + template EIGEN_DEVICE_FUNC void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const; @@ -1049,20 +1051,43 @@ EIGEN_DEVICE_FUNC inline Transform Transform struct transform_rotation_impl { + template + EIGEN_DEVICE_FUNC static inline + const typename TransformType::LinearMatrixType run(const TransformType& t) + { + typedef typename TransformType::LinearMatrixType LinearMatrixType; + LinearMatrixType result; + t.computeRotationScaling(&result, (LinearMatrixType*)0); + return result; + } +}; +template<> struct transform_rotation_impl { + template + EIGEN_DEVICE_FUNC static inline + typename TransformType::ConstLinearPart run(const TransformType& t) + { + return t.linear(); + } +}; +} /** \returns the rotation part of the transformation * + * If Mode==Isometry, then this method is an alias for linear(), + * otherwise it calls computeRotationScaling() to extract the rotation + * through a SVD decomposition. 
* * \svd_module * * \sa computeRotationScaling(), computeScalingRotation(), class SVD */ template -EIGEN_DEVICE_FUNC const typename Transform::LinearMatrixType +EIGEN_DEVICE_FUNC +typename Transform::RotationReturnType Transform::rotation() const { - LinearMatrixType result; - computeRotationScaling(&result, (LinearMatrixType*)0); - return result; + return internal::transform_rotation_impl::run(*this); } diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index 25f1d9aa0..c72267955 100755 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -666,6 +666,10 @@ template void transformations_no_scale() VERIFY((m3 * m3.inverse()).isIdentity(test_precision())); // Verify implicit last row is initialized. VERIFY_IS_APPROX(Vector4(m3.row(3)), Vector4(0.0, 0.0, 0.0, 1.0)); + + VERIFY_IS_APPROX(t3.rotation(), t3.linear()); + if(Mode==Isometry) + VERIFY(t3.rotation().data()==t3.linear().data()); } EIGEN_DECLARE_TEST(geo_transformations) From dbfcceabf50db9c1dc6d82863aa9670a1b53c0a4 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 16 Jan 2019 12:51:36 +0800 Subject: [PATCH 138/295] Bug: 1633: refactor gebp kernel and optimize for neon --- .../Core/products/GeneralBlockPanelKernel.h | 441 +++++++++++------- 1 file changed, 277 insertions(+), 164 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d6dd9dc17..bfc7d1979 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -347,6 +347,14 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif +template +struct RhsPanelHelper { + private: + typedef typename conditional<(registers_taken < 15), RhsPacket, RhsPacketx4>::type inter_type; + public: + typedef typename conditional<(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS < 32), RhsPacket, 
inter_type>::type type; +}; + /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -404,29 +412,42 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); @@ -444,8 +465,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const FixedInt&) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. 
Unfortunately if we @@ -460,6 +481,13 @@ public: #endif } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -511,6 +539,14 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -523,6 +559,20 @@ public: { dest = pset1(*b); } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { @@ -554,18 +604,8 @@ public: dest = ploadu(a); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const 
FixedInt&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -586,6 +626,13 @@ public: c += a * b; } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } + template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { @@ -708,6 +755,14 @@ public: typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; + + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -730,6 +785,31 @@ public: dest.first = pset1(real(*b)); dest.second = pset1(imag(*b)); } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); + } + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const + { + loadRhs(b, dest); + } + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { @@ -739,31 +819,6 @@ public: { loadQuadToDoublePacket(b,dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - // FIXME not sure that's the best way to implement it! 
- loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); - } - - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) - { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) - { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - } // nothing special here EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -777,17 +832,25 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const FixedInt&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { c = cj.pmadd(a,b,c); } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } @@ -860,6 +923,14 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& 
get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -872,18 +943,20 @@ public: { dest = pset1(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1(b+0); -// b1 = pload1(b+1); -// } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -901,8 +974,8 @@ public: dest = ploaddup(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const FixedInt&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -924,6 +997,13 @@ public: c += a * b; } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } + template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { @@ -932,7 +1012,7 @@ public: } protected: - + }; @@ -944,27 +1024,54 @@ struct gebp_traits { typedef float RhsPacket; - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, 
RhsPacket& b2, RhsPacket& b3) - { - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); - } + typedef float32x4_t RhsPacketx4; EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1q_f32(b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b,dest); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { - c = vfmaq_n_f32(c, a, b); + c += a * b; + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<1>&) const + { + c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<2>&) const + { + c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<3>&) const + { + c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); } }; @@ -986,6 +1093,9 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper::type 
RhsPanel15; typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; @@ -1075,7 +1185,7 @@ struct last_row_process_16_packets); blB += SwappedTraits::LhsProgress/4; blA += 1; } @@ -1166,36 +1276,39 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1215,7 +1328,7 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1397,7 +1510,7 @@ void gebp_kernel=6 without FMA (bug 1637) @@ -1406,24 +1519,24 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) + 
internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1443,7 +1556,7 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1596,19 +1709,19 @@ void gebp_kernel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ } while(false) internal::prefetch(blB+(48+0)); @@ -1630,7 +1743,7 @@ void gebp_kernel); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ } while(false); EIGEN_GEBGP_ONESTEP(0); @@ -1763,15 +1876,15 @@ void gebp_kernel); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1784,7 +1897,7 @@ void gebp_kernel); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1808,7 +1921,7 @@ void gebp_kernel); straits.acc(c0, alphav, R); } else From 0b466b69336497628de8216ff797369b009a2946 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 13:50:13 +0100 Subject: [PATCH 139/295] bug #1633: use proper type for madd temporaries, factorize RhsPacketx4. 
--- .../Core/products/GeneralBlockPanelKernel.h | 113 ++++++++---------- 1 file changed, 48 insertions(+), 65 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bfc7d1979..04b7bfa7e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -355,6 +355,16 @@ struct RhsPanelHelper { typedef typename conditional<(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS < 32), RhsPacket, inter_type>::type type; }; +template +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; + /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -412,14 +422,7 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - const RhsPacket& get(const FixedInt<1>&) const { return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; - + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -465,8 +468,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. 
Unfortunately if we @@ -481,10 +484,9 @@ public: #endif } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -539,13 +541,7 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - const RhsPacket& get(const FixedInt<1>&) const { return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; @@ -604,8 +600,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -626,10 +622,9 @@ public: c += a * b; } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -756,13 +751,8 @@ public: typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - const RhsPacket& get(const FixedInt<1>&) const { 
return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; + // this actualy holds 8 packets! + typedef QuadPacket RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -807,9 +797,7 @@ public: loadRhs(b, dest); } - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const - { - } + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { @@ -832,23 +820,22 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -922,15 +909,7 @@ public: typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; - - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - 
const RhsPacket& get(const FixedInt<1>&) const { return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; - + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -974,8 +953,8 @@ public: dest = ploaddup(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -997,10 +976,9 @@ public: c += a * b; } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -1054,22 +1032,22 @@ struct gebp_traits c += a * b; } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<0>&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<1>&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const { c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<2>&) const + EIGEN_STRONG_INLINE void madd(const 
LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const { c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<3>&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const { c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); } @@ -1277,7 +1255,8 @@ void gebp_kernel=6 without FMA (bug 1637) @@ -1556,7 +1536,8 @@ void gebp_kernel Date: Wed, 16 Jan 2019 14:33:45 +0100 Subject: [PATCH 140/295] bug #1646: disable aliasing detection for empty and 1x1 expression --- Eigen/src/Core/Transpose.h | 3 ++- test/adjoint.cpp | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index d7c204579..6255a2f6d 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -392,7 +392,8 @@ struct checkTransposeAliasing_impl template void check_for_aliasing(const Dst &dst, const Src &src) { - internal::checkTransposeAliasing_impl::run(dst, src); + if(src.size()>1) + internal::checkTransposeAliasing_impl::run(dst, src); } } // end namespace internal diff --git a/test/adjoint.cpp b/test/adjoint.cpp index 4e1e4b5e8..7c8081ec4 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -171,6 +171,17 @@ void adjoint_extra() c = MatrixXd::Ones(10,10) * 1.0 + c; c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) ); c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10); + + // regression for bug 1646 + for (int j = 0; j < 10; ++j) { + c.col(j).head(j) = c.row(j).head(j); + } + + a.conservativeResize(1,1); + a = a.transpose(); + + a.conservativeResize(0,0); + a = a.transpose(); } EIGEN_DECLARE_TEST(adjoint) From aeffdf909eb79dace9f12d0c38583f0dad106f54 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 15:24:59 +0100 Subject: [PATCH 
141/295] bug #1617: add unit tests for empty triangular solve. --- test/product_trsolve.cpp | 13 +++++++++++++ test/sparse_solvers.cpp | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/test/product_trsolve.cpp b/test/product_trsolve.cpp index 0c22cccf6..c927cb635 100644 --- a/test/product_trsolve.cpp +++ b/test/product_trsolve.cpp @@ -71,6 +71,19 @@ template void trsolve(int size=Size,int cols int c = internal::random(0,cols-1); VERIFY_TRSM(rmLhs.template triangularView(), rmRhs.col(c)); VERIFY_TRSM(cmLhs.template triangularView(), rmRhs.col(c)); + + if(Size==Dynamic) + { + cmLhs.resize(0,0); + cmRhs.resize(0,cmRhs.cols()); + Matrix res = cmLhs.template triangularView().solve(cmRhs); + VERIFY_IS_EQUAL(res.rows(),0); + VERIFY_IS_EQUAL(res.cols(),cmRhs.cols()); + res = cmRhs; + cmLhs.template triangularView().solveInPlace(res); + VERIFY_IS_EQUAL(res.rows(),0); + VERIFY_IS_EQUAL(res.cols(),cmRhs.cols()); + } } EIGEN_DECLARE_TEST(product_trsolve) diff --git a/test/sparse_solvers.cpp b/test/sparse_solvers.cpp index aaf3d39c9..3b7cd7788 100644 --- a/test/sparse_solvers.cpp +++ b/test/sparse_solvers.cpp @@ -98,6 +98,19 @@ template void sparse_solvers(int rows, int cols) initSparse(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords); VERIFY_IS_APPROX(refMat2.template triangularView().solve(vec2), m2.template triangularView().solve(vec3)); + + // test empty triangular matrix + { + m2.resize(0,0); + refMatB.resize(0,refMatB.cols()); + DenseMatrix res = m2.template triangularView().solve(refMatB); + VERIFY_IS_EQUAL(res.rows(),0); + VERIFY_IS_EQUAL(res.cols(),refMatB.cols()); + res = refMatB; + m2.template triangularView().solveInPlace(res); + VERIFY_IS_EQUAL(res.rows(),0); + VERIFY_IS_EQUAL(res.cols(),refMatB.cols()); + } } } From c8e40edac9912a76904f5d302ea805bf53957123 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 16:27:00 +0100 Subject: [PATCH 142/295] Remove Eigen2ToEigen3 migration page 
(obsolete since 3.3) --- doc/A05_PortingFrom2To3.dox | 299 ------------------------------------ doc/Manual.dox | 1 - test/adjoint.cpp | 4 + 3 files changed, 4 insertions(+), 300 deletions(-) delete mode 100644 doc/A05_PortingFrom2To3.dox diff --git a/doc/A05_PortingFrom2To3.dox b/doc/A05_PortingFrom2To3.dox deleted file mode 100644 index 51555f996..000000000 --- a/doc/A05_PortingFrom2To3.dox +++ /dev/null @@ -1,299 +0,0 @@ -namespace Eigen { - -/** \page Eigen2ToEigen3 Porting from Eigen2 to Eigen3 - -This page lists the most important API changes between Eigen2 and Eigen3, -and gives tips to help porting your application from Eigen2 to Eigen3. - -\eigenAutoToc - -\section CompatibilitySupport Eigen2 compatibility support - -Up to version 3.2 %Eigen provides Eigen2 support modes. These are removed now, because they were barely used anymore and became hard to maintain after internal re-designs. -You can still use them by first porting your code to Eigen 3.2. - -\section Using The USING_PART_OF_NAMESPACE_EIGEN macro - -The USING_PART_OF_NAMESPACE_EIGEN macro has been removed. In Eigen 3, just do: -\code -using namespace Eigen; -\endcode - -\section ComplexDot Dot products over complex numbers - -This is the single trickiest change between Eigen 2 and Eigen 3. It only affects code using \c std::complex numbers as scalar type. - -Eigen 2's dot product was linear in the first variable. Eigen 3's dot product is linear in the second variable. In other words, the Eigen 2 code \code x.dot(y) \endcode is equivalent to the Eigen 3 code \code y.dot(x) \endcode In yet other words, dot products are complex-conjugated in Eigen 3 compared to Eigen 2. The switch to the new convention was commanded by common usage, especially with the notation \f$ x^Ty \f$ for dot products of column-vectors. - -\section VectorBlocks Vector blocks - - - - -
Eigen 2Eigen 3
\code -vector.start(length) -vector.start() -vector.end(length) -vector.end() -\endcode\code -vector.head(length) -vector.head() -vector.tail(length) -vector.tail() -\endcode
- - -\section Corners Matrix Corners - - - - - -
Eigen 2Eigen 3
\code -matrix.corner(TopLeft,r,c) -matrix.corner(TopRight,r,c) -matrix.corner(BottomLeft,r,c) -matrix.corner(BottomRight,r,c) -matrix.corner(TopLeft) -matrix.corner(TopRight) -matrix.corner(BottomLeft) -matrix.corner(BottomRight) -\endcode\code -matrix.topLeftCorner(r,c) -matrix.topRightCorner(r,c) -matrix.bottomLeftCorner(r,c) -matrix.bottomRightCorner(r,c) -matrix.topLeftCorner() -matrix.topRightCorner() -matrix.bottomLeftCorner() -matrix.bottomRightCorner() -\endcode
- -Notice that Eigen3 also provides these new convenience methods: topRows(), bottomRows(), leftCols(), rightCols(). See in class DenseBase. - -\section CoefficientWiseOperations Coefficient wise operations - -In Eigen2, coefficient wise operations which have no proper mathematical definition (as a coefficient wise product) -were achieved using the .cwise() prefix, e.g.: -\code a.cwise() * b \endcode -In Eigen3 this .cwise() prefix has been superseded by a new kind of matrix type called -Array for which all operations are performed coefficient wise. You can easily view a matrix as an array and vice versa using -the MatrixBase::array() and ArrayBase::matrix() functions respectively. Here is an example: -\code -Vector4f a, b, c; -c = a.array() * b.array(); -\endcode -Note that the .array() function is not at all a synonym of the deprecated .cwise() prefix. -While the .cwise() prefix changed the behavior of the following operator, the array() function performs -a permanent conversion to the array world. Therefore, for binary operations such as the coefficient wise product, -both sides must be converted to an \em array as in the above example. On the other hand, when you -concatenate multiple coefficient wise operations you only have to do the conversion once, e.g.: -\code -Vector4f a, b, c; -c = a.array().abs().pow(3) * b.array().abs().sin(); -\endcode -With Eigen2 you would have written: -\code -c = (a.cwise().abs().cwise().pow(3)).cwise() * (b.cwise().abs().cwise().sin()); -\endcode - -\section PartAndExtract Triangular and self-adjoint matrices - -In Eigen 2 you had to play with the part, extract, and marked functions to deal with triangular and selfadjoint matrices. In Eigen 3, all these functions have been removed in favor of the concept of \em views: - - - - - - - - - - - - - - - -
Eigen 2Eigen 3
\code -A.part(); -A.part(); \endcode\code -A.triangularView() -A.triangularView()\endcode
\code -A.extract(); -A.extract();\endcode\code -A.triangularView() -A.triangularView()\endcode
\code -A.marked(); -A.marked();\endcode\code -A.triangularView() -A.triangularView()\endcode
\code -A.part(); -A.extract();\endcode\code -A.selfadjointView() -A.selfadjointView()\endcode
\code -UpperTriangular -LowerTriangular -UnitUpperTriangular -UnitLowerTriangular -StrictlyUpperTriangular -StrictlyLowerTriangular -\endcode\code -Upper -Lower -UnitUpper -UnitLower -StrictlyUpper -StrictlyLower -\endcode
- -\sa class TriangularView, class SelfAdjointView - -\section TriangularSolveInPlace Triangular in-place solving - - - - -
Eigen 2Eigen 3
\code A.triangularSolveInPlace(Y);\endcode\code A.triangularView().solveInPlace(Y);\endcode
- - -\section Decompositions Matrix decompositions - -Some of Eigen 2's matrix decompositions have been renamed in Eigen 3, while some others have been removed and are replaced by other decompositions in Eigen 3. - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Eigen 2Eigen 3Notes
LUFullPivLUSee also the new PartialPivLU, it's much faster
QRHouseholderQRSee also the new ColPivHouseholderQR, it's more reliable
SVDJacobiSVDWe currently don't have a bidiagonalizing SVD; of course this is planned.
EigenSolver and friends\code #include \endcode Moved to separate module
- -\section LinearSolvers Linear solvers - - - - - - - - - - - - - - - - - - -
Eigen 2Eigen 3Notes
\code A.lu();\endcode\code A.fullPivLu();\endcodeNow A.lu() returns a PartialPivLU
\code A.lu().solve(B,&X);\endcode\code X = A.lu().solve(B); - X = A.fullPivLu().solve(B);\endcodeThe returned by value is fully optimized
\code A.llt().solve(B,&X);\endcode\code X = A.llt().solve(B); - X = A.selfadjointView.llt().solve(B); - X = A.selfadjointView.llt().solve(B);\endcodeThe returned by value is fully optimized and \n -the selfadjointView API allows you to select the \n -triangular part to work on (default is lower part)
\code A.llt().solveInPlace(B);\endcode\code B = A.llt().solve(B); - B = A.selfadjointView.llt().solve(B); - B = A.selfadjointView.llt().solve(B);\endcodeIn place solving
\code A.ldlt().solve(B,&X);\endcode\code X = A.ldlt().solve(B); - X = A.selfadjointView.ldlt().solve(B); - X = A.selfadjointView.ldlt().solve(B);\endcodeThe returned by value is fully optimized and \n -the selfadjointView API allows you to select the \n -triangular part to work on
- -\section GeometryModule Changes in the Geometry module - -The Geometry module is the one that changed the most. If you rely heavily on it, it's probably a good idea to use the "Eigen 2 support modes" to perform your migration. - -\section Transform The Transform class - -In Eigen 2, the Transform class didn't really know whether it was a projective or affine transformation. In Eigen 3, it takes a new \a Mode template parameter, which indicates whether it's \a Projective or \a Affine transform. There is no default value. - -The Transform3f (etc) typedefs are no more. In Eigen 3, the Transform typedefs explicitly refer to the \a Projective and \a Affine modes: - - - - - - - - -
Eigen 2Eigen 3Notes
Transform3f Affine3f or Projective3f Of course 3f is just an example here
- - -\section LazyVsNoalias Lazy evaluation and noalias - -In Eigen all operations are performed in a lazy fashion except the matrix products which are always evaluated into a temporary by default. -In Eigen2, lazy evaluation could be enforced by tagging a product using the .lazy() function. However, in complex expressions it was not -easy to determine where to put the lazy() function. In Eigen3, the lazy() feature has been superseded by the MatrixBase::noalias() function -which can be used on the left hand side of an assignment when no aliasing can occur. Here is an example: -\code -MatrixXf a, b, c; -... -c.noalias() += 2 * a.transpose() * b; -\endcode -However, the noalias mechanism does not cover all the features of the old .lazy(). Indeed, in some extremely rare cases, -it might be useful to explicit request for a lay product, i.e., for a product which will be evaluated one coefficient at once, on request, -just like any other expressions. To this end you can use the MatrixBase::lazyProduct() function, however we strongly discourage you to -use it unless you are sure of what you are doing, i.e., you have rigourosly measured a speed improvement. - -\section AlignMacros Alignment-related macros - -The EIGEN_ALIGN_128 macro has been renamed to EIGEN_ALIGN16. Don't be surprised, it's just that we switched to counting in bytes ;-) - -The \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN \endlink option still exists in Eigen 3, but it has a new cousin: \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN_STATICALLY.\endlink It allows to get rid of all static alignment issues while keeping alignment of dynamic-size heap-allocated arrays. Vectorization of statically allocated arrays is still preserved (unless you define \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink =0), at the cost of unaligned memory stores. 
- -\section AlignedMap Aligned Map objects - -A common issue with Eigen 2 was that when mapping an array with Map, there was no way to tell Eigen that your array was aligned. There was a ForceAligned option but it didn't mean that; it was just confusing and has been removed. - -New in Eigen3 is the #Aligned option. See the documentation of class Map. Use it like this: -\code -Map myMappedVector(some_aligned_array); -\endcode -There also are related convenience static methods, which actually are the preferred way as they take care of such things as constness: -\code -result = Vector4f::MapAligned(some_aligned_array); -\endcode - -\section StdContainers STL Containers - -In Eigen2, \#include\ tweaked std::vector to automatically align elements. The problem was that that was quite invasive. In Eigen3, we only override standard behavior if you use Eigen::aligned_allocator as your allocator type. So for example, if you use std::vector, you need to do the following change (note that aligned_allocator is under namespace Eigen): - - - - - - - -
Eigen 2Eigen 3
\code std::vector \endcode \code std::vector > \endcode
- -\section eiPrefix Internal ei_ prefix - -In Eigen2, global internal functions and structures were prefixed by \c ei_. In Eigen3, they all have been moved into the more explicit \c internal namespace. So, e.g., \c ei_sqrt(x) now becomes \c internal::sqrt(x). Of course it is not recommended to rely on Eigen's internal features. - - - -*/ - -} diff --git a/doc/Manual.dox b/doc/Manual.dox index 194164e97..84f0db645 100644 --- a/doc/Manual.dox +++ b/doc/Manual.dox @@ -15,7 +15,6 @@ namespace Eigen { /** \page UserManual_Generalities General topics - - \subpage Eigen2ToEigen3 - \subpage TopicFunctionTakingEigenTypes - \subpage TopicPreprocessorDirectives - \subpage TopicAssertions diff --git a/test/adjoint.cpp b/test/adjoint.cpp index 7c8081ec4..e2bfa6d7d 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -177,6 +177,10 @@ void adjoint_extra() c.col(j).head(j) = c.row(j).head(j); } + for (int j = 0; j < 10; ++j) { + c.col(j) = c.row(j); + } + a.conservativeResize(1,1); a = a.transpose(); From 729d1291c229da60cd7f96c71a734994e7cf6f27 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 16:28:17 +0100 Subject: [PATCH 143/295] bug #1585: update doc on lazy-evaluation --- doc/TopicLazyEvaluation.dox | 76 ++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox index b7820e3e6..d2a704f13 100644 --- a/doc/TopicLazyEvaluation.dox +++ b/doc/TopicLazyEvaluation.dox @@ -2,63 +2,95 @@ namespace Eigen { /** \page TopicLazyEvaluation Lazy Evaluation and Aliasing -Executive summary: Eigen has intelligent compile-time mechanisms to enable lazy evaluation and removing temporaries where appropriate. +Executive summary: %Eigen has intelligent compile-time mechanisms to enable lazy evaluation and removing temporaries where appropriate. It will handle aliasing automatically in most cases, for example with matrix products. 
The automatic behavior can be overridden manually by using the MatrixBase::eval() and MatrixBase::noalias() methods. When you write a line of code involving a complex expression such as -\code mat1 = mat2 + mat3 * (mat4 + mat5); \endcode +\code mat1 = mat2 + mat3 * (mat4 + mat5); +\endcode -Eigen determines automatically, for each sub-expression, whether to evaluate it into a temporary variable. Indeed, in certain cases it is better to evaluate immediately a sub-expression into a temporary variable, while in other cases it is better to avoid that. +%Eigen determines automatically, for each sub-expression, whether to evaluate it into a temporary variable. Indeed, in certain cases it is better to evaluate a sub-expression into a temporary variable, while in other cases it is better to avoid that. A traditional math library without expression templates always evaluates all sub-expressions into temporaries. So with this code, -\code vec1 = vec2 + vec3; \endcode +\code vec1 = vec2 + vec3; +\endcode a traditional library would evaluate \c vec2 + vec3 into a temporary \c vec4 and then copy \c vec4 into \c vec1. This is of course inefficient: the arrays are traversed twice, so there are a lot of useless load/store operations. -Expression-templates-based libraries can avoid evaluating sub-expressions into temporaries, which in many cases results in large speed improvements. This is called lazy evaluation as an expression is getting evaluated as late as possible, instead of immediately. However, most other expression-templates-based libraries always choose lazy evaluation. There are two problems with that: first, lazy evaluation is not always a good choice for performance; second, lazy evaluation can be very dangerous, for example with matrix products: doing matrix = matrix*matrix gives a wrong result if the matrix product is lazy-evaluated, because of the way matrix product works. 
+Expression-templates-based libraries can avoid evaluating sub-expressions into temporaries, which in many cases results in large speed improvements. +This is called lazy evaluation as an expression is getting evaluated as late as possible. +In %Eigen all expressions are lazy-evaluated. +More precisely, an expression starts to be evaluated once it is assigned to a matrix. +Until then nothing happens beyond constructing the abstract expression tree. +In contrast to most other expression-templates-based libraries, however, %Eigen might choose to evaluate some sub-expressions into temporaries. +There are two reasons for that: first, pure lazy evaluation is not always a good choice for performance; second, pure lazy evaluation can be very dangerous, for example with matrix products: doing mat = mat*mat gives a wrong result if the matrix product is directly evaluated within the destination matrix, because of the way matrix product works. -For these reasons, Eigen has intelligent compile-time mechanisms to determine automatically when to use lazy evaluation, and when on the contrary it should evaluate immediately into a temporary variable. +For these reasons, %Eigen has intelligent compile-time mechanisms to determine automatically which sub-expression should be evaluated into a temporary variable. So in the basic example, -\code matrix1 = matrix2 + matrix3; \endcode +\code mat1 = mat2 + mat3; +\endcode -Eigen chooses lazy evaluation. Thus the arrays are traversed only once, producing optimized code. If you really want to force immediate evaluation, use \link MatrixBase::eval() eval()\endlink: +%Eigen chooses not to introduce any temporary. Thus the arrays are traversed only once, producing optimized code. 
+If you really want to force immediate evaluation, use \link MatrixBase::eval() eval()\endlink: -\code matrix1 = (matrix2 + matrix3).eval(); \endcode +\code mat1 = (mat2 + mat3).eval(); +\endcode Here is now a more involved example: -\code matrix1 = -matrix2 + matrix3 + 5 * matrix4; \endcode +\code mat1 = -mat2 + mat3 + 5 * mat4; +\endcode -Eigen chooses lazy evaluation at every stage in that example, which is clearly the correct choice. In fact, lazy evaluation is the "default choice" and Eigen will choose it except in a few circumstances. +Here again %Eigen won't introduce any temporary, thus producing a single fused evaluation loop, which is clearly the correct choice. -The first circumstance in which Eigen chooses immediate evaluation, is when it sees an assignment a = b; and the expression \c b has the evaluate-before-assigning \link flags flag\endlink. The most important example of such an expression is the \link Product matrix product expression\endlink. For example, when you do +\section TopicLazyEvaluationWhichExpr Which sub-expressions are evaluated into temporaries? -\code matrix = matrix * matrix; \endcode +The default evaluation strategy is to fuse the operations in a single loop, and %Eigen will choose it except in a few circumstances. -Eigen first evaluates matrix * matrix into a temporary matrix, and then copies it into the original \c matrix. This guarantees a correct result as we saw above that lazy evaluation gives wrong results with matrix products. It also doesn't cost much, as the cost of the matrix product itself is much higher. +The first circumstance in which %Eigen chooses to evaluate a sub-expression is when it sees an assignment a = b; and the expression \c b has the evaluate-before-assigning \link flags flag\endlink. +The most important example of such an expression is the \link Product matrix product expression\endlink. 
For example, when you do + +\code mat = mat * mat; +\endcode + +%Eigen will evaluate mat * mat into a temporary matrix, and then copy it into the original \c mat. +This guarantees a correct result as we saw above that lazy evaluation gives wrong results with matrix products. +It also doesn't cost much, as the cost of the matrix product itself is much higher. +Note that this temporary is introduced at evaluation time only, that is, within operator= in this example. +The expression mat * mat still returns an abstract product type. What if you know that the result does no alias the operand of the product and want to force lazy evaluation? Then use \link MatrixBase::noalias() .noalias()\endlink instead. Here is an example: -\code matrix1.noalias() = matrix2 * matrix2; \endcode +\code mat1.noalias() = mat2 * mat2; +\endcode -Here, since we know that matrix2 is not the same matrix as matrix1, we know that lazy evaluation is not dangerous, so we may force lazy evaluation. Concretely, the effect of noalias() here is to bypass the evaluate-before-assigning \link flags flag\endlink. +Here, since we know that mat2 is not the same matrix as mat1, we know that lazy evaluation is not dangerous, so we may force lazy evaluation. Concretely, the effect of noalias() here is to bypass the evaluate-before-assigning \link flags flag\endlink. -The second circumstance in which Eigen chooses immediate evaluation, is when it sees a nested expression such as a + b where \c b is already an expression having the evaluate-before-nesting \link flags flag\endlink. Again, the most important example of such an expression is the \link Product matrix product expression\endlink. For example, when you do +The second circumstance in which %Eigen chooses to evaluate a sub-expression, is when it sees a nested expression such as a + b where \c b is already an expression having the evaluate-before-nesting \link flags flag\endlink.
+Again, the most important example of such an expression is the \link Product matrix product expression\endlink. +For example, when you do -\code matrix1 = matrix2 + matrix3 * matrix4; \endcode +\code mat1 = mat2 * mat3 + mat4 * mat5; +\endcode -the product matrix3 * matrix4 gets evaluated immediately into a temporary matrix. Indeed, experiments showed that it is often beneficial for performance to evaluate immediately matrix products when they are nested into bigger expressions. +the products mat2 * mat3 and mat4 * mat5 get evaluated separately into temporary matrices before being summed up in mat1. +Indeed, to be efficient matrix products need to be evaluated within a destination matrix at hand, and not as simple "dot products". +For small matrices, however, you might want to enforce a "dot-product" based lazy evaluation with lazyProduct(). +Again, it is important to understand that those temporaries are created at evaluation time only, that is, in operator =. +See TopicPitfalls_auto_keyword for common pitfalls regarding this remark. -The third circumstance in which Eigen chooses immediate evaluation, is when its cost model shows that the total cost of an operation is reduced if a sub-expression gets evaluated into a temporary. Indeed, in certain cases, an intermediate result is sufficiently costly to compute and is reused sufficiently many times, that is worth "caching". Here is an example: +The third circumstance in which %Eigen chooses to evaluate a sub-expression, is when its cost model shows that the total cost of an operation is reduced if a sub-expression gets evaluated into a temporary. +Indeed, in certain cases, an intermediate result is sufficiently costly to compute and is reused sufficiently many times, that it is worth "caching".
Here is an example: -\code matrix1 = matrix2 * (matrix3 + matrix4); \endcode +\code mat1 = mat2 * (mat3 + mat4); +\endcode -Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression matrix3 + matrix4 is going to be used several times in the matrix product. Instead of computing the sum every time, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates matrix3 + matrix4 into a temporary variable before evaluating the product. +Here, provided the matrices have at least 2 rows and 2 columns, each coefficient of the expression mat3 + mat4 is going to be used several times in the matrix product. Instead of computing the sum every time, it is much better to compute it once and store it in a temporary variable. %Eigen understands this and evaluates mat3 + mat4 into a temporary variable before evaluating the product. */ From ce88e297dcae94952f4dbe6386fa63b6962d3432 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 16:29:02 +0100 Subject: [PATCH 144/295] Add a comment stating this doc page is partly obsolete. --- doc/InsideEigenExample.dox | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/InsideEigenExample.dox b/doc/InsideEigenExample.dox index ed053c69d..ea2275bf2 100644 --- a/doc/InsideEigenExample.dox +++ b/doc/InsideEigenExample.dox @@ -212,6 +212,11 @@ Thus, the operator+ hasn't performed any actual computation. To summarize, the o \section Assignment The assignment +
+PLEASE HELP US IMPROVE THIS SECTION. +This page reflects how %Eigen worked until 3.2, but since %Eigen 3.3 the assignment is more sophisticated, as it involves an Assignment expression and the creation of so-called evaluators, which are responsible for the evaluation of each kind of expression. +
+ At this point, the expression \a v + \a w has finished evaluating, so, in the process of compiling the line of code \code u = v + w; From 70e133333d733886923cd5a9082e92714b95d076 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 21:22:20 +0100 Subject: [PATCH 145/295] bug #1661: fix regression in GEBP and AVX512 --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 04b7bfa7e..8b4b1dbd3 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1307,7 +1307,8 @@ void gebp_kernel Date: Wed, 16 Jan 2019 21:47:42 +0100 Subject: [PATCH 146/295] GEBP: cleanup logic to choose between a 4 packets of 1 packet --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8b4b1dbd3..fdc050e05 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -350,9 +350,9 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ template struct RhsPanelHelper { private: - typedef typename conditional<(registers_taken < 15), RhsPacket, RhsPacketx4>::type inter_type; + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; public: - typedef typename conditional<(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS < 32), RhsPacket, inter_type>::type type; + typedef typename conditional=4, RhsPacketx4, RhsPacket>::type type; }; template From 0f028f61cb4d7731512fff861d8228945c5d965c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 22:26:38 +0100 Subject: [PATCH 147/295] GEBP: fix swapped kernel mode with AVX512 and complex 
scalars --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fdc050e05..c5d77763a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -600,8 +600,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -820,8 +820,10 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const + template + EIGEN_STRONG_INLINE + typename enable_if::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -953,8 +955,8 @@ public: dest = ploaddup(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } From ee550a2ac3fa2f127cc8ab16ee2773aa390b0142 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 16 Jan 2019 14:03:12 -0800 Subject: [PATCH 148/295] Fix flaky test for tensor fft. 
--- unsupported/test/cxx11_tensor_fft.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 4e4c9c4ec..641486a4a 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -228,9 +228,6 @@ template static void test_fft_non_power_of_2_round_trip(int exponent) { int n = (1 << exponent) + 1; - // The dimension type needs to be at least 8 bytes long for the - // Tensor constructor to work. On Windows, long is only 4 bytes long, - // so use long long here to force the usage of a 8 bytes integer type. Eigen::DSizes dimensions; dimensions[0] = n; const DSizes arr = dimensions; @@ -249,7 +246,9 @@ static void test_fft_non_power_of_2_round_trip(int exponent) { forward.template fft(fft); for (int i = 0; i < n; ++i) { - VERIFY_IS_APPROX(input[i], output[i]); + RealScalar tol = test_precision() * + (std::abs(input[i]) + std::abs(output[i]) + 1); + VERIFY_IS_APPROX_OR_LESS_THAN(std::abs(input[i] - output[i]), tol); } } @@ -301,4 +300,5 @@ EIGEN_DECLARE_TEST(cxx11_tensor_fft) { test_fft_real_input_energy(); test_fft_non_power_of_2_round_trip(7); + test_fft_non_power_of_2_round_trip(7); } From 7401e2541deffd08c61b0426b2bcd21ffd481ac0 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 16 Jan 2019 14:43:33 -0800 Subject: [PATCH 149/295] Fix compilation error for logical packet ops with older compilers. 
--- Eigen/src/Core/arch/AVX/Complex.h | 8 ++++---- Eigen/src/Core/arch/AVX512/Complex.h | 8 ++++---- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 4 ---- Eigen/src/Core/arch/SSE/Complex.h | 12 ++++++------ 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index dcca35279..16faf1082 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -75,8 +75,8 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } -template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -290,8 +290,8 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } -template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return 
Packet2cd(pnot(Packet4d(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 7bb2fd630..7ea72c509 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -55,8 +55,8 @@ template<> struct unpacket_traits { typedef Packet4cf half; }; -template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pnot(const Packet8cf& a) { return Packet8cf(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet8cf pnot(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); } template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) @@ -270,8 +270,8 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, con return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); } -template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pnot(const Packet4cd& a) { return Packet4cd(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cd pnot(const 
Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index eab7be14c..020baa353 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,10 +143,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half ptrue(const half& a) { - return __half_raw(0xffffu); -} - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index c3b1de5ce..e51966f0d 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,8 +82,8 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } @@ 
-308,8 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -447,13 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { From 562985bac44a06c65f560e2628080d1743bfd77f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 00:14:27 +0100 Subject: [PATCH 150/295] bug #1646: fix false aliasing detection for A.row(0) = A.col(0); This changeset completely disable the detection for vectors for which are current mechanism cannot detect any positive aliasing anyway. 
--- Eigen/src/Core/Transpose.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 6255a2f6d..91a9ab1b9 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -392,7 +392,7 @@ struct checkTransposeAliasing_impl template void check_for_aliasing(const Dst &dst, const Src &src) { - if(src.size()>1) + if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1) internal::checkTransposeAliasing_impl::run(dst, src); } From 4759d9e86dea0b4b964e6590e68319cedf6a64e1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 10:35:14 +0100 Subject: [PATCH 151/295] Doc: add manual page on STL iterators --- doc/TutorialSTL.dox | 66 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 doc/TutorialSTL.dox diff --git a/doc/TutorialSTL.dox b/doc/TutorialSTL.dox new file mode 100644 index 000000000..9f41528d3 --- /dev/null +++ b/doc/TutorialSTL.dox @@ -0,0 +1,66 @@ +namespace Eigen { + +/** \eigenManualPage TutorialSTL STL iterators and algorithms + +Since the version 3.4, %Eigen's dense matrices and arrays provide STL compatible iterators. +As demonstrated below, this makes them naturally compatible with range-for-loops and STL's algorithms. + +\eigenAutoToc + +\section TutorialSTLVectors Iterating over 1D arrays and vectors + +Any dense 1D expressions exposes the pair of `begin()/end()` methods to iterate over them. + +This directly enables c++11 range for loops: + + + +
Example:Output:
+\include Tutorial_range_for_loop_1d_cxx11.cpp + +\verbinclude Tutorial_range_for_loop_1d_cxx11.out +
+ +One dimensional expressions can also easily be passed to STL algorithms: + + + +
Example:Output:
+\include Tutorial_std_sort.cpp + +\verbinclude Tutorial_std_sort.out +
+ +Similar to `std::vector`, 1D expressions also expose the pair of `cbegin()/cend()` methods to conveniently get const iterators on non-const objects. + +\section TutorialSTLMatrices Iterating over coefficients of 2D arrays and matrices + +STL iterators are intrinsically designed to iterate over 1D structures. +This is why `begin()/end()` methods are disabled for 2D expressions. +Iterating over all coefficients of a 2D expression is still easily accomplished by creating a 1D linear view through `reshaped()`: + + +
Example:Output:
+\include Tutorial_range_for_loop_2d_cxx11.cpp + +\verbinclude Tutorial_range_for_loop_2d_cxx11.out +
+ +\section TutorialSTLRowsColumns Iterating over rows or columns of 2D arrays and matrices + +It is also possible to get iterators over rows or columns of 2D expressions. +Those are available through the `rowwise()` and `colwise()` proxies. +Here is an example sorting each row of a matrix: + + + +
Example:Output:
+\include Tutorial_std_sort_rows.cpp + +\verbinclude Tutorial_std_sort_rows.out +
+ +*/ + +} From 7b35c26b1c73e6b1048eda69ab5ef18924770379 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 10:35:50 +0100 Subject: [PATCH 152/295] Doc: remove link to porting guide --- doc/Overview.dox | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/Overview.dox b/doc/Overview.dox index dbb49bd21..43a12871e 100644 --- a/doc/Overview.dox +++ b/doc/Overview.dox @@ -4,8 +4,6 @@ namespace Eigen { This is the API documentation for Eigen3. You can download it as a tgz archive for offline reading. -You're already an Eigen2 user? Here is a \link Eigen2ToEigen3 Eigen2 to Eigen3 guide \endlink to help porting your application. - For a first contact with Eigen, the best place is to have a look at the \link GettingStarted getting started \endlink page that show you how to write and compile your first program with Eigen. Then, the \b quick \b reference \b pages give you a quite complete description of the API in a very condensed format that is specially useful to recall the syntax of a particular feature, or to have a quick look at the API. They currently cover the two following feature sets, and more will come in the future: From 7f32109c11b9cbc3cedc72e59683bf5839d35d75 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 11:33:43 +0100 Subject: [PATCH 153/295] Add conjugateIf members to DenseBase, TriangularView, SelfAdjointView, and make PartialPivLU use it. 
--- Eigen/src/Core/SelfAdjointView.h | 13 +++++++++++++ Eigen/src/Core/TriangularMatrix.h | 13 +++++++++++++ Eigen/src/LU/PartialPivLU.h | 25 ++++++++++--------------- Eigen/src/plugins/CommonCwiseUnaryOps.h | 14 ++++++++++++++ test/adjoint.cpp | 3 +++ test/triangular.cpp | 16 ++++++++++++++++ 6 files changed, 69 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 2cf3fa1ef..2173799d9 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -61,6 +61,7 @@ template class SelfAdjointView typedef typename internal::traits::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::remove_all::type MatrixConjugateReturnType; + typedef SelfAdjointView::type, UpLo> ConstSelfAdjointView; enum { Mode = internal::traits::Mode, @@ -197,6 +198,18 @@ template class SelfAdjointView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. 
+ */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef SelfAdjointView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 521de6160..cf3532f06 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -198,6 +198,7 @@ template class TriangularView typedef typename internal::traits::MatrixTypeNestedNonRef MatrixTypeNestedNonRef; typedef typename internal::remove_all::type MatrixConjugateReturnType; + typedef TriangularView::type, _Mode> ConstTriangularView; public: @@ -243,6 +244,18 @@ template class TriangularView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef TriangularView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index ecc0e748f..ff4be360e 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -246,26 +246,21 @@ template class PartialPivLU template EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { - /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. + /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P. * So we proceed as follows: - * Step 1: compute c = Pb. - * Step 2: replace c by the solution x to Lx = c. 
- * Step 3: replace c by the solution x to Ux = c. + * Step 1: compute c as the solution to L^T c = b + * Step 2: replace c by the solution x to U^T x = c. + * Step 3: update c = P^-1 c. */ eigen_assert(rhs.rows() == m_lu.cols()); - if (Conjugate) { - // Step 1 - dst = m_lu.template triangularView().adjoint().solve(rhs); - // Step 2 - m_lu.template triangularView().adjoint().solveInPlace(dst); - } else { - // Step 1 - dst = m_lu.template triangularView().transpose().solve(rhs); - // Step 2 - m_lu.template triangularView().transpose().solveInPlace(dst); - } + // Step 1 + dst = m_lu.template triangularView().transpose() + .template conjugateIf().solve(rhs); + // Step 2 + m_lu.template triangularView().transpose() + .template conjugateIf().solveInPlace(dst); // Step 3 dst = permutationP().transpose() * dst; } diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h index 89f4faaac..5418dc415 100644 --- a/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -76,6 +76,20 @@ conjugate() const return ConjugateReturnType(derived()); } +/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise. +/// +EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) +/// +/// \sa conjugate() +template +EIGEN_DEVICE_FUNC +inline typename internal::conditional::type +conjugateIf() const +{ + typedef typename internal::conditional::type ReturnType; + return ReturnType(derived()); +} + /// \returns a read-only expression of the real part of \c *this. 
/// EIGEN_DOC_UNARY_ADDONS(real,real part function) diff --git a/test/adjoint.cpp b/test/adjoint.cpp index e2bfa6d7d..4c4f98bb9 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -143,6 +143,9 @@ template void adjoint(const MatrixType& m) RealVectorType rv1 = RealVectorType::Random(rows); VERIFY_IS_APPROX(v1.dot(rv1.template cast()), v1.dot(rv1)); VERIFY_IS_APPROX(rv1.template cast().dot(v1), rv1.dot(v1)); + + VERIFY( is_same_type(m1,m1.template conjugateIf()) ); + VERIFY( is_same_type(m1.conjugate(),m1.template conjugateIf()) ); } template diff --git a/test/triangular.cpp b/test/triangular.cpp index 99ef1dcda..0fca5e3b9 100644 --- a/test/triangular.cpp +++ b/test/triangular.cpp @@ -129,6 +129,22 @@ template void triangular_square(const MatrixType& m) VERIFY_IS_APPROX(m1.template selfadjointView().diagonal(), m1.diagonal()); + m3.setRandom(); + const MatrixType& m3c(m3); + VERIFY( is_same_type(m3c.template triangularView(),m3.template triangularView().template conjugateIf()) ); + VERIFY( is_same_type(m3c.template triangularView().conjugate(),m3.template triangularView().template conjugateIf()) ); + VERIFY_IS_APPROX(m3.template triangularView().template conjugateIf().toDenseMatrix(), + m3.conjugate().template triangularView().toDenseMatrix()); + VERIFY_IS_APPROX(m3.template triangularView().template conjugateIf().toDenseMatrix(), + m3.template triangularView().toDenseMatrix()); + + VERIFY( is_same_type(m3c.template selfadjointView(),m3.template selfadjointView().template conjugateIf()) ); + VERIFY( is_same_type(m3c.template selfadjointView().conjugate(),m3.template selfadjointView().template conjugateIf()) ); + VERIFY_IS_APPROX(m3.template selfadjointView().template conjugateIf().toDenseMatrix(), + m3.conjugate().template selfadjointView().toDenseMatrix()); + VERIFY_IS_APPROX(m3.template selfadjointView().template conjugateIf().toDenseMatrix(), + m3.template selfadjointView().toDenseMatrix()); + } From 15e53d5d93bd79fa415416d3f979975f0014a64d Mon Sep 17 
00:00:00 2001 From: Patrick Peltzer Date: Thu, 17 Jan 2019 01:17:39 +0100 Subject: [PATCH 154/295] PR 567: makes all dense solvers inherit SoverBase (LU,Cholesky,QR,SVD). This changeset also includes: * add HouseholderSequence::conjugateIf * define int as the StorageIndex type for all dense solvers * dedicated unit tests, including assertion checking * _check_solve_assertion(): this method can be implemented in derived solver classes to implement custom checks * CompleteOrthogonalDecompositions: add applyZOnTheLeftInPlace, fix scalar type in applyZAdjointOnTheLeftInPlace(), add missing assertions * Cholesky: add missing assertions * FullPivHouseholderQR: Corrected Scalar type in _solve_impl() * BDCSVD: Unambiguous return type for ternary operator * SVDBase: Corrected Scalar type in _solve_impl() --- Eigen/src/Cholesky/LDLT.h | 56 ++++++---- Eigen/src/Cholesky/LLT.h | 47 +++++--- Eigen/src/Core/SolverBase.h | 40 ++++++- Eigen/src/Core/util/ForwardDeclarations.h | 1 + Eigen/src/Householder/HouseholderSequence.h | 18 +++ Eigen/src/LU/FullPivLU.h | 12 +- Eigen/src/LU/PartialPivLU.h | 13 +-- Eigen/src/QR/ColPivHouseholderQR.h | 52 ++++++--- .../src/QR/CompleteOrthogonalDecomposition.h | 105 ++++++++++++++---- Eigen/src/QR/FullPivHouseholderQR.h | 72 +++++++++--- Eigen/src/QR/HouseholderQR.h | 54 +++++++-- Eigen/src/SVD/BDCSVD.h | 3 +- Eigen/src/SVD/JacobiSVD.h | 1 + Eigen/src/SVD/SVDBase.h | 63 ++++++++--- test/bdcsvd.cpp | 2 + test/cholesky.cpp | 49 ++++---- test/jacobisvd.cpp | 2 + test/lu.cpp | 66 +++-------- test/qr.cpp | 16 +-- test/qr_colpivoting.cpp | 64 +++++++---- test/qr_fullpivoting.cpp | 16 +-- test/solverbase.h | 36 ++++++ test/svd_common.h | 27 +++++ 23 files changed, 576 insertions(+), 239 deletions(-) create mode 100644 test/solverbase.h diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 6831eab3d..67e97ffb8 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -16,6 +16,15 @@ namespace Eigen { namespace internal 
{ + template struct traits > + : traits<_MatrixType> + { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; + }; + template struct LDLT_Traits; // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef @@ -48,20 +57,19 @@ namespace internal { * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template class LDLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, UpLo = _UpLo }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix TmpMatrixType; typedef Transpositions TranspositionType; @@ -180,6 +188,7 @@ template class LDLT return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A. * * This function also supports in-place solves using the syntax x = decompositionObject.solve(x) . 
@@ -197,13 +206,8 @@ template class LDLT */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LDLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LDLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template bool solveInPlace(MatrixBase &bAndX) const; @@ -259,6 +263,9 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -559,14 +566,22 @@ template template void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ // dst = P b dst = m_transpositions * rhs; // dst = L^-1 (P b) - matrixL().solveInPlace(dst); + // dst = L^-*T (P b) + matrixL().template conjugateIf().solveInPlace(dst); - // dst = D^-1 (L^-1 P b) + // dst = D^-* (L^-1 P b) + // dst = D^-1 (L^-*T P b) // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); @@ -578,7 +593,6 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. // Using numeric_limits::min() gives us more robustness to denormals. 
RealScalar tolerance = (std::numeric_limits::min)(); - for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) @@ -587,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons dst.row(i).setZero(); } - // dst = L^-T (D^-1 L^-1 P b) - matrixU().solveInPlace(dst); + // dst = L^-* (D^-* L^-1 P b) + // dst = L^-T (D^-1 L^-*T P b) + matrixL().transpose().template conjugateIf().solveInPlace(dst); - // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b + // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b + // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b dst = m_transpositions.transpose() * dst; } #endif diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 868766365..5876966e6 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -13,6 +13,16 @@ namespace Eigen { namespace internal{ + +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + template struct LLT_Traits; } @@ -54,18 +64,17 @@ template struct LLT_Traits; * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ template class LLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; enum { PacketSize = internal::packet_traits::size, @@ -129,6 +138,7 @@ template class LLT return Traits::getL(m_matrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A. 
* * Since this LLT class assumes anyway that the matrix A is invertible, the solution @@ -141,13 +151,8 @@ template class LLT */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template void solveInPlace(const MatrixBase &bAndX) const; @@ -205,6 +210,9 @@ template class LLT #ifndef EIGEN_PARSED_BY_DOXYGEN template void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -476,8 +484,17 @@ template template void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - dst = rhs; - solveInPlace(dst); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + dst = rhs; + + matrixL().template conjugateIf().solveInPlace(dst); + matrixU().template conjugateIf().solveInPlace(dst); } #endif diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h index 702a5485c..055d3ddc1 100644 --- a/Eigen/src/Core/SolverBase.h +++ b/Eigen/src/Core/SolverBase.h @@ -14,8 +14,35 @@ namespace Eigen { namespace internal { +template +struct solve_assertion { + template + static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion(b); } +}; +template +struct solve_assertion > +{ + typedef Transpose type; + template + static void run(const type& transpose, const Rhs& b) + { + internal::solve_assertion::type>::template run(transpose.nestedExpression(), b); + } +}; + +template +struct solve_assertion, const Transpose > > +{ + typedef CwiseUnaryOp, const Transpose > type; + + template + static void run(const type& adjoint, const Rhs& b) + { + 
internal::solve_assertion >::type>::template run(adjoint.nestedExpression(), b); + } +}; } // end namespace internal /** \class SolverBase @@ -35,7 +62,7 @@ namespace internal { * * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors. * - * \sa class PartialPivLU, class FullPivLU + * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase */ template class SolverBase : public EigenBase @@ -46,6 +73,9 @@ class SolverBase : public EigenBase typedef typename internal::traits::Scalar Scalar; typedef Scalar CoeffReturnType; + template + friend struct internal::solve_assertion; + enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, @@ -75,7 +105,7 @@ class SolverBase : public EigenBase inline const Solve solve(const MatrixBase& b) const { - eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b"); + internal::solve_assertion::type>::template run(derived(), b); return Solve(derived(), b.derived()); } @@ -113,6 +143,12 @@ class SolverBase : public EigenBase } protected: + + template + void _check_solve_assertion(const Rhs& b) const { + eigen_assert(derived().m_isInitialized && "Solver is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b"); + } }; namespace internal { diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 3ab3a5f50..5d86a51ac 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -260,6 +260,7 @@ template class HouseholderQR; template class ColPivHouseholderQR; template class FullPivHouseholderQR; template class 
CompleteOrthogonalDecomposition; +template class SVDBase; template class JacobiSVD; template class BDCSVD; template class LLT; diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index e62befcb6..9318c281f 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -156,6 +156,12 @@ template class HouseholderS Side > TransposeReturnType; + typedef HouseholderSequence< + typename internal::add_const::type, + typename internal::add_const::type, + Side + > ConstHouseholderSequence; + /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -244,6 +250,18 @@ template class HouseholderS .setShift(m_shift); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_vectors.template conjugateIf(), m_coeffs.template conjugateIf()); + } + /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ AdjointReturnType adjoint() const { diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index b4f4bc6ee..68930ea53 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -63,6 +63,7 @@ template class FullPivLU public: typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) enum { @@ -218,6 +219,7 @@ template class FullPivLU return internal::image_retval(*this, originalMatrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \return a solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. 
* @@ -237,14 +239,10 @@ template class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -755,7 +753,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -805,7 +802,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == cols); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index ff4be360e..8726bf895 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -80,6 +80,8 @@ template class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; + EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, @@ -152,6 +154,7 @@ template class PartialPivLU return m_p; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method returns the solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -169,14 +172,10 @@ template class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. 
template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -231,8 +230,6 @@ template class PartialPivLU * Step 3: replace c by the solution x to Ux = c. */ - eigen_assert(rhs.rows() == m_lu.rows()); - // Step 1 dst = permutationP() * rhs; diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 1faa3442e..9b677e9bf 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -17,6 +17,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -46,20 +49,19 @@ template struct traits > * \sa MatrixBase::colPivHouseholderQr() */ template class ColPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; typedef typename internal::plain_row_type::type IntRowVectorType; @@ -156,6 +158,7 @@ template class ColPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix 
of which * *this is the QR decomposition, if any exists. * @@ -172,11 +175,8 @@ template class ColPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ() const; HouseholderSequenceType matrixQ() const @@ -417,6 +417,9 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -583,8 +586,6 @@ template template void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - const Index nonzero_pivots = nonzeroPivots(); if(nonzero_pivots == 0) @@ -604,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i); for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero(); } + +template +template +void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index nonzero_pivots = nonzeroPivots(); + + if(nonzero_pivots == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs); + + m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots); + dst.bottomRows(rows()-nonzero_pivots).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf() ); +} #endif namespace internal { diff --git 
a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 03017a375..d62628087 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -16,6 +16,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -44,19 +47,21 @@ struct traits > * * \sa MatrixBase::completeOrthogonalDecomposition() */ -template -class CompleteOrthogonalDecomposition { +template class CompleteOrthogonalDecomposition + : public SolverBase > +{ public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + + template + friend struct internal::solve_assertion; + + EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; @@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition { m_temp(matrix.cols()) { computeInPlace(); - } - + } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method computes the minimum-norm solution X to a least squares * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of * which \c *this is the complete orthogonal decomposition. 
@@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition { */ template inline const Solve solve( - const MatrixBase& b) const { - eigen_assert(m_cpqr.m_isInitialized && - "CompleteOrthogonalDecomposition is not initialized."); - return Solve(*this, b.derived()); - } + const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ(void) const; HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } @@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition { */ MatrixType matrixZ() const { MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols()); - applyZAdjointOnTheLeftInPlace(Z); - return Z.adjoint(); + applyZOnTheLeftInPlace(Z); + return Z; } /** \returns a reference to the matrix where the complete orthogonal @@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition { */ inline const Inverse pseudoInverse() const { + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); return Inverse(*this); } @@ -368,6 +371,9 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template void _solve_impl(const RhsType& rhs, DstType& dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -375,8 +381,21 @@ class CompleteOrthogonalDecomposition { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + template + void _check_solve_assertion(const Rhs& b) const { + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b"); + } + void computeInPlace(); + /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or + * \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate + * is set to \c true. 
+ */ + template + void applyZOnTheLeftInPlace(Rhs& rhs) const; + /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. */ template @@ -464,6 +483,28 @@ void CompleteOrthogonalDecomposition::computeInPlace() } } +template +template +void CompleteOrthogonalDecomposition::applyZOnTheLeftInPlace( + Rhs& rhs) const { + const Index cols = this->cols(); + const Index nrhs = rhs.cols(); + const Index rank = this->rank(); + Matrix temp((std::max)(cols, nrhs)); + for (Index k = rank-1; k >= 0; --k) { + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + rhs.middleRows(rank - 1, cols - rank + 1) + .applyHouseholderOnTheLeft( + matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf(), zCoeffs().template conjugateIf()(k), + &temp(0)); + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + } +} + template template void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( @@ -471,7 +512,7 @@ void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( const Index cols = this->cols(); const Index nrhs = rhs.cols(); const Index rank = this->rank(); - Matrix temp((std::max)(cols, nrhs)); + Matrix temp((std::max)(cols, nrhs)); for (Index k = 0; k < rank; ++k) { if (k != rank - 1) { rhs.row(k).swap(rhs.row(rank - 1)); @@ -491,8 +532,6 @@ template template void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( const RhsType& rhs, DstType& dst) const { - eigen_assert(rhs.rows() == this->rows()); - const Index rank = this->rank(); if (rank == 0) { dst.setZero(); @@ -520,6 +559,34 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( // Undo permutation to get x = P^{-1} * y. 
dst = colsPermutation() * dst; } + +template +template +void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = this->rank(); + + if (rank == 0) { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(colsPermutation().transpose()*rhs); + + if (rank < cols()) { + applyZOnTheLeftInPlace(c); + } + + matrixT().topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif namespace internal { diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index c31e47cc4..d0664a1d8 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -18,6 +18,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -55,20 +58,19 @@ struct traits > * \sa MatrixBase::fullPivHouseholderQr() */ template class FullPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef internal::FullPivHouseholderQRMatrixQReturnType MatrixQReturnType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef Matrix class 
FullPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * \c *this is the QR decomposition. * @@ -173,11 +176,8 @@ template class FullPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns Expression object representing the matrix Q */ @@ -396,6 +396,9 @@ template class FullPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -498,15 +501,15 @@ void FullPivHouseholderQR::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; i++) { - m_rows_transpositions.coeffRef(i) = i; - m_cols_transpositions.coeffRef(i) = i; + m_rows_transpositions.coeffRef(i) = internal::convert_index(i); + m_cols_transpositions.coeffRef(i) = internal::convert_index(i); m_hCoeffs.coeffRef(i) = Scalar(0); } break; } - m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner; - m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner; + m_rows_transpositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_cols_transpositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k)); ++number_of_transpositions; @@ -540,7 +543,6 @@ template template void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); const Index l_rank = rank(); // FIXME introduce nonzeroPivots() and use it here. 
and more generally, @@ -553,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType typename RhsType::PlainObject c(rhs); - Matrix temp(rhs.cols()); + Matrix temp(rhs.cols()); for (Index k = 0; k < l_rank; ++k) { Index remainingSize = rows()-k; @@ -570,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i); for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero(); } + +template +template +void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index l_rank = rank(); + + if(l_rank == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs); + + m_qr.topLeftCorner(l_rank, l_rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(l_rank)); + + dst.topRows(l_rank) = c.topRows(l_rank); + dst.bottomRows(rows()-l_rank).setZero(); + + Matrix temp(dst.cols()); + const Index size = (std::min)(rows(), cols()); + for (Index k = size-1; k >= 0; --k) + { + Index remainingSize = rows()-k; + + dst.bottomRightCorner(remainingSize, dst.cols()) + .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf(), + m_hCoeffs.template conjugateIf().coeff(k), &temp.coeffRef(0)); + + dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k))); + } +} #endif namespace internal { diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 33cb9c8ff..801739fbd 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -14,6 +14,18 @@ namespace Eigen { +namespace internal { +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + +} // end namespace internal + /** 
\ingroup QR_Module * * @@ -42,20 +54,19 @@ namespace Eigen { * \sa MatrixBase::householderQr() */ template class HouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix MatrixQType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef typename internal::plain_row_type::type RowVectorType; @@ -121,6 +132,7 @@ template class HouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -137,11 +149,8 @@ template class HouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations. 
* @@ -214,6 +223,9 @@ template class HouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -349,7 +361,6 @@ template void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); - eigen_assert(rhs.rows() == rows()); typename RhsType::PlainObject c(rhs); @@ -362,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c dst.topRows(rank) = c.topRows(rank); dst.bottomRows(cols()-rank).setZero(); } + +template +template +void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = (std::min)(rows(), cols()); + + typename RhsType::PlainObject c(rhs); + + m_qr.topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif /** Performs the QR factorization of the given matrix \a matrix. The result of diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 18d7bdc0a..e3fddacbc 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -39,6 +39,7 @@ namespace internal { template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -1006,7 +1007,7 @@ void BDCSVD::perturbCol0 #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert((std::isfinite)(tmp)); #endif - zhat(k) = col0(k) > Literal(0) ? tmp : -tmp; + zhat(k) = col0(k) > Literal(0) ? 
RealScalar(tmp) : RealScalar(-tmp); } } } diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 1c7c80376..2b6891105 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index 851ad6836..ed1e9f20e 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -17,6 +17,18 @@ #define EIGEN_SVDBASE_H namespace Eigen { + +namespace internal { +template struct traits > + : traits +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; +} + /** \ingroup SVD_Module * * @@ -44,15 +56,18 @@ namespace Eigen { * terminate in finite (and reasonable) time. * \sa class BDCSVD, class JacobiSVD */ -template -class SVDBase +template class SVDBase + : public SolverBase > { +public: + + template + friend struct internal::solve_assertion; -public: typedef typename internal::traits::MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; + typedef typename Eigen::internal::traits::StorageIndex StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -194,6 +209,7 @@ public: inline Index rows() const { return m_rows; } inline Index cols() const { return m_cols; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A. * * \param b the right-hand-side of the equation to solve. 
@@ -205,16 +221,15 @@ public: */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "SVD is not initialized."); - eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice)."); - return Solve(derived(), b.derived()); - } - + solve(const MatrixBase& b) const; + #endif + #ifndef EIGEN_PARSED_BY_DOXYGEN template void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -223,6 +238,13 @@ protected: { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + + template + void _check_solve_assertion(const Rhs& b) const { + eigen_assert(m_isInitialized && "SVD is not initialized."); + eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice)."); + eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b"); + } // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; @@ -263,17 +285,30 @@ template template void SVDBase::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - // A = U S V^* // So A^{-1} = V S^{-1} U^* - Matrix tmp; + Matrix tmp; Index l_rank = rank(); tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs; tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; dst = m_matrixV.leftCols(l_rank) * tmp; } + +template +template +void SVDBase::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + // A = U S V^* + // So A^{-*} = U S^{-1} V^* + // And A^{-T} = U_conj S^{-1} V^T + Matrix tmp; + Index l_rank = rank(); + + tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf() * rhs; + tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; + dst = m_matrixU.template 
conjugateIf().leftCols(l_rank) * tmp; +} #endif template diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index 3065ff015..85a80d6bb 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -46,6 +46,8 @@ void bdcsvd_method() VERIFY_RAISES_ASSERT(m.bdcSvd().matrixU()); VERIFY_RAISES_ASSERT(m.bdcSvd().matrixV()); VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).solve(m), m); + VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m); + VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m); } // compare the Singular values returned with Jacobi and Bdc diff --git a/test/cholesky.cpp b/test/cholesky.cpp index e1e8b7bf7..0b1a7b45b 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -7,15 +7,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_NO_ASSERTION_CHECKING -#define EIGEN_NO_ASSERTION_CHECKING -#endif - #define TEST_ENABLE_TEMPORARY_TRACKING #include "main.h" #include #include +#include "solverbase.h" template typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { @@ -81,15 +78,17 @@ template void cholesky(const MatrixType& m) } { + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + SquareMatrixType symmUp = symm.template triangularView(); SquareMatrixType symmLo = symm.template triangularView(); LLT chollo(symmLo); VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix()); - vecX = chollo.solve(vecB); - VERIFY_IS_APPROX(symm * vecX, vecB); - matX = chollo.solve(matB); - VERIFY_IS_APPROX(symm * matX, matB); + + check_solverbase(symm, chollo, rows, rows, 1); + check_solverbase(symm, chollo, rows, cols, rows); const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols)); RealScalar rcond = (RealScalar(1) / matrix_l1_norm(symmLo)) / @@ -143,6 +142,9 @@ template void cholesky(const MatrixType& m) // LDLT { + 
STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + int sign = internal::random()%2 ? 1 : -1; if(sign == -1) @@ -156,10 +158,9 @@ template void cholesky(const MatrixType& m) LDLT ldltlo(symmLo); VERIFY(ldltlo.info()==Success); VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix()); - vecX = ldltlo.solve(vecB); - VERIFY_IS_APPROX(symm * vecX, vecB); - matX = ldltlo.solve(matB); - VERIFY_IS_APPROX(symm * matX, matB); + + check_solverbase(symm, ldltlo, rows, rows, 1); + check_solverbase(symm, ldltlo, rows, cols, rows); const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols)); RealScalar rcond = (RealScalar(1) / matrix_l1_norm(symmLo)) / @@ -313,10 +314,9 @@ template void cholesky_cplx(const MatrixType& m) LLT chollo(symmLo); VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix()); - vecX = chollo.solve(vecB); - VERIFY_IS_APPROX(symm * vecX, vecB); -// matX = chollo.solve(matB); -// VERIFY_IS_APPROX(symm * matX, matB); + + check_solverbase(symm, chollo, rows, rows, 1); + //check_solverbase(symm, chollo, rows, cols, rows); } // LDLT @@ -333,10 +333,9 @@ template void cholesky_cplx(const MatrixType& m) LDLT ldltlo(symmLo); VERIFY(ldltlo.info()==Success); VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix()); - vecX = ldltlo.solve(vecB); - VERIFY_IS_APPROX(symm * vecX, vecB); -// matX = ldltlo.solve(matB); -// VERIFY_IS_APPROX(symm * matX, matB); + + check_solverbase(symm, ldltlo, rows, rows, 1); + //check_solverbase(symm, ldltlo, rows, cols, rows); } } @@ -477,16 +476,20 @@ template void cholesky_verify_assert() VERIFY_RAISES_ASSERT(llt.matrixL()) VERIFY_RAISES_ASSERT(llt.matrixU()) VERIFY_RAISES_ASSERT(llt.solve(tmp)) - VERIFY_RAISES_ASSERT(llt.solveInPlace(&tmp)) + VERIFY_RAISES_ASSERT(llt.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(llt.adjoint().solve(tmp)) + VERIFY_RAISES_ASSERT(llt.solveInPlace(tmp)) LDLT ldlt; VERIFY_RAISES_ASSERT(ldlt.matrixL()) - 
VERIFY_RAISES_ASSERT(ldlt.permutationP()) + VERIFY_RAISES_ASSERT(ldlt.transpositionsP()) VERIFY_RAISES_ASSERT(ldlt.vectorD()) VERIFY_RAISES_ASSERT(ldlt.isPositive()) VERIFY_RAISES_ASSERT(ldlt.isNegative()) VERIFY_RAISES_ASSERT(ldlt.solve(tmp)) - VERIFY_RAISES_ASSERT(ldlt.solveInPlace(&tmp)) + VERIFY_RAISES_ASSERT(ldlt.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(ldlt.adjoint().solve(tmp)) + VERIFY_RAISES_ASSERT(ldlt.solveInPlace(tmp)) } EIGEN_DECLARE_TEST(cholesky) diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index 505bf57ae..89484d971 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -67,6 +67,8 @@ void jacobisvd_method() VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixU()); VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixV()); VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m); + VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m); + VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m); } namespace Foo { diff --git a/test/lu.cpp b/test/lu.cpp index 46fd60555..bb6ae124b 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -9,6 +9,7 @@ #include "main.h" #include +#include "solverbase.h" using namespace std; template @@ -96,32 +97,14 @@ template void lu_non_invertible() VERIFY(m1image.fullPivLu().rank() == rank); VERIFY_IS_APPROX(m1 * m1.adjoint() * m1image, m1image); + check_solverbase(m1, lu, rows, cols, cols2); + m2 = CMatrixType::Random(cols,cols2); m3 = m1*m2; m2 = CMatrixType::Random(cols,cols2); // test that the code, which does resize(), may be applied to an xpr m2.block(0,0,m2.rows(),m2.cols()) = lu.solve(m3); VERIFY_IS_APPROX(m3, m1*m2); - - // test solve with transposed - m3 = MatrixType::Random(rows,cols2); - m2 = m1.transpose()*m3; - m3 = MatrixType::Random(rows,cols2); - lu.template _solve_impl_transposed(m2, m3); - VERIFY_IS_APPROX(m2, m1.transpose()*m3); - m3 = MatrixType::Random(rows,cols2); - m3 = lu.transpose().solve(m2); - VERIFY_IS_APPROX(m2, m1.transpose()*m3); - - // test 
solve with conjugate transposed - m3 = MatrixType::Random(rows,cols2); - m2 = m1.adjoint()*m3; - m3 = MatrixType::Random(rows,cols2); - lu.template _solve_impl_transposed(m2, m3); - VERIFY_IS_APPROX(m2, m1.adjoint()*m3); - m3 = MatrixType::Random(rows,cols2); - m3 = lu.adjoint().solve(m2); - VERIFY_IS_APPROX(m2, m1.adjoint()*m3); } template void lu_invertible() @@ -150,10 +133,12 @@ template void lu_invertible() VERIFY(lu.isSurjective()); VERIFY(lu.isInvertible()); VERIFY(lu.image(m1).fullPivLu().isInvertible()); + + check_solverbase(m1, lu, size, size, size); + + MatrixType m1_inverse = lu.inverse(); m3 = MatrixType::Random(size,size); m2 = lu.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); - MatrixType m1_inverse = lu.inverse(); VERIFY_IS_APPROX(m2, m1_inverse*m3); RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); @@ -162,20 +147,6 @@ template void lu_invertible() // truth. VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); - // test solve with transposed - lu.template _solve_impl_transposed(m3, m2); - VERIFY_IS_APPROX(m3, m1.transpose()*m2); - m3 = MatrixType::Random(size,size); - m3 = lu.transpose().solve(m2); - VERIFY_IS_APPROX(m2, m1.transpose()*m3); - - // test solve with conjugate transposed - lu.template _solve_impl_transposed(m3, m2); - VERIFY_IS_APPROX(m3, m1.adjoint()*m2); - m3 = MatrixType::Random(size,size); - m3 = lu.adjoint().solve(m2); - VERIFY_IS_APPROX(m2, m1.adjoint()*m3); - // Regression test for Bug 302 MatrixType m4 = MatrixType::Random(size,size); VERIFY_IS_APPROX(lu.solve(m3*m4), lu.solve(m3)*m4); @@ -197,30 +168,17 @@ template void lu_partial_piv() VERIFY_IS_APPROX(m1, plu.reconstructedMatrix()); + check_solverbase(m1, plu, size, size, size); + + MatrixType m1_inverse = plu.inverse(); m3 = MatrixType::Random(size,size); m2 = plu.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); - MatrixType m1_inverse = plu.inverse(); VERIFY_IS_APPROX(m2, m1_inverse*m3); RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / 
matrix_l1_norm(m1_inverse); const RealScalar rcond_est = plu.rcond(); // Verify that the estimate is within a factor of 10 of the truth. VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); - - // test solve with transposed - plu.template _solve_impl_transposed(m3, m2); - VERIFY_IS_APPROX(m3, m1.transpose()*m2); - m3 = MatrixType::Random(size,size); - m3 = plu.transpose().solve(m2); - VERIFY_IS_APPROX(m2, m1.transpose()*m3); - - // test solve with conjugate transposed - plu.template _solve_impl_transposed(m3, m2); - VERIFY_IS_APPROX(m3, m1.adjoint()*m2); - m3 = MatrixType::Random(size,size); - m3 = plu.adjoint().solve(m2); - VERIFY_IS_APPROX(m2, m1.adjoint()*m3); } template void lu_verify_assert() @@ -234,6 +192,8 @@ template void lu_verify_assert() VERIFY_RAISES_ASSERT(lu.kernel()) VERIFY_RAISES_ASSERT(lu.image(tmp)) VERIFY_RAISES_ASSERT(lu.solve(tmp)) + VERIFY_RAISES_ASSERT(lu.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(lu.adjoint().solve(tmp)) VERIFY_RAISES_ASSERT(lu.determinant()) VERIFY_RAISES_ASSERT(lu.rank()) VERIFY_RAISES_ASSERT(lu.dimensionOfKernel()) @@ -246,6 +206,8 @@ template void lu_verify_assert() VERIFY_RAISES_ASSERT(plu.matrixLU()) VERIFY_RAISES_ASSERT(plu.permutationP()) VERIFY_RAISES_ASSERT(plu.solve(tmp)) + VERIFY_RAISES_ASSERT(plu.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(plu.adjoint().solve(tmp)) VERIFY_RAISES_ASSERT(plu.determinant()) VERIFY_RAISES_ASSERT(plu.inverse()) } diff --git a/test/qr.cpp b/test/qr.cpp index 4799aa9ef..c38e3439b 100644 --- a/test/qr.cpp +++ b/test/qr.cpp @@ -9,6 +9,7 @@ #include "main.h" #include +#include "solverbase.h" template void qr(const MatrixType& m) { @@ -41,11 +42,7 @@ template void qr_fixedsize() VERIFY_IS_APPROX(m1, qr.householderQ() * r); - Matrix m2 = Matrix::Random(Cols,Cols2); - Matrix m3 = m1*m2; - m2 = Matrix::Random(Cols,Cols2); - m2 = qr.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); + check_solverbase, Matrix >(m1, qr, Rows, Cols, Cols2); } template void qr_invertible() @@ -57,6 +54,8 @@ 
template void qr_invertible() typedef typename NumTraits::Real RealScalar; typedef typename MatrixType::Scalar Scalar; + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + int size = internal::random(10,50); MatrixType m1(size, size), m2(size, size), m3(size, size); @@ -70,9 +69,8 @@ template void qr_invertible() } HouseholderQR qr(m1); - m3 = MatrixType::Random(size,size); - m2 = qr.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); + + check_solverbase(m1, qr, size, size, size); // now construct a matrix with prescribed determinant m1.setZero(); @@ -95,6 +93,8 @@ template void qr_verify_assert() HouseholderQR qr; VERIFY_RAISES_ASSERT(qr.matrixQR()) VERIFY_RAISES_ASSERT(qr.solve(tmp)) + VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp)) VERIFY_RAISES_ASSERT(qr.householderQ()) VERIFY_RAISES_ASSERT(qr.absDeterminant()) VERIFY_RAISES_ASSERT(qr.logAbsDeterminant()) diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index d224a9436..a563b5470 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -11,9 +11,12 @@ #include "main.h" #include #include +#include "solverbase.h" template void cod() { + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + Index rows = internal::random(2, EIGEN_TEST_MAX_SIZE); Index cols = internal::random(2, EIGEN_TEST_MAX_SIZE); Index cols2 = internal::random(2, EIGEN_TEST_MAX_SIZE); @@ -46,12 +49,12 @@ void cod() { MatrixType c = q * t * z * cod.colsPermutation().inverse(); VERIFY_IS_APPROX(matrix, c); + check_solverbase(matrix, cod, rows, cols, cols2); + + // Verify that we get the same minimum-norm solution as the SVD. MatrixType exact_solution = MatrixType::Random(cols, cols2); MatrixType rhs = matrix * exact_solution; MatrixType cod_solution = cod.solve(rhs); - VERIFY_IS_APPROX(rhs, matrix * cod_solution); - - // Verify that we get the same minimum-norm solution as the SVD. 
JacobiSVD svd(matrix, ComputeThinU | ComputeThinV); MatrixType svd_solution = svd.solve(rhs); VERIFY_IS_APPROX(cod_solution, svd_solution); @@ -77,13 +80,13 @@ void cod_fixedsize() { VERIFY(cod.isSurjective() == (rank == Cols)); VERIFY(cod.isInvertible() == (cod.isInjective() && cod.isSurjective())); + check_solverbase, Matrix >(matrix, cod, Rows, Cols, Cols2); + + // Verify that we get the same minimum-norm solution as the SVD. Matrix exact_solution; exact_solution.setRandom(Cols, Cols2); Matrix rhs = matrix * exact_solution; Matrix cod_solution = cod.solve(rhs); - VERIFY_IS_APPROX(rhs, matrix * cod_solution); - - // Verify that we get the same minimum-norm solution as the SVD. JacobiSVD svd(matrix, ComputeFullU | ComputeFullV); Matrix svd_solution = svd.solve(rhs); VERIFY_IS_APPROX(cod_solution, svd_solution); @@ -93,6 +96,8 @@ template void qr() { using std::sqrt; + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + Index rows = internal::random(2,EIGEN_TEST_MAX_SIZE), cols = internal::random(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random(2,EIGEN_TEST_MAX_SIZE); Index rank = internal::random(1, (std::min)(rows, cols)-1); @@ -133,13 +138,10 @@ template void qr() VERIFY_IS_APPROX_OR_LESS_THAN(y, x); } - MatrixType m2 = MatrixType::Random(cols,cols2); - MatrixType m3 = m1*m2; - m2 = MatrixType::Random(cols,cols2); - m2 = qr.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); + check_solverbase(m1, qr, rows, cols, cols2); { + MatrixType m2, m3; Index size = rows; do { m1 = MatrixType::Random(size,size); @@ -173,11 +175,8 @@ template void qr_fixedsize() Matrix c = qr.householderQ() * r * qr.colsPermutation().inverse(); VERIFY_IS_APPROX(m1, c); - Matrix m2 = Matrix::Random(Cols,Cols2); - Matrix m3 = m1*m2; - m2 = Matrix::Random(Cols,Cols2); - m2 = qr.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); + check_solverbase, Matrix >(m1, qr, Rows, Cols, Cols2); + // Verify that the absolute value of the diagonal elements in R are // non-increasing until they reache the 
singularity threshold. RealScalar threshold = @@ -264,9 +263,8 @@ template void qr_invertible() } ColPivHouseholderQR qr(m1); - m3 = MatrixType::Random(size,size); - m2 = qr.solve(m3); - //VERIFY_IS_APPROX(m3, m1*m2); + + check_solverbase(m1, qr, size, size, size); // now construct a matrix with prescribed determinant m1.setZero(); @@ -286,6 +284,8 @@ template void qr_verify_assert() ColPivHouseholderQR qr; VERIFY_RAISES_ASSERT(qr.matrixQR()) VERIFY_RAISES_ASSERT(qr.solve(tmp)) + VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp)) VERIFY_RAISES_ASSERT(qr.householderQ()) VERIFY_RAISES_ASSERT(qr.dimensionOfKernel()) VERIFY_RAISES_ASSERT(qr.isInjective()) @@ -296,6 +296,25 @@ template void qr_verify_assert() VERIFY_RAISES_ASSERT(qr.logAbsDeterminant()) } +template void cod_verify_assert() +{ + MatrixType tmp; + + CompleteOrthogonalDecomposition cod; + VERIFY_RAISES_ASSERT(cod.matrixQTZ()) + VERIFY_RAISES_ASSERT(cod.solve(tmp)) + VERIFY_RAISES_ASSERT(cod.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(cod.adjoint().solve(tmp)) + VERIFY_RAISES_ASSERT(cod.householderQ()) + VERIFY_RAISES_ASSERT(cod.dimensionOfKernel()) + VERIFY_RAISES_ASSERT(cod.isInjective()) + VERIFY_RAISES_ASSERT(cod.isSurjective()) + VERIFY_RAISES_ASSERT(cod.isInvertible()) + VERIFY_RAISES_ASSERT(cod.pseudoInverse()) + VERIFY_RAISES_ASSERT(cod.absDeterminant()) + VERIFY_RAISES_ASSERT(cod.logAbsDeterminant()) +} + EIGEN_DECLARE_TEST(qr_colpivoting) { for(int i = 0; i < g_repeat; i++) { @@ -330,6 +349,13 @@ EIGEN_DECLARE_TEST(qr_colpivoting) CALL_SUBTEST_6(qr_verify_assert()); CALL_SUBTEST_3(qr_verify_assert()); + CALL_SUBTEST_7(cod_verify_assert()); + CALL_SUBTEST_8(cod_verify_assert()); + CALL_SUBTEST_1(cod_verify_assert()); + CALL_SUBTEST_2(cod_verify_assert()); + CALL_SUBTEST_6(cod_verify_assert()); + CALL_SUBTEST_3(cod_verify_assert()); + // Test problem size constructors CALL_SUBTEST_9(ColPivHouseholderQR(10, 20)); diff --git a/test/qr_fullpivoting.cpp 
b/test/qr_fullpivoting.cpp index 150b4256c..f2d8cb33e 100644 --- a/test/qr_fullpivoting.cpp +++ b/test/qr_fullpivoting.cpp @@ -10,9 +10,12 @@ #include "main.h" #include +#include "solverbase.h" template void qr() { + STATIC_CHECK(( internal::is_same::StorageIndex,int>::value )); + static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime; Index max_size = EIGEN_TEST_MAX_SIZE; Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10); @@ -48,13 +51,10 @@ template void qr() MatrixType tmp; VERIFY_IS_APPROX(tmp.noalias() = qr.matrixQ() * r, (qr.matrixQ() * r).eval()); - MatrixType m2 = MatrixType::Random(cols,cols2); - MatrixType m3 = m1*m2; - m2 = MatrixType::Random(cols,cols2); - m2 = qr.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); + check_solverbase(m1, qr, rows, cols, cols2); { + MatrixType m2, m3; Index size = rows; do { m1 = MatrixType::Random(size,size); @@ -93,9 +93,7 @@ template void qr_invertible() VERIFY(qr.isInvertible()); VERIFY(qr.isSurjective()); - m3 = MatrixType::Random(size,size); - m2 = qr.solve(m3); - VERIFY_IS_APPROX(m3, m1*m2); + check_solverbase(m1, qr, size, size, size); // now construct a matrix with prescribed determinant m1.setZero(); @@ -115,6 +113,8 @@ template void qr_verify_assert() FullPivHouseholderQR qr; VERIFY_RAISES_ASSERT(qr.matrixQR()) VERIFY_RAISES_ASSERT(qr.solve(tmp)) + VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp)) + VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp)) VERIFY_RAISES_ASSERT(qr.matrixQ()) VERIFY_RAISES_ASSERT(qr.dimensionOfKernel()) VERIFY_RAISES_ASSERT(qr.isInjective()) diff --git a/test/solverbase.h b/test/solverbase.h new file mode 100644 index 000000000..13c09593a --- /dev/null +++ b/test/solverbase.h @@ -0,0 +1,36 @@ +#ifndef TEST_SOLVERBASE_H +#define TEST_SOLVERBASE_H + +template +void check_solverbase(const MatrixType& matrix, const SolverType& solver, Index rows, Index cols, Index cols2) +{ + // solve + DstType m2 = DstType::Random(cols,cols2); + RhsType m3 = matrix*m2; + DstType 
solver_solution = DstType::Random(cols,cols2); + solver._solve_impl(m3, solver_solution); + VERIFY_IS_APPROX(m3, matrix*solver_solution); + solver_solution = DstType::Random(cols,cols2); + solver_solution = solver.solve(m3); + VERIFY_IS_APPROX(m3, matrix*solver_solution); + // test solve with transposed + m3 = RhsType::Random(rows,cols2); + m2 = matrix.transpose()*m3; + RhsType solver_solution2 = RhsType::Random(rows,cols2); + solver.template _solve_impl_transposed(m2, solver_solution2); + VERIFY_IS_APPROX(m2, matrix.transpose()*solver_solution2); + solver_solution2 = RhsType::Random(rows,cols2); + solver_solution2 = solver.transpose().solve(m2); + VERIFY_IS_APPROX(m2, matrix.transpose()*solver_solution2); + // test solve with conjugate transposed + m3 = RhsType::Random(rows,cols2); + m2 = matrix.adjoint()*m3; + solver_solution2 = RhsType::Random(rows,cols2); + solver.template _solve_impl_transposed(m2, solver_solution2); + VERIFY_IS_APPROX(m2, matrix.adjoint()*solver_solution2); + solver_solution2 = RhsType::Random(rows,cols2); + solver_solution2 = solver.adjoint().solve(m2); + VERIFY_IS_APPROX(m2, matrix.adjoint()*solver_solution2); +} + +#endif // TEST_SOLVERBASE_H diff --git a/test/svd_common.h b/test/svd_common.h index cba066593..5c0f2a0e4 100644 --- a/test/svd_common.h +++ b/test/svd_common.h @@ -17,6 +17,7 @@ #endif #include "svd_fill.h" +#include "solverbase.h" // Check that the matrix m is properly reconstructed and that the U and V factors are unitary // The SVD must have already been computed. 
@@ -219,12 +220,33 @@ void svd_min_norm(const MatrixType& m, unsigned int computationOptions) VERIFY_IS_APPROX(x21, x3); } +template +void svd_test_solvers(const MatrixType& m, const SolverType& solver) { + Index rows, cols, cols2; + + rows = m.rows(); + cols = m.cols(); + + if(MatrixType::ColsAtCompileTime==Dynamic) + { + cols2 = internal::random(2,EIGEN_TEST_MAX_SIZE); + } + else + { + cols2 = cols; + } + typedef Matrix CMatrixType; + check_solverbase(m, solver, rows, cols, cols2); +} + // Check full, compare_to_full, least_square, and min_norm for all possible compute-options template void svd_test_all_computation_options(const MatrixType& m, bool full_only) { // if (QRPreconditioner == NoQRPreconditioner && m.rows() != m.cols()) // return; + STATIC_CHECK(( internal::is_same::value )); + SvdType fullSvd(m, ComputeFullU|ComputeFullV); CALL_SUBTEST(( svd_check_full(m, fullSvd) )); CALL_SUBTEST(( svd_least_square(m, ComputeFullU | ComputeFullV) )); @@ -234,6 +256,9 @@ void svd_test_all_computation_options(const MatrixType& m, bool full_only) // remark #111: statement is unreachable #pragma warning disable 111 #endif + + svd_test_solvers(m, fullSvd); + if(full_only) return; @@ -448,6 +473,8 @@ void svd_verify_assert(const MatrixType& m) VERIFY_RAISES_ASSERT(svd.singularValues()) VERIFY_RAISES_ASSERT(svd.matrixV()) VERIFY_RAISES_ASSERT(svd.solve(rhs)) + VERIFY_RAISES_ASSERT(svd.transpose().solve(rhs)) + VERIFY_RAISES_ASSERT(svd.adjoint().solve(rhs)) MatrixType a = MatrixType::Zero(rows, cols); a.setZero(); svd.compute(a, 0); From bba2f05064005b9d183fc5e8566b7290b9791827 Mon Sep 17 00:00:00 2001 From: Patrick Peltzer Date: Thu, 17 Jan 2019 11:54:37 +0100 Subject: [PATCH 155/295] Boosttest only available for Boost version >= 1.53.0 --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 67b35a61e..6213a6a65 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -359,7 +359,7 @@ 
if(EIGEN_TEST_EIGEN2) endif() # boost MP unit test -find_package(Boost) +find_package(Boost 1.53.0) if(Boost_FOUND) include_directories(${Boost_INCLUDE_DIRS}) ei_add_test(boostmultiprec "" "${Boost_LIBRARIES}") From be05d0030d7c2a83a2cc924d9c3aae6ad81cda4f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 12:01:00 +0100 Subject: [PATCH 156/295] Make FullPivLU use conjugateIf<> --- Eigen/src/LU/FullPivLU.h | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 68930ea53..ef93ec5eb 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -815,29 +815,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType // Step 1 c = permutationQ().inverse() * rhs; - if (Conjugate) { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(smalldim)); - } else { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(smalldim)); - } + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(smalldim)); // Step 4 PermutationPType invp = permutationP().inverse().eval(); From b57c9787b1b7c7436f95814cab6e3551f49dda6f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 16:55:42 +0100 Subject: [PATCH 
157/295] Cleanup useless const_cast and add missing broadcast assignment tests --- Eigen/src/Core/VectorwiseOp.h | 10 +++++----- test/vectorwiseop.cpp | 13 +++++++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index ea0a092a5..db0b9f8c4 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -557,7 +557,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME - return const_cast(m_matrix = extendedTo(other.derived())); + return m_matrix = extendedTo(other.derived()); } /** Adds the vector \a other to each subvector of \c *this */ @@ -567,7 +567,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix += extendedTo(other.derived())); + return m_matrix += extendedTo(other.derived()); } /** Substracts the vector \a other to each subvector of \c *this */ @@ -577,7 +577,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix -= extendedTo(other.derived())); + return m_matrix -= extendedTo(other.derived()); } /** Multiples each subvector of \c *this by the vector \a other */ @@ -589,7 +589,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix *= extendedTo(other.derived()); - return const_cast(m_matrix); + return m_matrix; } /** Divides each subvector of \c *this by the vector \a other */ @@ -601,7 +601,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix /= extendedTo(other.derived()); - 
return const_cast(m_matrix); + return m_matrix; } /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */ diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 4b9d2d570..8ee58841a 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -150,6 +150,19 @@ template void vectorwiseop_matrix(const MatrixType& m) RealColVectorType rcres; RealRowVectorType rrres; + // test broadcast assignment + m2 = m1; + m2.colwise() = colvec; + for(Index j=0; j1) + VERIFY_RAISES_ASSERT(m2.colwise() = colvec.transpose()); + if(cols>1) + VERIFY_RAISES_ASSERT(m2.rowwise() = rowvec.transpose()); + // test addition m2 = m1; From 4b7cf7ff82a5bfa252dd2e00b449073272482d65 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 17:35:32 +0100 Subject: [PATCH 158/295] Extend reshaped unit tests and remove useless const_cast --- Eigen/src/Core/Reshaped.h | 2 +- test/reshape.cpp | 48 +++++++++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h index b7bd1b292..c955815e6 100644 --- a/Eigen/src/Core/Reshaped.h +++ b/Eigen/src/Core/Reshaped.h @@ -191,7 +191,7 @@ class ReshapedImpl_dense /** \returns the nested expression */ EIGEN_DEVICE_FUNC typename internal::remove_reference::type& - nestedExpression() { return m_xpr.const_cast_derived(); } + nestedExpression() { return m_xpr; } protected: diff --git a/test/reshape.cpp b/test/reshape.cpp index 14a02bb3b..7b16742a2 100644 --- a/test/reshape.cpp +++ b/test/reshape.cpp @@ -49,10 +49,25 @@ void check_auto_reshape4x4(MatType m) VERIFY(is_same_eq(m.template reshaped(AutoSize, fix< 1> ), m.template reshaped(v16, fix< 1>))); } +template +void check_direct_access_reshape4x4(MatType , internal::FixedInt) {} + +template +void check_direct_access_reshape4x4(MatType m, internal::FixedInt<0>) { + VERIFY_IS_EQUAL(m.reshaped( 1, 16).data(), m.data()); + VERIFY_IS_EQUAL(m.reshaped( 1, 
16).innerStride(), 1); + + VERIFY_IS_EQUAL(m.reshaped( 2, 8).data(), m.data()); + VERIFY_IS_EQUAL(m.reshaped( 2, 8).innerStride(), 1); + VERIFY_IS_EQUAL(m.reshaped( 2, 8).outerStride(), 2); +} + // just test a 4x4 matrix, enumerate all combination manually template void reshape4x4(MatType m) { + typedef typename MatType::Scalar Scalar; + internal::VariableAndFixedInt v1( 1); internal::VariableAndFixedInt v2( 2); internal::VariableAndFixedInt v4( 4); @@ -124,12 +139,7 @@ void reshape4x4(MatType m) check_auto_reshape4x4 (m.transpose()); check_auto_reshape4x4(m.transpose()); - VERIFY_IS_EQUAL(m.reshaped( 1, 16).data(), m.data()); - VERIFY_IS_EQUAL(m.reshaped( 1, 16).innerStride(), 1); - - VERIFY_IS_EQUAL(m.reshaped( 2, 8).data(), m.data()); - VERIFY_IS_EQUAL(m.reshaped( 2, 8).innerStride(), 1); - VERIFY_IS_EQUAL(m.reshaped( 2, 8).outerStride(), 2); + check_direct_access_reshape4x4(m,fix); if((MatType::Flags&RowMajorBit)==0) { @@ -150,8 +160,8 @@ void reshape4x4(MatType m) VERIFY_IS_EQUAL( m28r1, m28r2); VERIFY(is_same_eq(m.reshaped(v16,fix<1>), m.reshaped())); - VERIFY_IS_EQUAL(m.reshaped(16,1), m.reshaped()); - VERIFY_IS_EQUAL(m.reshaped(1,16), m.reshaped().transpose()); + VERIFY_IS_EQUAL(m.reshaped(16,1).eval(), m.reshaped().eval()); + VERIFY_IS_EQUAL(m.reshaped(1,16).eval(), m.reshaped().transpose().eval()); VERIFY_IS_EQUAL(m.reshaped().reshaped(2,8), m.reshaped(2,8)); VERIFY_IS_EQUAL(m.reshaped().reshaped(4,4), m.reshaped(4,4)); VERIFY_IS_EQUAL(m.reshaped().reshaped(8,2), m.reshaped(8,2)); @@ -163,12 +173,30 @@ void reshape4x4(MatType m) VERIFY(is_same_eq(m.reshaped(AutoSize,fix<1>), m.reshaped())); VERIFY_IS_EQUAL(m.template reshaped(fix<1>,AutoSize), m.transpose().reshaped().transpose()); + + // check assignment + { + Matrix m1x(m.size()); m1x.setRandom(); + VERIFY_IS_APPROX(m.reshaped() = m1x, m1x); + VERIFY_IS_APPROX(m, m1x.reshaped(4,4)); + + Matrix m28(2,8); m28.setRandom(); + VERIFY_IS_APPROX(m.reshaped(2,8) = m28, m28); + VERIFY_IS_APPROX(m, 
m28.reshaped(4,4)); + VERIFY_IS_APPROX(m.template reshaped(2,8) = m28, m28); + + Matrix m24(2,4); m24.setRandom(); + VERIFY_IS_APPROX(m(seq(0,last,2),all).reshaped(2,4) = m24, m24); + + // check constness: + m.reshaped(2,8).nestedExpression() = m; + } } EIGEN_DECLARE_TEST(reshape) { - typedef Matrix RowMatrixXi; - typedef Matrix RowMatrix4i; + typedef Matrix RowMatrixXi; + typedef Matrix RowMatrix4i; MatrixXi mx = MatrixXi::Random(4, 4); Matrix4i m4 = Matrix4i::Random(4, 4); RowMatrixXi rmx = RowMatrixXi::Random(4, 4); From 0fe6b7d687430fd1fe2d390da2f09fcb8ddc8093 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 18:27:25 +0100 Subject: [PATCH 159/295] Make nestByValue works again (broken since 3.3) and add unit tests. --- Eigen/src/Core/NestByValue.h | 69 ++++++++++++------------------------ test/CMakeLists.txt | 1 + test/nestbyvalue.cpp | 37 +++++++++++++++++++ 3 files changed, 60 insertions(+), 47 deletions(-) create mode 100644 test/nestbyvalue.cpp diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 01cf192e9..239bbba63 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -16,7 +16,11 @@ namespace Eigen { namespace internal { template struct traits > : public traits -{}; +{ + enum { + Flags = traits::Flags & ~NestByRefBit + }; +}; } /** \class NestByValue @@ -43,55 +47,11 @@ template class NestByValue EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const - { - return m_expression.coeff(row, col); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) - { - return m_expression.const_cast_derived().coeffRef(row, 
col); - } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const - { - return m_expression.coeff(index); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) - { - return m_expression.const_cast_derived().coeffRef(index); - } - - template - EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const - { - return m_expression.template packet(row, col); - } - - template - EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(row, col, x); - } - - template - EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const - { - return m_expression.template packet(index); - } - - template - EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(index, x); - } EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; } + EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; } + protected: const ExpressionType m_expression; }; @@ -105,6 +65,21 @@ DenseBase::nestByValue() const return NestByValue(derived()); } +namespace internal { + +// Evaluator of Solve -> eval into a temporary +template +struct evaluator > + : public evaluator +{ + typedef evaluator Base; + + EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue& xpr) + : Base(xpr.nestedExpression()) + {} +}; +} + } // end namespace Eigen #endif // EIGEN_NESTBYVALUE_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6213a6a65..794befa69 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -268,6 +268,7 @@ ei_add_test(sparselu) ei_add_test(sparseqr) ei_add_test(umeyama) ei_add_test(nesting_ops "${CMAKE_CXX_FLAGS_DEBUG}") +ei_add_test(nestbyvalue) ei_add_test(zerosized) ei_add_test(dontalign) ei_add_test(evaluators) diff --git a/test/nestbyvalue.cpp b/test/nestbyvalue.cpp new file mode 100644 index 
000000000..c5356bc24 --- /dev/null +++ b/test/nestbyvalue.cpp @@ -0,0 +1,37 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define TEST_ENABLE_TEMPORARY_TRACKING + +#include "main.h" + +typedef NestByValue CpyMatrixXd; +typedef CwiseBinaryOp,const CpyMatrixXd,const CpyMatrixXd> XprType; + +XprType get_xpr_with_temps(const MatrixXd& a) +{ + MatrixXd t1 = a.rowwise().reverse(); + MatrixXd t2 = a+a; + return t1.nestByValue() + t2.nestByValue(); +} + +EIGEN_DECLARE_TEST(nestbyvalue) +{ + for(int i = 0; i < g_repeat; i++) { + Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); + Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); + MatrixXd a = MatrixXd(rows,cols); + nb_temporaries = 0; + XprType x = get_xpr_with_temps(a); + VERIFY_IS_EQUAL(nb_temporaries,6); + MatrixXd b = x; + VERIFY_IS_EQUAL(nb_temporaries,6+1); + VERIFY_IS_APPROX(b, a.rowwise().reverse().eval() + (a+a).eval()); + } +} From ee3662abc57230dbe7bf6e5259f335f7fa4750f1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Jan 2019 18:27:49 +0100 Subject: [PATCH 160/295] Remove some useless const_cast --- Eigen/src/Core/CwiseUnaryView.h | 2 +- Eigen/src/Core/IndexedView.h | 2 +- Eigen/src/Core/Reverse.h | 2 +- Eigen/src/Core/Solve.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 271033056..21cf5ea9e 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -81,7 +81,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + nestedExpression() { return m_matrix; } protected: MatrixTypeNested m_matrix; diff --git 
a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 3485d8f46..377f8a5cc 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -132,7 +132,7 @@ public: /** \returns the nested expression */ typename internal::remove_reference::type& - nestedExpression() { return m_xpr.const_cast_derived(); } + nestedExpression() { return m_xpr; } /** \returns a const reference to the object storing/generating the row indices */ const RowIndices& rowIndices() const { return m_rowIndices; } diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 8b6b3ab03..711dbcf9a 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -203,7 +203,7 @@ struct vectorwise_reverse_inplace_impl template EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { - internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); + internal::vectorwise_reverse_inplace_impl::run(m_matrix); } } // end namespace Eigen diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h index 2bf940a26..ec4b4a987 100644 --- a/Eigen/src/Core/Solve.h +++ b/Eigen/src/Core/Solve.h @@ -19,7 +19,7 @@ template class S * * \brief Pseudo expression representing a solving operation * - * \tparam Decomposition the type of the matrix or decomposion object + * \tparam Decomposition the type of the matrix or decomposition object * \tparam Rhstype the type of the right-hand side * * This class represents an expression of A.solve(B) From d575505d2589877a9d712feb0d455704d3834f12 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 17 Jan 2019 19:14:07 +0100 Subject: [PATCH 161/295] After fixing bug #1557, boostmultiprec_7 failed with NumericalIssue instead of NoConvergence (all that matters here is no Success) --- test/eigensolver_generic.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index cfb31e49e..7adb98665 100644 --- a/test/eigensolver_generic.cpp +++ 
b/test/eigensolver_generic.cpp @@ -76,7 +76,7 @@ template void eigensolver(const MatrixType& m) // Test matrix with NaN a(0,0) = std::numeric_limits::quiet_NaN(); EigenSolver eiNaN(a); - VERIFY_IS_EQUAL(eiNaN.info(), NoConvergence); + VERIFY_IS_NOT_EQUAL(eiNaN.info(), Success); } // regression test for bug 1098 From 2eccbaf3f73f34a2bac3420377ea844358dfaf5a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 17 Jan 2019 17:45:08 -0800 Subject: [PATCH 162/295] Add missing logical packet ops for GPU and NEON. --- Eigen/src/Core/arch/GPU/PacketMath.h | 111 ++++++++++++++++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 2 + 2 files changed, 113 insertions(+) diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index ddf37b9c1..e3b2d56ec 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -100,6 +100,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do return make_double2(from, from); } +namespace { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) | __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) | + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) ^ __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, + const double& b) { + return 
__longlong_as_double(__double_as_longlong(a) ^ + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & ~__float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + ~__double_as_longlong(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, + const float& b) { + return __int_as_float(a == b ? 0xffffffffu : 0u); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, + const double& b) { + return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull); +} + +} // namespace + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, + const float4& b) { + return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), + bitwise_and(a.z, b.z), bitwise_and(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand(const double2& a, + const double2& b) { + return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por(const float4& a, + const float4& b) { + return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), + bitwise_or(a.z, b.z), bitwise_or(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por(const double2& a, + const double2& b) { + return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor(const float4& a, + const float4& b) { + return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), + bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor(const double2& a, + const double2& b) { + return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pandnot(const float4& a, + const float4& b) { + return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), + bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pandnot(const double2& a, const double2& b) { + return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq(const float4& a, + const float4& b) { + return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), + eq_mask(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_eq(const double2& a, const double2& b) { + return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); +} template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { return make_float4(a, a+1, a+2, a+3); diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index a8a7b63c9..76e6b3966 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -711,6 +711,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, con return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); } + template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } From da0a41b9ce5cd0482c6247f510f8c4ee08cff9ba Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 18 Jan 2019 10:41:14 +0100 Subject: [PATCH 163/295] Mask unused-parameter warnings, when building with NDEBUG --- Eigen/src/Core/SolverBase.h | 1 + Eigen/src/QR/CompleteOrthogonalDecomposition.h | 1 + Eigen/src/SVD/SVDBase.h 
| 1 + 3 files changed, 3 insertions(+) diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h index 055d3ddc1..501461042 100644 --- a/Eigen/src/Core/SolverBase.h +++ b/Eigen/src/Core/SolverBase.h @@ -146,6 +146,7 @@ class SolverBase : public EigenBase template void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); eigen_assert(derived().m_isInitialized && "Solver is not initialized."); eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b"); } diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index d62628087..2fc3c871a 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -383,6 +383,7 @@ template class CompleteOrthogonalDecomposition template void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b"); } diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index ed1e9f20e..68df48921 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -241,6 +241,7 @@ protected: template void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); eigen_assert(m_isInitialized && "SVD is not initialized."); eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice)."); eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b"); From d18f49cbb37d1e4755e7ebfafe9bda949156355c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 Jan 2019 11:12:42 
+0100 Subject: [PATCH 164/295] Fix compilation of unit tests with gcc and c++17 --- test/main.h | 1 + 1 file changed, 1 insertion(+) diff --git a/test/main.h b/test/main.h index 9c1465e9a..a8226ab19 100644 --- a/test/main.h +++ b/test/main.h @@ -17,6 +17,7 @@ #include #include #include +#include // The following includes of STL headers have to be done _before_ the // definition of macros min() and max(). The reason is that many STL From 92774f02758d16eff9d6b73611566fd42eb865f4 Mon Sep 17 00:00:00 2001 From: nluehr Date: Fri, 18 Jan 2019 16:10:09 -0600 Subject: [PATCH 165/295] Replace host_define.h with cuda_runtime_api.h --- Eigen/src/Core/util/ConfigureVectorization.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index b4d423cb0..68765d4b2 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -386,7 +386,7 @@ #endif #if defined(EIGEN_HAS_CUDA_FP16) - #include + #include #include #endif From 543529da6a1eabf415f4f8b56495fad76b57ba22 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Jan 2019 15:30:50 +0100 Subject: [PATCH 166/295] Add more extensive tests of Array ctors, including {} variants --- test/array_cwise.cpp | 83 +++++++++++++++++++++++++++++++++++++------- test/main.h | 9 +++-- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index fbc63a81d..9e4adb701 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -92,15 +92,30 @@ template void array(const ArrayType& m) ArrayType::RowsAtCompileTime==Dynamic?2:ArrayType::RowsAtCompileTime, ArrayType::ColsAtCompileTime==Dynamic?2:ArrayType::ColsAtCompileTime, ArrayType::Options> FixedArrayType; - FixedArrayType f1(s1); - VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1)); - FixedArrayType f2(numext::real(s1)); - VERIFY_IS_APPROX(f2, 
FixedArrayType::Constant(numext::real(s1))); - FixedArrayType f3((int)100*numext::real(s1)); - VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1))); - f1.setRandom(); - FixedArrayType f4(f1.data()); - VERIFY_IS_APPROX(f4, f1); + { + FixedArrayType f1(s1); + VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1)); + FixedArrayType f2(numext::real(s1)); + VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1))); + FixedArrayType f3((int)100*numext::real(s1)); + VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1))); + f1.setRandom(); + FixedArrayType f4(f1.data()); + VERIFY_IS_APPROX(f4, f1); + } + #if EIGEN_HAS_CXX11 + { + FixedArrayType f1{s1}; + VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1)); + FixedArrayType f2{numext::real(s1)}; + VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1))); + FixedArrayType f3{(int)100*numext::real(s1)}; + VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1))); + f1.setRandom(); + FixedArrayType f4{f1.data()}; + VERIFY_IS_APPROX(f4, f1); + } + #endif // pow VERIFY_IS_APPROX(m1.pow(2), m1.square()); @@ -120,10 +135,51 @@ template void array(const ArrayType& m) // Check possible conflicts with 1D ctor typedef Array OneDArrayType; - OneDArrayType o1(rows); - VERIFY(o1.size()==rows); - OneDArrayType o4((int)rows); - VERIFY(o4.size()==rows); + { + OneDArrayType o1(rows); + VERIFY(o1.size()==rows); + OneDArrayType o2(static_cast(rows)); + VERIFY(o2.size()==rows); + } + #if EIGEN_HAS_CXX11 + { + OneDArrayType o1{rows}; + VERIFY(o1.size()==rows); + OneDArrayType o4{int(rows)}; + VERIFY(o4.size()==rows); + } + #endif + // Check possible conflicts with 2D ctor + typedef Array TwoDArrayType; + typedef Array ArrayType2; + { + TwoDArrayType o1(rows,cols); + VERIFY(o1.rows()==rows); + VERIFY(o1.cols()==cols); + TwoDArrayType o2(static_cast(rows),static_cast(cols)); + VERIFY(o2.rows()==rows); + VERIFY(o2.cols()==cols); + + ArrayType2 o3(rows,cols); + 
VERIFY(o3(0)==Scalar(rows) && o3(1)==Scalar(cols)); + ArrayType2 o4(static_cast(rows),static_cast(cols)); + VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols)); + } + #if EIGEN_HAS_CXX11 + { + TwoDArrayType o1{rows,cols}; + VERIFY(o1.rows()==rows); + VERIFY(o1.cols()==cols); + TwoDArrayType o2{int(rows),int(cols)}; + VERIFY(o2.rows()==rows); + VERIFY(o2.cols()==cols); + + ArrayType2 o3{rows,cols}; + VERIFY(o3(0)==Scalar(rows) && o3(1)==Scalar(cols)); + ArrayType2 o4{int(rows),int(cols)}; + VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols)); + } + #endif } template void comparisons(const ArrayType& m) @@ -467,6 +523,7 @@ EIGEN_DECLARE_TEST(array_cwise) CALL_SUBTEST_4( array(ArrayXXcf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_5( array(ArrayXXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( array(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( comparisons(Array()) ); diff --git a/test/main.h b/test/main.h index a8226ab19..8a68a84ee 100644 --- a/test/main.h +++ b/test/main.h @@ -411,8 +411,6 @@ inline bool test_isApprox(const unsigned short& a, const unsigned short& b) { return internal::isApprox(a, b, test_precision()); } inline bool test_isApprox(const unsigned int& a, const unsigned int& b) { return internal::isApprox(a, b, test_precision()); } -inline bool test_isApprox(const long& a, const long& b) -{ return internal::isApprox(a, b, test_precision()); } inline bool test_isApprox(const unsigned long& a, const unsigned long& b) { return internal::isApprox(a, b, test_precision()); } @@ -423,6 +421,13 @@ inline bool test_isMuchSmallerThan(const int& a, const int& b) inline bool test_isApproxOrLessThan(const int& a, const int& b) { return 
internal::isApproxOrLessThan(a, b, test_precision()); } +inline bool test_isApprox(const long& a, const long& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isMuchSmallerThan(const long& a, const long b) +{ return internal::isMuchSmallerThan(a, b, test_precision()); } +inline bool test_isApproxOrLessThan(const long& a, const long& b) +{ return internal::isApproxOrLessThan(a, b, test_precision()); } + inline bool test_isApprox(const float& a, const float& b) { return internal::isApprox(a, b, test_precision()); } inline bool test_isMuchSmallerThan(const float& a, const float& b) From db152b9ee6effd3799f70a621f495c427cb3c33f Mon Sep 17 00:00:00 2001 From: David Tellenbach Date: Mon, 21 Jan 2019 16:25:57 +0100 Subject: [PATCH 167/295] PR 572: Add initializer list constructors to Matrix and Array (include unit tests and doc) - {1,2,3,4,5,...} for fixed-size vectors only - {{1,2,3},{4,5,6}} for the general cases - {{1,2,3,4,5,....}} is allowed for both row and column-vector --- Eigen/src/Core/Array.h | 48 ++- Eigen/src/Core/Matrix.h | 50 ++- Eigen/src/Core/PlainObjectBase.h | 71 ++++ doc/TutorialMatrixClass.dox | 29 +- .../Array_initializer_list2_cxx11.cpp | 3 + doc/snippets/Array_initializer_list_cxx11.cpp | 6 + .../Matrix_initializer_list2_cxx11.cpp | 3 + .../Matrix_initializer_list_cxx11.cpp | 6 + ...s.cpp => Tutorial_std_sort_rows_cxx11.cpp} | 0 failtest/CMakeLists.txt | 5 + failtest/initializer_list_1.cpp | 14 + failtest/initializer_list_2.cpp | 16 + test/CMakeLists.txt | 3 + test/initializer_list_construction.cpp | 371 ++++++++++++++++++ 14 files changed, 622 insertions(+), 3 deletions(-) create mode 100644 doc/snippets/Array_initializer_list2_cxx11.cpp create mode 100644 doc/snippets/Array_initializer_list_cxx11.cpp create mode 100644 doc/snippets/Matrix_initializer_list2_cxx11.cpp create mode 100644 doc/snippets/Matrix_initializer_list_cxx11.cpp rename doc/snippets/{Tutorial_std_sort_rows.cpp => Tutorial_std_sort_rows_cxx11.cpp} 
(100%) create mode 100644 failtest/initializer_list_1.cpp create mode 100644 failtest/initializer_list_2.cpp create mode 100644 test/initializer_list_construction.cpp diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 16770fc7b..7ef37de7c 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -178,6 +178,20 @@ class Array Base::_check_template_params(); this->template _init2(val0, val1); } + + #if EIGEN_HAS_CXX11 + template + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Array(const std::initializer_list& list, + typename internal::enable_if::value, T>::type* = 0, + typename internal::enable_if::type* = 0) : Base(list) {} + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const std::initializer_list >& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + #else /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); @@ -199,7 +213,39 @@ class Array Array(Index rows, Index cols); /** constructs an initialized 2D vector with given coefficients */ Array(const Scalar& val0, const Scalar& val1); - #endif + + /** \copydoc PlainObjectBase::PlainObjectBase(const std::initializer_list& list) + * + * Example: \include Array_initializer_list2_cxx11.cpp + * Output: \verbinclude Array_initializer_list2_cxx11.out + * + * \sa Array::Array(const Scalar& val0, const Scalar& val1) + * \sa Array::Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Array(const std::initializer_list& list); + + /** + * \brief Constructs an array and initializes it by elements given by an initializer list of initializer lists \cpp11 + * + * This constructor distinguishes between the construction of arbitrary array and arrays with one fixed dimension, + * + * In the general case, the constructor takes an initializer list, representing the array rows, that contains for + * each row an initializer list, representing 
a single column, containing scalar values. Each of the inner + * initializer lists must contain the same number of elements. + * + * In the case of array with one fixed dimension, an initializer list containing just one other initializer list + * that contains the array elements can be passed. Therefore \c Array\c {{1,\c 2,\c 3,\c 4}} is + * legal and the more verbose syntax \c Array\c {{1},\c {2},\c {3},\c {4}} can be avoided. + * + * \warning In the case of fixed-sized arrays, the initializer list size must be equal to the array \a rows rows + * and \a cols columns. + * + * Example: \include Array_initializer_list_cxx11.cpp + * Output: \verbinclude Array_initializer_list_cxx11.out + */ + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const std::initializer_list >& list); + #endif // end EIGEN_PARSED_BY_DOXYGEN /** constructs an initialized 3D vector with given coefficients */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 7f4a7af93..5b375b41d 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -301,6 +301,20 @@ class Matrix Base::_check_template_params(); Base::template _init2(x, y); } + + #if EIGEN_HAS_CXX11 + template + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list& list, + typename internal::enable_if::value, T>::type* = 0, + typename internal::enable_if::type* = 0) : Base(list) {} + + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + #else /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC @@ -338,7 +352,41 @@ class Matrix /** \brief Constructs an initialized 2D vector with given coefficients */ Matrix(const Scalar& x, const Scalar& y); - #endif + + /** \copydoc PlainObjectBase::PlainObjectBase(const std::initializer_list& list) + * + * Example: \include Matrix_initializer_list2_cxx11.cpp + * Output: \verbinclude 
Matrix_initializer_list2_cxx11.out + * + * \sa Matrix::Matrix(const Scalar& x, const Scalar& y, const Scalar& z) + * \sa Matrix::Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list& list); + + /** + * \brief Constructs a matrix and initializes it by elements given by an initializer list of initializer lists \cpp11 + * + * This constructor distinguishes between the construction of arbitrary matrices and matrices with one fixed dimension, + * i.e., vectors or rowvectors. + * + * In the general case, the constructor takes an initializer list, representing the matrix rows, that contains for + * each row an initializer list, representing a single column, containing scalar values. Each of the inner + * initializer lists must contain the same number of elements. + * + * In the case of matrices with one fixed dimension, an initializer list containing just one other initializer list + * that contains the matrix elements can be passed. Therefore \c VectorXi\c {{1,\c 2,\c 3,\c 4}} is legal and the more + * verbose syntax \c VectorXi\c {{1},\c {2},\c {3},\c {4}} can be avoided. + * + * \warning In the case of fixed-sized matrices, the initializer list size must be equal to the matrix \a rows rows + * and \a cols columns. 
+ * + * Example: \include Matrix_initializer_list_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_cxx11.out + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list); + + #endif // end EIGEN_PARSED_BY_DOXYGEN /** \brief Constructs an initialized 3D vector with given coefficients */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index f551dabb0..1a996b0aa 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -526,6 +526,77 @@ class PlainObjectBase : public internal::dense_xpr_base::type // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** + * \brief Construct a vector with fixed number of rows or a rowvector with fixed number of + * columns by passing an initializer list \cpp11 + * + * \only_for_vectors + * + * \warning To construct a vector or rowvector of fixed size, the number of values passed through + * the initializer list must match the fixed number of rows in the vector case or + * the fixed number of columns in the rowvector case. 
*/ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list& list); + + /** + * \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer + * lists \cpp11 */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list); + #else // EIGEN_PARSED_BY_DOXYGEN + #if EIGEN_HAS_CXX11 + template + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list& list, + typename internal::enable_if::value, T>::type* = 0, + typename internal::enable_if::type* = 0) + : m_storage() + { + _check_template_params(); + EIGEN_STATIC_ASSERT_FIXED_SIZE(PlainObjectBase); + resize(list.size()); + std::copy(list.begin(), list.end(), m_storage.data()); + } + + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list) + : m_storage() + { + _check_template_params(); + + size_t list_size = 0; + if (list.begin() != list.end()) { + list_size = list.begin()->size(); + } + + // This is to allow syntax like VectorXi {{1, 2, 3, 4}} + if (ColsAtCompileTime == 1 && list.size() == 1) { + eigen_assert(list_size == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + resize(list_size, ColsAtCompileTime); + std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data()); + } else { + eigen_assert(list.size() == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + eigen_assert(list_size == static_cast(ColsAtCompileTime) || ColsAtCompileTime == Dynamic); + resize(list.size(), list_size); + + Index row_index = 0; + for (const std::initializer_list& row : list) { + eigen_assert(list_size == row.size()); + Index col_index = 0; + for (const Scalar& e : row) { + coeffRef(row_index, col_index) = e; + ++col_index; + } + ++row_index; + } + } + } + #endif // end EIGEN_HAS_CXX11 + #endif // end EIGEN_PARSED_BY_DOXYGEN + /** \sa PlainObjectBase::operator=(const EigenBase&) */ 
template EIGEN_DEVICE_FUNC diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox index 7ea0cd789..fc0ce5b1e 100644 --- a/doc/TutorialMatrixClass.dox +++ b/doc/TutorialMatrixClass.dox @@ -101,13 +101,40 @@ Matrix3f a(3,3); \endcode and is a no-operation. -Finally, we also offer some constructors to initialize the coefficients of small fixed-size vectors up to size 4: +Additionally, we also offer some constructors to initialize the coefficients of small fixed-size vectors up to size 4: \code Vector2d a(5.0, 6.0); Vector3d b(5.0, 6.0, 7.0); Vector4d c(5.0, 6.0, 7.0, 8.0); \endcode +If C++11 is enabled, matrices can be constructed and initialized using initializer lists. In the case of fixed-sized vectors +and rowvectors a simple initializer list can be passed: +\code +Vector2i a {1, 2}; // A vector containing the elements {1, 2} +Matrix b {1, 2, 3, 4}; // A row-vector containing the elements {1, 2, 3, 4} +Matrix c {1, 2, 3, 4}; // A vector containing the elements {1, 2, 3, 4} +\endcode + +In the case of fixed or dynamically sized matrices an initializer list containing an initializer list for each row +can be passed. If the matrix is fixed-sized, the number of elements that are passed must match the dimensions. +\code +MatrixXi a { + {1, 2}, // first row + {3, 4} // second row +}; +Matrix b { + {2.0, 3.0, 4.0}, + {5.0, 6.0, 7.0}, +}; +\endcode + +In the case of vectors and rowvectors, the following shorthand notation can be used: +\code +VectorXd a {{1.5, 2.5, 3.5}}; // A vector with 3 rows +RowVectorXd b {{1.0, 2.0, 3.0, 4.0}}; // A rowvector with 4 columns +\endcode + \section TutorialMatrixCoeffAccessors Coefficient accessors The primary coefficient accessors and mutators in Eigen are the overloaded parenthesis operators. 
diff --git a/doc/snippets/Array_initializer_list2_cxx11.cpp b/doc/snippets/Array_initializer_list2_cxx11.cpp new file mode 100644 index 000000000..20e74546a --- /dev/null +++ b/doc/snippets/Array_initializer_list2_cxx11.cpp @@ -0,0 +1,3 @@ +Array a {1, 2, 3, 4, 5, 6}; +Array b {1, 2, 3}; +cout << a << "\n\n" << b << endl; \ No newline at end of file diff --git a/doc/snippets/Array_initializer_list_cxx11.cpp b/doc/snippets/Array_initializer_list_cxx11.cpp new file mode 100644 index 000000000..d2f46e268 --- /dev/null +++ b/doc/snippets/Array_initializer_list_cxx11.cpp @@ -0,0 +1,6 @@ +Array a { + {1, 2, 3}, + {3, 4, 5} +}; +Array v {{1, 2, 3, 4, 5}}; +cout << a << "\n\n" << v << endl; \ No newline at end of file diff --git a/doc/snippets/Matrix_initializer_list2_cxx11.cpp b/doc/snippets/Matrix_initializer_list2_cxx11.cpp new file mode 100644 index 000000000..2fde52b8d --- /dev/null +++ b/doc/snippets/Matrix_initializer_list2_cxx11.cpp @@ -0,0 +1,3 @@ +Matrix a {1, 2, 3, 4, 5, 6}; +Matrix b {1, 2, 3}; +cout << a << "\n\n" << b << endl; \ No newline at end of file diff --git a/doc/snippets/Matrix_initializer_list_cxx11.cpp b/doc/snippets/Matrix_initializer_list_cxx11.cpp new file mode 100644 index 000000000..d68787ab6 --- /dev/null +++ b/doc/snippets/Matrix_initializer_list_cxx11.cpp @@ -0,0 +1,6 @@ +Matrix m { + {1, 2, 3}, + {4, 5, 6} +}; +VectorXi v {{1, 2}}; +cout << m << "\n\n" << v << endl; \ No newline at end of file diff --git a/doc/snippets/Tutorial_std_sort_rows.cpp b/doc/snippets/Tutorial_std_sort_rows_cxx11.cpp similarity index 100% rename from doc/snippets/Tutorial_std_sort_rows.cpp rename to doc/snippets/Tutorial_std_sort_rows_cxx11.cpp diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt index f95503d7e..256e541e2 100644 --- a/failtest/CMakeLists.txt +++ b/failtest/CMakeLists.txt @@ -63,3 +63,8 @@ ei_add_failtest("bdcsvd_int") ei_add_failtest("eigensolver_int") ei_add_failtest("eigensolver_cplx") +if(EIGEN_TEST_CXX11) + 
ei_add_failtest("initializer_list_1") + ei_add_failtest("initializer_list_2") +endif() + diff --git a/failtest/initializer_list_1.cpp b/failtest/initializer_list_1.cpp new file mode 100644 index 000000000..92dfd1f65 --- /dev/null +++ b/failtest/initializer_list_1.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Core" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define ROWS Dynamic +#else +#define ROWS 3 +#endif + +using namespace Eigen; + +int main() +{ + Matrix {1, 2, 3}; +} diff --git a/failtest/initializer_list_2.cpp b/failtest/initializer_list_2.cpp new file mode 100644 index 000000000..1996050a7 --- /dev/null +++ b/failtest/initializer_list_2.cpp @@ -0,0 +1,16 @@ +#include "../Eigen/Core" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define ROWS Dynamic +#define COLS Dynamic +#else +#define ROWS 3 +#define COLS 1 +#endif + +using namespace Eigen; + +int main() +{ + Matrix {1, 2, 3}; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 794befa69..3dbb426eb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -287,6 +287,9 @@ ei_add_test(half_float) ei_add_test(array_of_string) ei_add_test(num_dimensions) ei_add_test(stl_iterators) +if(EIGEN_TEST_CXX11) + ei_add_test(initializer_list_construction) +endif() add_executable(bug1213 bug1213.cpp bug1213_main.cpp) diff --git a/test/initializer_list_construction.cpp b/test/initializer_list_construction.cpp new file mode 100644 index 000000000..5f281ea4e --- /dev/null +++ b/test/initializer_list_construction.cpp @@ -0,0 +1,371 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 David Tellenbach +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_NO_STATIC_ASSERT + +#include "main.h" + +template::IsInteger> +struct TestMethodDispatching { + static void run() {} +}; + +template +struct TestMethodDispatching { + static void run() + { + { + Matrix m {3, 4}; + Array a {3, 4}; + VERIFY(m.rows() == 3); + VERIFY(m.cols() == 4); + VERIFY(a.rows() == 3); + VERIFY(a.cols() == 4); + } + { + Matrix m {3, 4}; + Array a {3, 4}; + VERIFY(m(0) == 3); + VERIFY(m(1) == 4); + VERIFY(a(0) == 3); + VERIFY(a(1) == 4); + } + { + Matrix m {3, 4}; + Array a {3, 4}; + VERIFY(m(0) == 3); + VERIFY(m(1) == 4); + VERIFY(a(0) == 3); + VERIFY(a(1) == 4); + } + } +}; + +template void singleInitializerListVectorConstruction() +{ + Scalar raw[4]; + for(int k = 0; k < 4; ++k) { + raw[k] = internal::random(); + } + { + Matrix m { raw[0], raw[1], raw[2], raw[3] }; + Array a { raw[0], raw[1], raw[2], raw[3] }; + for(int k = 0; k < 4; ++k) { + VERIFY(m(k) == raw[k]); + } + for(int k = 0; k < 4; ++k) { + VERIFY(a(k) == raw[k]); + } + VERIFY_IS_EQUAL(m, (Matrix(raw[0], raw[1], raw[2], raw[3]))); + VERIFY_IS_EQUAL(m, (Matrix({raw[0], raw[1], raw[2], raw[3]}))); + VERIFY((a == (Array(raw[0], raw[1], raw[2], raw[3]))).all()); + VERIFY((a == (Array({raw[0], raw[1], raw[2], raw[3]}))).all()); + } + { + Matrix m { raw[0], raw[1], raw[2], raw[3] }; + Array a { raw[0], raw[1], raw[2], raw[3] }; + for(int k = 0; k < 4; ++k) { + VERIFY(m(k) == raw[k]); + } + for(int k = 0; k < 4; ++k) { + VERIFY(a(k) == raw[k]); + } + VERIFY_IS_EQUAL(m, (Matrix(raw[0], raw[1], raw[2], raw[3]))); + VERIFY_IS_EQUAL(m, (Matrix({raw[0], raw[1], raw[2], raw[3]}))); + VERIFY((a == (Array(raw[0], raw[1], raw[2], raw[3]))).all()); + VERIFY((a == (Array({raw[0], raw[1], raw[2], raw[3]}))).all()); + } +} + + +template void initializerListVectorConstruction() +{ + Scalar raw[4]; + for(int k = 0; k < 4; ++k) { + raw[k] = internal::random(); + } + { + Matrix m { {raw[0]}, {raw[1]},{raw[2]},{raw[3]} }; + Array a { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }; + for(int k = 0; 
k < 4; ++k) { + VERIFY(m(k) == raw[k]); + } + for(int k = 0; k < 4; ++k) { + VERIFY(a(k) == raw[k]); + } + VERIFY_IS_EQUAL(m, (Matrix({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))); + VERIFY((a == (Array({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))).all()); + } + { + Matrix m { {raw[0], raw[1], raw[2], raw[3]} }; + Array a { {raw[0], raw[1], raw[2], raw[3]} }; + for(int k = 0; k < 4; ++k) { + VERIFY(m(k) == raw[k]); + } + for(int k = 0; k < 4; ++k) { + VERIFY(a(k) == raw[k]); + } + VERIFY_IS_EQUAL(m, (Matrix({{raw[0],raw[1],raw[2],raw[3]}}))); + VERIFY((a == (Array({{raw[0],raw[1],raw[2],raw[3]}}))).all()); + } + { + Matrix m { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }; + Array a { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }; + for(int k=0; k < 4; ++k) { + VERIFY(m(k) == raw[k]); + } + for(int k=0; k < 4; ++k) { + VERIFY(a(k) == raw[k]); + } + VERIFY_IS_EQUAL(m, (Matrix({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))); + VERIFY((a == (Array({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))).all()); + } + { + Matrix m {{raw[0],raw[1],raw[2],raw[3]}}; + Array a {{raw[0],raw[1],raw[2],raw[3]}}; + for(int k=0; k < 4; ++k) { + VERIFY(m(k) == raw[k]); + } + for(int k=0; k < 4; ++k) { + VERIFY(a(k) == raw[k]); + } + VERIFY_IS_EQUAL(m, (Matrix({{raw[0],raw[1],raw[2],raw[3]}}))); + VERIFY((a == (Array({{raw[0],raw[1],raw[2],raw[3]}}))).all()); + } +} + +template void initializerListMatrixConstruction() +{ + const Index RowsAtCompileTime = 5; + const Index ColsAtCompileTime = 4; + const Index SizeAtCompileTime = RowsAtCompileTime * ColsAtCompileTime; + + Scalar raw[SizeAtCompileTime]; + for (int i = 0; i < SizeAtCompileTime; ++i) { + raw[i] = internal::random(); + } + { + Matrix m {}; + VERIFY(m.cols() == 0); + VERIFY(m.rows() == 0); + VERIFY_IS_EQUAL(m, (Matrix())); + } + { + Matrix m { + {raw[0], raw[1], raw[2], raw[3]}, + {raw[4], raw[5], raw[6], raw[7]}, + {raw[8], raw[9], raw[10], raw[11]}, + {raw[12], raw[13], raw[14], raw[15]}, + {raw[16], raw[17], raw[18], raw[19]} + }; + + 
Matrix m2; + m2 << raw[0], raw[1], raw[2], raw[3], + raw[4], raw[5], raw[6], raw[7], + raw[8], raw[9], raw[10], raw[11], + raw[12], raw[13], raw[14], raw[15], + raw[16], raw[17], raw[18], raw[19]; + + int k = 0; + for(int i = 0; i < RowsAtCompileTime; ++i) { + for (int j = 0; j < ColsAtCompileTime; ++j) { + VERIFY(m(i, j) == raw[k]); + ++k; + } + } + VERIFY_IS_EQUAL(m, m2); + } + { + Matrix m{ + {raw[0], raw[1], raw[2], raw[3]}, + {raw[4], raw[5], raw[6], raw[7]}, + {raw[8], raw[9], raw[10], raw[11]}, + {raw[12], raw[13], raw[14], raw[15]}, + {raw[16], raw[17], raw[18], raw[19]} + }; + + VERIFY(m.cols() == 4); + VERIFY(m.rows() == 5); + int k = 0; + for(int i = 0; i < RowsAtCompileTime; ++i) { + for (int j = 0; j < ColsAtCompileTime; ++j) { + VERIFY(m(i, j) == raw[k]); + ++k; + } + } + + Matrix m2(RowsAtCompileTime, ColsAtCompileTime); + k = 0; + for(int i = 0; i < RowsAtCompileTime; ++i) { + for (int j = 0; j < ColsAtCompileTime; ++j) { + m2(i, j) = raw[k]; + ++k; + } + } + VERIFY_IS_EQUAL(m, m2); + } +} + +template void initializerListArrayConstruction() +{ + const Index RowsAtCompileTime = 5; + const Index ColsAtCompileTime = 4; + const Index SizeAtCompileTime = RowsAtCompileTime * ColsAtCompileTime; + + Scalar raw[SizeAtCompileTime]; + for (int i = 0; i < SizeAtCompileTime; ++i) { + raw[i] = internal::random(); + } + { + Array a {}; + VERIFY(a.cols() == 0); + VERIFY(a.rows() == 0); + } + { + Array m { + {raw[0], raw[1], raw[2], raw[3]}, + {raw[4], raw[5], raw[6], raw[7]}, + {raw[8], raw[9], raw[10], raw[11]}, + {raw[12], raw[13], raw[14], raw[15]}, + {raw[16], raw[17], raw[18], raw[19]} + }; + + Array m2; + m2 << raw[0], raw[1], raw[2], raw[3], + raw[4], raw[5], raw[6], raw[7], + raw[8], raw[9], raw[10], raw[11], + raw[12], raw[13], raw[14], raw[15], + raw[16], raw[17], raw[18], raw[19]; + + int k = 0; + for(int i = 0; i < RowsAtCompileTime; ++i) { + for (int j = 0; j < ColsAtCompileTime; ++j) { + VERIFY(m(i, j) == raw[k]); + ++k; + } + } + VERIFY_IS_APPROX(m, 
m2); + } + { + Array m { + {raw[0], raw[1], raw[2], raw[3]}, + {raw[4], raw[5], raw[6], raw[7]}, + {raw[8], raw[9], raw[10], raw[11]}, + {raw[12], raw[13], raw[14], raw[15]}, + {raw[16], raw[17], raw[18], raw[19]} + }; + + VERIFY(m.cols() == 4); + VERIFY(m.rows() == 5); + int k = 0; + for(int i = 0; i < RowsAtCompileTime; ++i) { + for (int j = 0; j < ColsAtCompileTime; ++j) { + VERIFY(m(i, j) == raw[k]); + ++k; + } + } + + Array m2(RowsAtCompileTime, ColsAtCompileTime); + k = 0; + for(int i = 0; i < RowsAtCompileTime; ++i) { + for (int j = 0; j < ColsAtCompileTime; ++j) { + m2(i, j) = raw[k]; + ++k; + } + } + VERIFY_IS_APPROX(m, m2); + } +} + +template void dynamicVectorConstruction() +{ + const Index size = 4; + Scalar raw[size]; + for (int i = 0; i < size; ++i) { + raw[i] = internal::random(); + } + + typedef Matrix VectorX; + + { + VectorX v {{raw[0], raw[1], raw[2], raw[3]}}; + for (int i = 0; i < size; ++i) { + VERIFY(v(i) == raw[i]); + } + VERIFY(v.rows() == size); + VERIFY(v.cols() == 1); + VERIFY_IS_EQUAL(v, (VectorX {{raw[0], raw[1], raw[2], raw[3]}})); + } + + { + VERIFY_RAISES_ASSERT((VectorX {raw[0], raw[1], raw[2], raw[3]})); + } + { + VERIFY_RAISES_ASSERT((VectorX { + {raw[0], raw[1], raw[2], raw[3]}, + {raw[0], raw[1], raw[2], raw[3]}, + })); + } +} + +EIGEN_DECLARE_TEST(initializer_list_construction) +{ + CALL_SUBTEST_1(initializerListVectorConstruction()); + CALL_SUBTEST_1(initializerListVectorConstruction()); + CALL_SUBTEST_1(initializerListVectorConstruction()); + CALL_SUBTEST_1(initializerListVectorConstruction()); + CALL_SUBTEST_1(initializerListVectorConstruction()); + CALL_SUBTEST_1(initializerListVectorConstruction()); + CALL_SUBTEST_1(initializerListVectorConstruction>()); + CALL_SUBTEST_1(initializerListVectorConstruction>()); + CALL_SUBTEST_1(initializerListVectorConstruction>()); + + CALL_SUBTEST_2(initializerListMatrixConstruction()); + CALL_SUBTEST_2(initializerListMatrixConstruction()); + 
CALL_SUBTEST_2(initializerListMatrixConstruction()); + CALL_SUBTEST_2(initializerListMatrixConstruction()); + CALL_SUBTEST_2(initializerListMatrixConstruction()); + CALL_SUBTEST_2(initializerListMatrixConstruction()); + CALL_SUBTEST_2(initializerListMatrixConstruction>()); + CALL_SUBTEST_2(initializerListMatrixConstruction>()); + CALL_SUBTEST_2(initializerListMatrixConstruction>()); + + CALL_SUBTEST_3(initializerListArrayConstruction()); + CALL_SUBTEST_3(initializerListArrayConstruction()); + CALL_SUBTEST_3(initializerListArrayConstruction()); + CALL_SUBTEST_3(initializerListArrayConstruction()); + CALL_SUBTEST_3(initializerListArrayConstruction()); + CALL_SUBTEST_3(initializerListArrayConstruction()); + CALL_SUBTEST_3(initializerListArrayConstruction>()); + CALL_SUBTEST_3(initializerListArrayConstruction>()); + CALL_SUBTEST_3(initializerListArrayConstruction>()); + + CALL_SUBTEST_4(singleInitializerListVectorConstruction()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction>()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction>()); + CALL_SUBTEST_4(singleInitializerListVectorConstruction>()); + + CALL_SUBTEST_5(TestMethodDispatching::run()); + CALL_SUBTEST_5(TestMethodDispatching::run()); + + CALL_SUBTEST_6(dynamicVectorConstruction()); + CALL_SUBTEST_6(dynamicVectorConstruction()); + CALL_SUBTEST_6(dynamicVectorConstruction()); + CALL_SUBTEST_6(dynamicVectorConstruction()); + CALL_SUBTEST_6(dynamicVectorConstruction()); + CALL_SUBTEST_6(dynamicVectorConstruction()); + CALL_SUBTEST_6(dynamicVectorConstruction>()); + CALL_SUBTEST_6(dynamicVectorConstruction>()); + CALL_SUBTEST_6(dynamicVectorConstruction>()); +} \ No newline at end of file From 
80f81f9c4b01cc4f513c5b92c52c5a0efb68ecc3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Jan 2019 17:08:47 +0100 Subject: [PATCH 168/295] Cleanup SFINAE in Array/Matrix(initializer_list) ctors and minor doc editing. --- Eigen/src/Core/Array.h | 64 +++++++++--------- Eigen/src/Core/Matrix.h | 65 ++++++++++--------- Eigen/src/Core/PlainObjectBase.h | 22 +++---- doc/TutorialMatrixClass.dox | 29 +++++---- .../Array_initializer_list_23_cxx11.cpp | 5 ++ doc/snippets/Array_initializer_list_cxx11.cpp | 6 -- .../Array_initializer_list_vector_cxx11.cpp | 2 + .../Matrix_initializer_list_23_cxx11.cpp | 5 ++ .../Matrix_initializer_list_cxx11.cpp | 6 -- .../Matrix_initializer_list_vector_cxx11.cpp | 2 + 10 files changed, 109 insertions(+), 97 deletions(-) create mode 100644 doc/snippets/Array_initializer_list_23_cxx11.cpp delete mode 100644 doc/snippets/Array_initializer_list_cxx11.cpp create mode 100644 doc/snippets/Array_initializer_list_vector_cxx11.cpp create mode 100644 doc/snippets/Matrix_initializer_list_23_cxx11.cpp delete mode 100644 doc/snippets/Matrix_initializer_list_cxx11.cpp create mode 100644 doc/snippets/Matrix_initializer_list_vector_cxx11.cpp diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 7ef37de7c..ff0b69b84 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -180,13 +180,13 @@ class Array } #if EIGEN_HAS_CXX11 - template + protected: + enum { IsFixedSizeVectorAtCompileTime = RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic && IsVectorAtCompileTime == 1 }; + public: + template::value>::type> EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Array(const std::initializer_list& list, - typename internal::enable_if::value, T>::type* = 0, - typename internal::enable_if::type* = 0) : Base(list) {} + explicit EIGEN_STRONG_INLINE Array(const std::initializer_list& list) : Base(list) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const std::initializer_list >& list) : Base(list) {} @@ -219,35 +219,39 @@ class 
Array * Example: \include Array_initializer_list2_cxx11.cpp * Output: \verbinclude Array_initializer_list2_cxx11.out * - * \sa Array::Array(const Scalar& val0, const Scalar& val1) - * \sa Array::Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) */ + * \sa Array(const std::initializer_list >&) + */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Array(const std::initializer_list& list); - /** - * \brief Constructs an array and initializes it by elements given by an initializer list of initializer lists \cpp11 - * - * This constructor distinguishes between the construction of arbitrary array and arrays with one fixed dimension, - * - * In the general case, the constructor takes an initializer list, representing the array rows, that contains for - * each row an initializer list, representing a single column, containing scalar values. Each of the inner - * initializer lists must contain the same number of elements. - * - * In the case of array with one fixed dimension, an initializer list containing just one other initializer list - * that contains the array elements can be passed. Therefore \c Array\c {{1,\c 2,\c 3,\c 4}} is - * legal and the more verbose syntax \c Array\c {{1},\c {2},\c {3},\c {4}} can be avoided. - * - * \warning In the case of fixed-sized arrays, the initializer list size must be equal to the array \a rows rows - * and \a cols columns. - * - * Example: \include Array_initializer_list_cxx11.cpp - * Output: \verbinclude Array_initializer_list_cxx11.out - */ + /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. 
\cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Array_initializer_list_23_cxx11.cpp + * Output: \verbinclude Array_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed. + * Therefore Array{{1,2,3,4,5}} is legal and the more verbose syntax + * Array{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Array_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Array_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes, + * and implicit transposition is allowed for compile-time 1D arrays only. + * + * \sa Array(const std::initializer_list&) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const std::initializer_list >& list); #endif // end EIGEN_PARSED_BY_DOXYGEN - /** constructs an initialized 3D vector with given coefficients */ + /** constructs an initialized 3D vector with given coefficients + * \sa Array(const std::initializer_list&) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) { @@ -257,7 +261,9 @@ class Array m_storage.data()[1] = val1; m_storage.data()[2] = val2; } - /** constructs an initialized 4D vector with given coefficients */ + /** constructs an initialized 4D vector with given coefficients + * \sa Array(const std::initializer_list&) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) { diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 5b375b41d..aaceceafe 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -303,13 +303,13 @@ class Matrix } #if EIGEN_HAS_CXX11 - 
template + protected: + enum { IsFixedSizeVectorAtCompileTime = RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic && IsVectorAtCompileTime == 1 }; + public: + template::value>::type> EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list& list, - typename internal::enable_if::value, T>::type* = 0, - typename internal::enable_if::type* = 0) : Base(list) {} + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list& list) : Base(list) {} EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} @@ -358,37 +358,40 @@ class Matrix * Example: \include Matrix_initializer_list2_cxx11.cpp * Output: \verbinclude Matrix_initializer_list2_cxx11.out * - * \sa Matrix::Matrix(const Scalar& x, const Scalar& y, const Scalar& z) - * \sa Matrix::Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) */ + * \sa Matrix(const std::initializer_list>&) + */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list& list); - /** - * \brief Constructs a matrix and initializes it by elements given by an initializer list of initializer lists \cpp11 - * - * This constructor distinguishes between the construction of arbitrary matrices and matrices with one fixed dimension, - * i.e., vectors or rowvectors. - * - * In the general case, the constructor takes an initializer list, representing the matrix rows, that contains for - * each row an initializer list, representing a single column, containing scalar values. Each of the inner - * initializer lists must contain the same number of elements. - * - * In the case of matrices with one fixed dimension, an initializer list containing just one other initializer list - * that contains the matrix elements can be passed. Therefore \c VectorXi\c {{1,\c 2,\c 3,\c 4}} is legal and the more - * verbose syntax \c VectorXi\c {{1},\c {2},\c {3},\c {4}} can be avoided. 
- * - * \warning In the case of fixed-sized matrices, the initializer list size must be equal to the matrix \a rows rows - * and \a cols columns. - * - * Example: \include Matrix_initializer_list_cxx11.cpp - * Output: \verbinclude Matrix_initializer_list_cxx11.out - */ + /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Matrix_initializer_list_23_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column vector, implicit transposition from a single row is allowed. + * Therefore VectorXd{{1,2,3,4,5}} is legal and the more verbose syntax + * RowVectorXd{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Matrix_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, + * and implicit transposition is allowed for compile-time vectors only. 
+ * + * \sa Matrix(const std::initializer_list&) + */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list); #endif // end EIGEN_PARSED_BY_DOXYGEN - /** \brief Constructs an initialized 3D vector with given coefficients */ + /** \brief Constructs an initialized 3D vector with given coefficients + * \sa Matrix(const std::initializer_list&) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) { @@ -398,7 +401,9 @@ class Matrix m_storage.data()[1] = y; m_storage.data()[2] = z; } - /** \brief Constructs an initialized 4D vector with given coefficients */ + /** \brief Constructs an initialized 4D vector with given coefficients + * \sa Matrix(const std::initializer_list&) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) { diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 1a996b0aa..04748e5e9 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -527,21 +527,19 @@ class PlainObjectBase : public internal::dense_xpr_base::type } #ifdef EIGEN_PARSED_BY_DOXYGEN - /** - * \brief Construct a vector with fixed number of rows or a rowvector with fixed number of - * columns by passing an initializer list \cpp11 - * - * \only_for_vectors - * - * \warning To construct a vector or rowvector of fixed size, the number of values passed through - * the initializer list must match the the fixed number of rows in the vector case or - * the fixed number of columns in the rowvector case. */ + /** \brief Construct a row of column vector with fixed size from an initializer list of coefficients. \cpp11 + * + * \only_for_vectors + * + * \warning To construct a column (resp. row) vector of fixed length, the number of values passed through + * the initializer list must match the the fixed number of rows (resp. columns) of \c *this. 
+ */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list& list); - /** - * \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer - * lists \cpp11 */ + /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list); #else // EIGEN_PARSED_BY_DOXYGEN diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox index fc0ce5b1e..c44c8f24f 100644 --- a/doc/TutorialMatrixClass.dox +++ b/doc/TutorialMatrixClass.dox @@ -101,38 +101,39 @@ Matrix3f a(3,3); \endcode and is a no-operation. -Additionally, we also offer some constructors to initialize the coefficients of small fixed-size vectors up to size 4: +Matrices and vectors can also be initialized from lists of coefficients. +Prior to C++11, this feature is limited to small fixed-size column or vectors up to size 4: \code Vector2d a(5.0, 6.0); Vector3d b(5.0, 6.0, 7.0); Vector4d c(5.0, 6.0, 7.0, 8.0); \endcode -If C++11 is enabled, matrices can be constructed and initialized using initializer lists. 
In the case of fixed-sized vectors -and rowvectors a simple initializer list can be passed: +If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized through a single initializer list (\link Matrix::Matrix(const std::initializer_list&) details \endlink): \code -Vector2i a {1, 2}; // A vector containing the elements {1, 2} -Matrix b {1, 2, 3, 4}; // A row-vector containing the elements {1, 2, 3, 4} -Matrix c {1, 2, 3, 4}; // A vector containing the elements {1, 2, 3, 4} +Vector2i a {1, 2}; // A column vector containing the elements {1, 2} +Matrix b {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} +Matrix c {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5} \endcode -In the case of fixed or dynamically sized matrices an initializer list containing an initializer list for each row -can be passed. If the matrix is fixed-sized, the number of elements that are passed must match the dimensions. +In the general case of matrices and vectors with either fixed or runtime sizes, +coefficients have to be grouped by rows and passed as an initializer list of initializer list (\link Matrix::Matrix(const std::initializer_list>&) details \endlink): \code -MatrixXi a { +MatrixXi a { // construct a 2x2 matrix {1, 2}, // first row {3, 4} // second row }; Matrix b { - {2.0, 3.0, 4.0}, - {5.0, 6.0, 7.0}, + {2, 3, 4}, + {5, 6, 7}, }; \endcode -In the case of vectors and rowvectors, the following shorthand notation can be used: +For column or row vectors, implicit transposition is allowed. 
+This means that a column vector can be initialized from a single row: \code -VectorXd a {{1.5, 2.5, 3.5}}; // A vector with 3 rows -RowVectorXd b {{1.0, 2.0, 3.0, 4.0}}; // A rowvector with 4 columns +VectorXd a {{1.5, 2.5, 3.5}}; // A column-vector with 3 coefficients +RowVectorXd b {{1.0, 2.0, 3.0, 4.0}}; // A row-vector with 4 coefficients \endcode \section TutorialMatrixCoeffAccessors Coefficient accessors diff --git a/doc/snippets/Array_initializer_list_23_cxx11.cpp b/doc/snippets/Array_initializer_list_23_cxx11.cpp new file mode 100644 index 000000000..1ea32dd80 --- /dev/null +++ b/doc/snippets/Array_initializer_list_23_cxx11.cpp @@ -0,0 +1,5 @@ +ArrayXXi a { + {1, 2, 3}, + {3, 4, 5} +}; +cout << a << endl; \ No newline at end of file diff --git a/doc/snippets/Array_initializer_list_cxx11.cpp b/doc/snippets/Array_initializer_list_cxx11.cpp deleted file mode 100644 index d2f46e268..000000000 --- a/doc/snippets/Array_initializer_list_cxx11.cpp +++ /dev/null @@ -1,6 +0,0 @@ -Array a { - {1, 2, 3}, - {3, 4, 5} -}; -Array v {{1, 2, 3, 4, 5}}; -cout << a << "\n\n" << v << endl; \ No newline at end of file diff --git a/doc/snippets/Array_initializer_list_vector_cxx11.cpp b/doc/snippets/Array_initializer_list_vector_cxx11.cpp new file mode 100644 index 000000000..e38b61e95 --- /dev/null +++ b/doc/snippets/Array_initializer_list_vector_cxx11.cpp @@ -0,0 +1,2 @@ +Array v {{1, 2, 3, 4, 5}}; +cout << v << endl; \ No newline at end of file diff --git a/doc/snippets/Matrix_initializer_list_23_cxx11.cpp b/doc/snippets/Matrix_initializer_list_23_cxx11.cpp new file mode 100644 index 000000000..d338d0253 --- /dev/null +++ b/doc/snippets/Matrix_initializer_list_23_cxx11.cpp @@ -0,0 +1,5 @@ +MatrixXd m { + {1, 2, 3}, + {4, 5, 6} +}; +cout << m << endl; \ No newline at end of file diff --git a/doc/snippets/Matrix_initializer_list_cxx11.cpp b/doc/snippets/Matrix_initializer_list_cxx11.cpp deleted file mode 100644 index d68787ab6..000000000 --- 
a/doc/snippets/Matrix_initializer_list_cxx11.cpp +++ /dev/null @@ -1,6 +0,0 @@ -Matrix m { - {1, 2, 3}, - {4, 5, 6} -}; -VectorXi v {{1, 2}}; -cout << m << "\n\n" << v << endl; \ No newline at end of file diff --git a/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp b/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp new file mode 100644 index 000000000..8872e2cf3 --- /dev/null +++ b/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp @@ -0,0 +1,2 @@ +VectorXi v {{1, 2}}; +cout << v << endl; \ No newline at end of file From e16913a45f92db9d4080c06515537ee287fdb153 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 23 Jan 2019 10:35:06 +0100 Subject: [PATCH 169/295] Fix name of tutorial snippet. --- doc/TutorialSTL.dox | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/TutorialSTL.dox b/doc/TutorialSTL.dox index 9f41528d3..9a825bc48 100644 --- a/doc/TutorialSTL.dox +++ b/doc/TutorialSTL.dox @@ -55,10 +55,10 @@ Here is an example sorting each row of a matrix:
Example:Output:
-\include Tutorial_std_sort_rows.cpp +\include Tutorial_std_sort_rows_cxx11.cpp -\verbinclude Tutorial_std_sort_rows.out +\verbinclude Tutorial_std_sort_rows_cxx11.out
*/ From c64d5d3827f8d1ab0f33198721862115efe871da Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 23 Jan 2019 23:43:13 +0100 Subject: [PATCH 170/295] Bypass inline asm for non compatible compilers. --- CMakeLists.txt | 1 - bench/BenchTimer.h | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 48c0a6367..76e083314 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -567,7 +567,6 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0) # Imported target support add_library (eigen INTERFACE) add_library (Eigen3::Eigen ALIAS eigen) - target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS}) target_include_directories (eigen INTERFACE $ diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index ea28496b7..8a0dbbe81 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -28,11 +28,15 @@ #endif static void escape(void *p) { +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG asm volatile("" : : "g"(p) : "memory"); +#endif } static void clobber() { +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG asm volatile("" : : : "memory"); +#endif } #include From bd6dadcda8974622bdc4c731068e4b3cf84bcf9c Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 24 Jan 2019 00:14:02 +0100 Subject: [PATCH 171/295] Tell doxygen that cxx11 math is available --- doc/Doxyfile.in | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 3ebbeb812..d8f28812e 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1592,6 +1592,7 @@ PREDEFINED = EIGEN_EMPTY_STRUCT \ EIGEN_QT_SUPPORT \ EIGEN_STRONG_INLINE=inline \ EIGEN_DEVICE_FUNC= \ + EIGEN_HAS_CXX11_MATH=1 \ "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template const CwiseBinaryOp, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const;" \ "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp, const LHS, const RHS>"\ "EIGEN_CAT2(a,b)= a ## b"\ From 237b03b3724df7137e82512cd7ad758e20b8e6b6 Mon Sep 17 00:00:00 2001 From: 
David Tellenbach Date: Wed, 23 Jan 2019 00:07:19 +0100 Subject: [PATCH 172/295] PR 574: use variadic template instead of initializer_list to implement fixed-size vector ctor from coefficients. --- Eigen/src/Core/Array.h | 27 +++++++------- Eigen/src/Core/Matrix.h | 21 +++++------ Eigen/src/Core/PlainObjectBase.h | 37 +++++++++++-------- doc/TutorialMatrixClass.dox | 6 +-- ...xx11.cpp => Array_variadic_ctor_cxx11.cpp} | 2 +- ...x11.cpp => Matrix_variadic_ctor_cxx11.cpp} | 2 +- 6 files changed, 51 insertions(+), 44 deletions(-) rename doc/snippets/{Array_initializer_list2_cxx11.cpp => Array_variadic_ctor_cxx11.cpp} (61%) rename doc/snippets/{Matrix_initializer_list2_cxx11.cpp => Matrix_variadic_ctor_cxx11.cpp} (61%) diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index ff0b69b84..3a2a4fc58 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -183,10 +183,10 @@ class Array protected: enum { IsFixedSizeVectorAtCompileTime = RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic && IsVectorAtCompileTime == 1 }; public: - template::value>::type> - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Array(const std::initializer_list& list) : Base(list) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) 
{} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const std::initializer_list >& list) : Base(list) {} @@ -214,15 +214,16 @@ class Array /** constructs an initialized 2D vector with given coefficients */ Array(const Scalar& val0, const Scalar& val1); - /** \copydoc PlainObjectBase::PlainObjectBase(const std::initializer_list& list) - * - * Example: \include Array_initializer_list2_cxx11.cpp - * Output: \verbinclude Array_initializer_list2_cxx11.out - * - * \sa Array(const std::initializer_list >&) - */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Array(const std::initializer_list& list); + /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + * + * Example: \include Array_variadic_ctor_cxx11.cpp + * Output: \verbinclude Array_variadic_ctor_cxx11.out + * + * \sa Array(const std::initializer_list>&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args); /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 * diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index aaceceafe..fb2a62a82 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -303,13 +303,11 @@ class Matrix } #if EIGEN_HAS_CXX11 - protected: - enum { IsFixedSizeVectorAtCompileTime = RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic && IsVectorAtCompileTime == 1 }; public: - template::value>::type> - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list& list) : Base(list) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) 
{} EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} @@ -353,15 +351,16 @@ class Matrix /** \brief Constructs an initialized 2D vector with given coefficients */ Matrix(const Scalar& x, const Scalar& y); - /** \copydoc PlainObjectBase::PlainObjectBase(const std::initializer_list& list) + /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) * - * Example: \include Matrix_initializer_list2_cxx11.cpp - * Output: \verbinclude Matrix_initializer_list2_cxx11.out + * Example: \include Matrix_variadic_ctor_cxx11.cpp + * Output: \verbinclude Matrix_variadic_ctor_cxx11.out * * \sa Matrix(const std::initializer_list>&) */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list& list); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args); /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 * diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 04748e5e9..cd89fd365 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -527,15 +527,16 @@ class PlainObjectBase : public internal::dense_xpr_base::type } #ifdef EIGEN_PARSED_BY_DOXYGEN - /** \brief Construct a row of column vector with fixed size from an initializer list of coefficients. \cpp11 + /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 * * \only_for_vectors * - * \warning To construct a column (resp. row) vector of fixed length, the number of values passed through - * the initializer list must match the the fixed number of rows (resp. columns) of \c *this. + * \warning To construct a column (resp. 
row) vector of fixed length, the number of values passed to this + * constructor must match the the fixed number of rows (resp. columns) of \c *this. */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list& list); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args); /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer * lists \cpp11 @@ -544,19 +545,25 @@ class PlainObjectBase : public internal::dense_xpr_base::type explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list); #else // EIGEN_PARSED_BY_DOXYGEN #if EIGEN_HAS_CXX11 - template - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list& list, - typename internal::enable_if::value, T>::type* = 0, - typename internal::enable_if::type* = 0) + + protected: + enum { IsFixedSizeVectorAtCompileTime = RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic && IsVectorAtCompileTime == 1 }; + public: + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args) : m_storage() { _check_template_params(); - EIGEN_STATIC_ASSERT_FIXED_SIZE(PlainObjectBase); - resize(list.size()); - std::copy(list.begin(), list.end(), m_storage.data()); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4); + m_storage.data()[0] = a0; + m_storage.data()[1] = a1; + m_storage.data()[2] = a2; + m_storage.data()[3] = a3; + int i = 4; + auto x = {(m_storage.data()[i++] = args, 0)...}; + static_cast(x); } EIGEN_DEVICE_FUNC diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox index c44c8f24f..2c452220f 100644 --- a/doc/TutorialMatrixClass.dox +++ b/doc/TutorialMatrixClass.dox @@ -109,11 +109,11 @@ Vector3d b(5.0, 6.0, 7.0); Vector4d c(5.0, 6.0, 7.0, 8.0); \endcode -If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized through a single initializer list (\link Matrix::Matrix(const std::initializer_list&) details \endlink): +If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients: \code -Vector2i a {1, 2}; // A column vector containing the elements {1, 2} +Vector2i a(1, 2); // A column vector containing the elements {1, 2} Matrix b {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} -Matrix c {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5} +Matrix c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5} \endcode In the general case of matrices and vectors with either fixed or runtime sizes, diff --git a/doc/snippets/Array_initializer_list2_cxx11.cpp b/doc/snippets/Array_variadic_ctor_cxx11.cpp similarity index 61% rename from doc/snippets/Array_initializer_list2_cxx11.cpp rename to doc/snippets/Array_variadic_ctor_cxx11.cpp index 20e74546a..234c7a720 100644 --- a/doc/snippets/Array_initializer_list2_cxx11.cpp +++ b/doc/snippets/Array_variadic_ctor_cxx11.cpp @@ -1,3 +1,3 @@ -Array a {1, 2, 3, 4, 5, 6}; +Array a(1, 2, 
3, 4, 5, 6); Array b {1, 2, 3}; cout << a << "\n\n" << b << endl; \ No newline at end of file diff --git a/doc/snippets/Matrix_initializer_list2_cxx11.cpp b/doc/snippets/Matrix_variadic_ctor_cxx11.cpp similarity index 61% rename from doc/snippets/Matrix_initializer_list2_cxx11.cpp rename to doc/snippets/Matrix_variadic_ctor_cxx11.cpp index 2fde52b8d..fcb4ccf88 100644 --- a/doc/snippets/Matrix_initializer_list2_cxx11.cpp +++ b/doc/snippets/Matrix_variadic_ctor_cxx11.cpp @@ -1,3 +1,3 @@ -Matrix a {1, 2, 3, 4, 5, 6}; +Matrix a(1, 2, 3, 4, 5, 6); Matrix b {1, 2, 3}; cout << a << "\n\n" << b << endl; \ No newline at end of file From 6908ce2a15887e5e5102a875a9b6f632d98c0038 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Jan 2019 10:24:28 +0100 Subject: [PATCH 173/295] More thoroughly check variadic template ctor of fixed-size vectors --- test/initializer_list_construction.cpp | 97 +++++++++++++++----------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/test/initializer_list_construction.cpp b/test/initializer_list_construction.cpp index 5f281ea4e..0d1c6f2f3 100644 --- a/test/initializer_list_construction.cpp +++ b/test/initializer_list_construction.cpp @@ -47,42 +47,60 @@ struct TestMethodDispatching { } }; -template void singleInitializerListVectorConstruction() +template void fixedsizeVariadicVectorConstruction2() { - Scalar raw[4]; - for(int k = 0; k < 4; ++k) { - raw[k] = internal::random(); + { + Vec4 ref = Vec4::Random(); + Vec4 v{ ref[0], ref[1], ref[2], ref[3] }; + VERIFY_IS_APPROX(v, ref); + VERIFY_IS_APPROX(v, (Vec4( ref[0], ref[1], ref[2], ref[3] ))); + VERIFY_IS_APPROX(v, (Vec4({ref[0], ref[1], ref[2], ref[3]}))); + + Vec4 v2 = { ref[0], ref[1], ref[2], ref[3] }; + VERIFY_IS_APPROX(v2, ref); } { - Matrix m { raw[0], raw[1], raw[2], raw[3] }; - Array a { raw[0], raw[1], raw[2], raw[3] }; - for(int k = 0; k < 4; ++k) { - VERIFY(m(k) == raw[k]); - } - for(int k = 0; k < 4; ++k) { - VERIFY(a(k) == raw[k]); - } - VERIFY_IS_EQUAL(m, 
(Matrix(raw[0], raw[1], raw[2], raw[3]))); - VERIFY_IS_EQUAL(m, (Matrix({raw[0], raw[1], raw[2], raw[3]}))); - VERIFY((a == (Array(raw[0], raw[1], raw[2], raw[3]))).all()); - VERIFY((a == (Array({raw[0], raw[1], raw[2], raw[3]}))).all()); - } - { - Matrix m { raw[0], raw[1], raw[2], raw[3] }; - Array a { raw[0], raw[1], raw[2], raw[3] }; - for(int k = 0; k < 4; ++k) { - VERIFY(m(k) == raw[k]); - } - for(int k = 0; k < 4; ++k) { - VERIFY(a(k) == raw[k]); - } - VERIFY_IS_EQUAL(m, (Matrix(raw[0], raw[1], raw[2], raw[3]))); - VERIFY_IS_EQUAL(m, (Matrix({raw[0], raw[1], raw[2], raw[3]}))); - VERIFY((a == (Array(raw[0], raw[1], raw[2], raw[3]))).all()); - VERIFY((a == (Array({raw[0], raw[1], raw[2], raw[3]}))).all()); + Vec5 ref = Vec5::Random(); + Vec5 v{ ref[0], ref[1], ref[2], ref[3], ref[4] }; + VERIFY_IS_APPROX(v, ref); + VERIFY_IS_APPROX(v, (Vec5( ref[0], ref[1], ref[2], ref[3], ref[4] ))); + VERIFY_IS_APPROX(v, (Vec5({ref[0], ref[1], ref[2], ref[3], ref[4]}))); + + Vec5 v2 = { ref[0], ref[1], ref[2], ref[3], ref[4] }; + VERIFY_IS_APPROX(v2, ref); } } +#define CHECK_MIXSCALAR_V5_APPROX(V, A0, A1, A2, A3, A4) { \ + VERIFY_IS_APPROX(V[0], Scalar(A0) ); \ + VERIFY_IS_APPROX(V[1], Scalar(A1) ); \ + VERIFY_IS_APPROX(V[2], Scalar(A2) ); \ + VERIFY_IS_APPROX(V[3], Scalar(A3) ); \ + VERIFY_IS_APPROX(V[4], Scalar(A4) ); \ +} + +#define CHECK_MIXSCALAR_V5(VEC5, A0, A1, A2, A3, A4) { \ + typedef VEC5::Scalar Scalar; \ + VEC5 v = { A0 , A1 , A2 , A3 , A4 }; \ + CHECK_MIXSCALAR_V5_APPROX(v, A0 , A1 , A2 , A3 , A4); \ +} + +template void fixedsizeVariadicVectorConstruction3() +{ + typedef Matrix Vec5; + typedef Array Arr5; + CHECK_MIXSCALAR_V5(Vec5, 1, 2., -3, 4.121, 5.53252); + CHECK_MIXSCALAR_V5(Arr5, 1, 2., 3.12f, 4.121, 5.53252); +} + +template void fixedsizeVariadicVectorConstruction() +{ + CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2, Matrix >() )); + CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2, Matrix >() )); + CALL_SUBTEST(( 
fixedsizeVariadicVectorConstruction2, Array >() )); + CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2, Array >() )); +} + template void initializerListVectorConstruction() { @@ -346,15 +364,16 @@ EIGEN_DECLARE_TEST(initializer_list_construction) CALL_SUBTEST_3(initializerListArrayConstruction>()); CALL_SUBTEST_3(initializerListArrayConstruction>()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction>()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction>()); - CALL_SUBTEST_4(singleInitializerListVectorConstruction>()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction>()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction>()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction>()); + CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction3<0>()); CALL_SUBTEST_5(TestMethodDispatching::run()); CALL_SUBTEST_5(TestMethodDispatching::run()); From ec8a387972650cda5ad32da5f89659631ad3008a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Jan 2019 10:24:45 +0100 Subject: [PATCH 174/295] cleanup --- Eigen/src/Core/Array.h | 73 ++++++++++++++---------------- Eigen/src/Core/Matrix.h | 77 +++++++++++++++----------------- Eigen/src/Core/PlainObjectBase.h | 25 +++-------- doc/Doxyfile.in | 1 + 4 files changed, 76 insertions(+), 100 deletions(-) diff 
--git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 3a2a4fc58..e58e68eda 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -180,50 +180,18 @@ class Array } #if EIGEN_HAS_CXX11 - protected: - enum { IsFixedSizeVectorAtCompileTime = RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic && IsVectorAtCompileTime == 1 }; - public: - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - : Base(a0, a1, a2, a3, args...) {} - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const std::initializer_list >& list) : Base(list) {} - #endif // end EIGEN_HAS_CXX11 - - #else - /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ - EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); - /** Constructs a vector or row-vector with given dimension. \only_for_vectors - * - * Note that this is only useful for dynamic-size vectors. For fixed-size vectors, - * it is redundant to pass the dimension here, so it makes more sense to use the default - * constructor Array() instead. - */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit Array(Index dim); - /** constructs an initialized 1x1 Array with the given coefficient */ - Array(const Scalar& value); - /** constructs an uninitialized array with \a rows rows and \a cols columns. - * - * This is useful for dynamic-size arrays. For fixed-size arrays, - * it is redundant to pass these parameters, so one should use the default constructor - * Array() instead. */ - Array(Index rows, Index cols); - /** constructs an initialized 2D vector with given coefficients */ - Array(const Scalar& val0, const Scalar& val1); - - /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args) * * Example: \include Array_variadic_ctor_cxx11.cpp * Output: \verbinclude Array_variadic_ctor_cxx11.out * * \sa Array(const std::initializer_list>&) + * \sa Array(Scalar), Array(Scalar,Scalar) */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args); + Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 * @@ -244,14 +212,39 @@ class Array * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes, * and implicit transposition is allowed for compile-time 1D arrays only. * - * \sa Array(const std::initializer_list&) + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const std::initializer_list >& list); + EIGEN_STRONG_INLINE Array(const std::initializer_list >& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + + #else + /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ + EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); + /** Constructs a vector or row-vector with given dimension. \only_for_vectors + * + * Note that this is only useful for dynamic-size vectors. For fixed-size vectors, + * it is redundant to pass the dimension here, so it makes more sense to use the default + * constructor Array() instead. + */ + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit Array(Index dim); + /** constructs an initialized 1x1 Array with the given coefficient + * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */ + Array(const Scalar& value); + /** constructs an uninitialized array with \a rows rows and \a cols columns. 
+ * + * This is useful for dynamic-size arrays. For fixed-size arrays, + * it is redundant to pass these parameters, so one should use the default constructor + * Array() instead. */ + Array(Index rows, Index cols); + /** constructs an initialized 2D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ + Array(const Scalar& val0, const Scalar& val1); #endif // end EIGEN_PARSED_BY_DOXYGEN /** constructs an initialized 3D vector with given coefficients - * \sa Array(const std::initializer_list&) + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) @@ -263,7 +256,7 @@ class Array m_storage.data()[2] = val2; } /** constructs an initialized 4D vector with given coefficients - * \sa Array(const std::initializer_list&) + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index fb2a62a82..32269ed2e 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -303,13 +303,40 @@ class Matrix } #if EIGEN_HAS_CXX11 - public: + /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + * + * Example: \include Matrix_variadic_ctor_cxx11.cpp + * Output: \verbinclude Matrix_variadic_ctor_cxx11.out + * + * \sa Matrix(const std::initializer_list>&) + */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) : Base(a0, a1, a2, a3, args...) 
{} - EIGEN_DEVICE_FUNC + /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Matrix_initializer_list_23_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column vector, implicit transposition from a single row is allowed. + * Therefore VectorXd{{1,2,3,4,5}} is legal and the more verbose syntax + * RowVectorXd{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Matrix_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, + * and implicit transposition is allowed for compile-time vectors only. + * + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ + EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} #endif // end EIGEN_HAS_CXX11 @@ -331,7 +358,8 @@ class Matrix * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives). */ EIGEN_STRONG_INLINE explicit Matrix(Index dim); - /** \brief Constructs an initialized 1x1 matrix with the given coefficient */ + /** \brief Constructs an initialized 1x1 matrix with the given coefficient + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x); /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns. 
* @@ -348,48 +376,13 @@ class Matrix EIGEN_DEVICE_FUNC Matrix(Index rows, Index cols); - /** \brief Constructs an initialized 2D vector with given coefficients */ + /** \brief Constructs an initialized 2D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x, const Scalar& y); - - /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - * - * Example: \include Matrix_variadic_ctor_cxx11.cpp - * Output: \verbinclude Matrix_variadic_ctor_cxx11.out - * - * \sa Matrix(const std::initializer_list>&) - */ - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args); - - /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 - * - * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: - * - * Example: \include Matrix_initializer_list_23_cxx11.cpp - * Output: \verbinclude Matrix_initializer_list_23_cxx11.out - * - * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. - * - * In the case of a compile-time column vector, implicit transposition from a single row is allowed. - * Therefore VectorXd{{1,2,3,4,5}} is legal and the more verbose syntax - * RowVectorXd{{1},{2},{3},{4},{5}} can be avoided: - * - * Example: \include Matrix_initializer_list_vector_cxx11.cpp - * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out - * - * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, - * and implicit transposition is allowed for compile-time vectors only. 
- * - * \sa Matrix(const std::initializer_list&) - */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list); - #endif // end EIGEN_PARSED_BY_DOXYGEN /** \brief Constructs an initialized 3D vector with given coefficients - * \sa Matrix(const std::initializer_list&) + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) @@ -401,7 +394,7 @@ class Matrix m_storage.data()[2] = z; } /** \brief Constructs an initialized 4D vector with given coefficients - * \sa Matrix(const std::initializer_list&) + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index cd89fd365..2deaa5aab 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -526,32 +526,19 @@ class PlainObjectBase : public internal::dense_xpr_base::type // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } - #ifdef EIGEN_PARSED_BY_DOXYGEN + #if EIGEN_HAS_CXX11 /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 * * \only_for_vectors * + * This constructor is for 1D array or vectors with more than 4 coefficients. + * There exists c++98 anologue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. + * * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this * constructor must match the the fixed number of rows (resp. columns) of \c *this. */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args); - - /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer - * lists \cpp11 - */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list); - #else // EIGEN_PARSED_BY_DOXYGEN - #if EIGEN_HAS_CXX11 - - protected: - enum { IsFixedSizeVectorAtCompileTime = RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic && IsVectorAtCompileTime == 1 }; - public: - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) : m_storage() { @@ -566,6 +553,9 @@ class PlainObjectBase : public internal::dense_xpr_base::type static_cast(x); } + /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list) : m_storage() @@ -600,7 +590,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type } } #endif // end EIGEN_HAS_CXX11 - #endif // end EIGEN_PARSED_BY_DOXYGEN /** \sa PlainObjectBase::operator=(const EigenBase&) */ template diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index d8f28812e..5671986b1 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1592,6 +1592,7 @@ PREDEFINED = EIGEN_EMPTY_STRUCT \ EIGEN_QT_SUPPORT \ EIGEN_STRONG_INLINE=inline \ EIGEN_DEVICE_FUNC= \ + EIGEN_HAS_CXX11=1 \ EIGEN_HAS_CXX11_MATH=1 \ "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template const CwiseBinaryOp, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const;" \ "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp, const LHS, const RHS>"\ From 934b8a1304f4d210520c1b158c2ee3da78062532 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 25 Jan 2019 14:54:39 +0100 Subject: [PATCH 175/295] Avoid `I` as an identifier, since it may clash with the 
C-header complex.h --- test/bicgstab.cpp | 8 ++-- test/boostmultiprec.cpp | 1 + test/conjugate_gradient.cpp | 4 +- test/incomplete_cholesky.cpp | 14 +++---- test/indexed_view.cpp | 4 +- test/main.h | 2 + test/simplicial_cholesky.cpp | 8 ++-- .../Eigen/CXX11/src/util/CXX11Workarounds.h | 6 +-- .../Eigen/CXX11/src/util/EmulateArray.h | 20 ++++----- .../Eigen/CXX11/src/util/EmulateCXX11Meta.h | 16 +++---- .../Eigen/src/EulerAngles/EulerSystem.h | 42 +++++++++---------- unsupported/test/EulerAngles.cpp | 8 ++-- 12 files changed, 68 insertions(+), 65 deletions(-) diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp index 89d6a45ef..59c4b501c 100644 --- a/test/bicgstab.cpp +++ b/test/bicgstab.cpp @@ -10,11 +10,11 @@ #include "sparse_solver.h" #include -template void test_bicgstab_T() +template void test_bicgstab_T() { - BiCGSTAB, DiagonalPreconditioner > bicgstab_colmajor_diag; - BiCGSTAB, IdentityPreconditioner > bicgstab_colmajor_I; - BiCGSTAB, IncompleteLUT > bicgstab_colmajor_ilut; + BiCGSTAB, DiagonalPreconditioner > bicgstab_colmajor_diag; + BiCGSTAB, IdentityPreconditioner > bicgstab_colmajor_I; + BiCGSTAB, IncompleteLUT > bicgstab_colmajor_ilut; //BiCGSTAB, SSORPreconditioner > bicgstab_colmajor_ssor; bicgstab_colmajor_diag.setTolerance(NumTraits::epsilon()*4); diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp index 579a6fd25..1d1441ae2 100644 --- a/test/boostmultiprec.cpp +++ b/test/boostmultiprec.cpp @@ -66,6 +66,7 @@ #undef isnan #undef isinf #undef isfinite +#undef I #include #include diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 47a4ca707..b076a126b 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -10,9 +10,9 @@ #include "sparse_solver.h" #include -template void test_conjugate_gradient_T() +template void test_conjugate_gradient_T() { - typedef SparseMatrix SparseMatrixType; + typedef SparseMatrix SparseMatrixType; ConjugateGradient cg_colmajor_lower_diag; ConjugateGradient 
cg_colmajor_upper_diag; ConjugateGradient cg_colmajor_loup_diag; diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 68fe7d507..ecc17f5c3 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -12,14 +12,14 @@ #include #include -template void test_incomplete_cholesky_T() +template void test_incomplete_cholesky_T() { - typedef SparseMatrix SparseMatrixType; - ConjugateGradient > > cg_illt_lower_amd; - ConjugateGradient > > cg_illt_lower_nat; - ConjugateGradient > > cg_illt_upper_amd; - ConjugateGradient > > cg_illt_upper_nat; - ConjugateGradient > > cg_illt_uplo_amd; + typedef SparseMatrix SparseMatrixType; + ConjugateGradient > > cg_illt_lower_amd; + ConjugateGradient > > cg_illt_lower_nat; + ConjugateGradient > > cg_illt_upper_amd; + ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_uplo_amd; CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) ); diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 6518642df..6d6ef0cd4 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -335,8 +335,8 @@ void check_indexed_view() VERIFY_IS_APPROX( A(B.RowsAtCompileTime, 1), A(4,1) ); VERIFY_IS_APPROX( A(B.RowsAtCompileTime-1, B.ColsAtCompileTime-1), A(3,3) ); VERIFY_IS_APPROX( A(B.RowsAtCompileTime, B.ColsAtCompileTime), A(4,4) ); - const Index I = 3, J = 4; - VERIFY_IS_APPROX( A(I,J), A(3,4) ); + const Index I_ = 3, J_ = 4; + VERIFY_IS_APPROX( A(I_,J_), A(3,4) ); } // check extended block API diff --git a/test/main.h b/test/main.h index 8a68a84ee..0b5821cd5 100644 --- a/test/main.h +++ b/test/main.h @@ -97,6 +97,8 @@ #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h #define B0 FORBIDDEN_IDENTIFIER +// `I` may be defined by complex.h: +#define I FORBIDDEN_IDENTIFIER // Unit tests calling Eigen's blas library must preserve the default blocking size // to avoid 
troubles. diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp index 314b903e2..e3c31e3ba 100644 --- a/test/simplicial_cholesky.cpp +++ b/test/simplicial_cholesky.cpp @@ -9,17 +9,17 @@ #include "sparse_solver.h" -template void test_simplicial_cholesky_T() +template void test_simplicial_cholesky_T() { - typedef SparseMatrix SparseMatrixType; + typedef SparseMatrix SparseMatrixType; SimplicialCholesky chol_colmajor_lower_amd; SimplicialCholesky chol_colmajor_upper_amd; SimplicialLLT< SparseMatrixType, Lower> llt_colmajor_lower_amd; SimplicialLLT< SparseMatrixType, Upper> llt_colmajor_upper_amd; SimplicialLDLT< SparseMatrixType, Lower> ldlt_colmajor_lower_amd; SimplicialLDLT< SparseMatrixType, Upper> ldlt_colmajor_upper_amd; - SimplicialLDLT< SparseMatrixType, Lower, NaturalOrdering > ldlt_colmajor_lower_nat; - SimplicialLDLT< SparseMatrixType, Upper, NaturalOrdering > ldlt_colmajor_upper_nat; + SimplicialLDLT< SparseMatrixType, Lower, NaturalOrdering > ldlt_colmajor_lower_nat; + SimplicialLDLT< SparseMatrixType, Upper, NaturalOrdering > ldlt_colmajor_upper_nat; check_sparse_spd_solving(chol_colmajor_lower_amd); check_sparse_spd_solving(chol_colmajor_upper_amd); diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h index fe4d22803..f1c0284ea 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h @@ -47,9 +47,9 @@ namespace internal { */ -template constexpr inline T& array_get(std::vector& a) { return a[I]; } -template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } -template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } +template constexpr inline T& array_get(std::vector& a) { return a[I_]; } +template constexpr inline T&& array_get(std::vector&& a) { return a[I_]; } +template constexpr inline T const& array_get(std::vector const& a) { return a[I_]; } /* Suppose you have a 
template of the form * template struct X; diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 39c255791..834b20b55 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -197,13 +197,13 @@ EIGEN_DEVICE_FUNC bool operator==(const array& lhs, const array& rhs) namespace internal { -template +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { - return a[I]; + return a[I_]; } -template +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { - return a[I]; + return a[I_]; } template struct array_size > { @@ -240,16 +240,16 @@ namespace internal { * this may not be constexpr */ #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 -#define STD_GET_ARR_HACK a._M_instance[I] +#define STD_GET_ARR_HACK a._M_instance[I_] #elif defined(_LIBCPP_VERSION) -#define STD_GET_ARR_HACK a.__elems_[I] +#define STD_GET_ARR_HACK a.__elems_[I_] #else -#define STD_GET_ARR_HACK std::template get(a) +#define STD_GET_ARR_HACK std::template get(a) #endif -template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } -template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } -template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } +template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } +template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } +template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } #undef STD_GET_ARR_HACK diff --git a/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h index 8a536faf6..d02d86f85 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h @@ -166,13 
+166,13 @@ array repeat(t v) { return array; } -template +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list&) { - return get >::value; + return get >::value; } -template +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list&) { - return get >::value; + return get >::value; } template @@ -200,13 +200,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { } -template +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { - return a[I]; + return a[I_]; } -template +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a) { - return a[I]; + return a[I_]; } struct sum_op { diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h index 88acabcf8..2a833b0a4 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h +++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h @@ -177,9 +177,9 @@ namespace Eigen // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. // They are used in this class converters. // They are always different from each other, and their possible values are: 0, 1, or 2. - I = AlphaAxisAbs - 1, - J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3, - K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3 + I_ = AlphaAxisAbs - 1, + J_ = (AlphaAxisAbs - 1 + 1 + IsOdd)%3, + K_ = (AlphaAxisAbs - 1 + 2 - IsOdd)%3 ; // TODO: Get @mat parameter in form that avoids double evaluation. @@ -194,24 +194,24 @@ namespace Eigen const Scalar plusMinus = IsEven? 1 : -1; const Scalar minusPlus = IsOdd? 
1 : -1; - const Scalar Rsum = sqrt((mat(I,I) * mat(I,I) + mat(I,J) * mat(I,J) + mat(J,K) * mat(J,K) + mat(K,K) * mat(K,K))/2); - res[1] = atan2(plusMinus * mat(I,K), Rsum); + const Scalar Rsum = sqrt((mat(I_,I_) * mat(I_,I_) + mat(I_,J_) * mat(I_,J_) + mat(J_,K_) * mat(J_,K_) + mat(K_,K_) * mat(K_,K_))/2); + res[1] = atan2(plusMinus * mat(I_,K_), Rsum); // There is a singularity when cos(beta) == 0 if(Rsum > 4 * NumTraits::epsilon()) {// cos(beta) != 0 - res[0] = atan2(minusPlus * mat(J, K), mat(K, K)); - res[2] = atan2(minusPlus * mat(I, J), mat(I, I)); + res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_)); + res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_)); } - else if(plusMinus * mat(I, K) > 0) {// cos(beta) == 0 and sin(beta) == 1 - Scalar spos = mat(J, I) + plusMinus * mat(K, J); // 2*sin(alpha + plusMinus * gamma - Scalar cpos = mat(J, J) + minusPlus * mat(K, I); // 2*cos(alpha + plusMinus * gamma) + else if(plusMinus * mat(I_, K_) > 0) {// cos(beta) == 0 and sin(beta) == 1 + Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_); // 2*sin(alpha + plusMinus * gamma + Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_); // 2*cos(alpha + plusMinus * gamma) Scalar alphaPlusMinusGamma = atan2(spos, cpos); res[0] = alphaPlusMinusGamma; res[2] = 0; } else {// cos(beta) == 0 and sin(beta) == -1 - Scalar sneg = plusMinus * (mat(K, J) + minusPlus * mat(J, I)); // 2*sin(alpha + minusPlus*gamma) - Scalar cneg = mat(J, J) + plusMinus * mat(K, I); // 2*cos(alpha + minusPlus*gamma) + Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_)); // 2*sin(alpha + minusPlus*gamma) + Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_); // 2*cos(alpha + minusPlus*gamma) Scalar alphaMinusPlusBeta = atan2(sneg, cneg); res[0] = alphaMinusPlusBeta; res[2] = 0; @@ -230,24 +230,24 @@ namespace Eigen const Scalar plusMinus = IsEven? 1 : -1; const Scalar minusPlus = IsOdd? 
1 : -1; - const Scalar Rsum = sqrt((mat(I, J) * mat(I, J) + mat(I, K) * mat(I, K) + mat(J, I) * mat(J, I) + mat(K, I) * mat(K, I)) / 2); + const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) + mat(K_, I_) * mat(K_, I_)) / 2); - res[1] = atan2(Rsum, mat(I, I)); + res[1] = atan2(Rsum, mat(I_, I_)); // There is a singularity when sin(beta) == 0 if(Rsum > 4 * NumTraits::epsilon()) {// sin(beta) != 0 - res[0] = atan2(mat(J, I), minusPlus * mat(K, I)); - res[2] = atan2(mat(I, J), plusMinus * mat(I, K)); + res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_)); + res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_)); } - else if(mat(I, I) > 0) {// sin(beta) == 0 and cos(beta) == 1 - Scalar spos = plusMinus * mat(K, J) + minusPlus * mat(J, K); // 2*sin(alpha + gamma) - Scalar cpos = mat(J, J) + mat(K, K); // 2*cos(alpha + gamma) + else if(mat(I_, I_) > 0) {// sin(beta) == 0 and cos(beta) == 1 + Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_); // 2*sin(alpha + gamma) + Scalar cpos = mat(J_, J_) + mat(K_, K_); // 2*cos(alpha + gamma) res[0] = atan2(spos, cpos); res[2] = 0; } else {// sin(beta) == 0 and cos(beta) == -1 - Scalar sneg = plusMinus * mat(K, J) + plusMinus * mat(J, K); // 2*sin(alpha - gamma) - Scalar cneg = mat(J, J) - mat(K, K); // 2*cos(alpha - gamma) + Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_); // 2*sin(alpha - gamma) + Scalar cneg = mat(J_, J_) - mat(K_, K_); // 2*cos(alpha - gamma) res[0] = atan2(sneg, cneg); res[2] = 0; } diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp index 67533e364..4ddb5a2e8 100644 --- a/unsupported/test/EulerAngles.cpp +++ b/unsupported/test/EulerAngles.cpp @@ -72,9 +72,9 @@ void verify_euler(const EulerAngles& e) } } - const Vector3 I = EulerAnglesType::AlphaAxisVector(); - const Vector3 J = EulerAnglesType::BetaAxisVector(); - const Vector3 K = EulerAnglesType::GammaAxisVector(); + const Vector3 I_ = 
EulerAnglesType::AlphaAxisVector(); + const Vector3 J_ = EulerAnglesType::BetaAxisVector(); + const Vector3 K_ = EulerAnglesType::GammaAxisVector(); // Is approx checks VERIFY(e.isApprox(e)); @@ -97,7 +97,7 @@ void verify_euler(const EulerAngles& e) VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd); VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI); - const Matrix3 mbis(AngleAxisType(ebis.alpha(), I) * AngleAxisType(ebis.beta(), J) * AngleAxisType(ebis.gamma(), K)); + const Matrix3 mbis(AngleAxisType(ebis.alpha(), I_) * AngleAxisType(ebis.beta(), J_) * AngleAxisType(ebis.gamma(), K_)); VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE); VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix()); /*std::cout << "===================\n" << From 71429883ee41689fd657cdca824459f38ae53423 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 25 Jan 2019 17:00:21 -0800 Subject: [PATCH 176/295] Fix compilation error in NEON GEBP specializaition of madd. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c5d77763a..cc6f3f029 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1031,7 +1031,7 @@ struct gebp_traits EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { - c += a * b; + c = vfmaq_n_f32(c, a, b); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const From 5a52e35f9afe754a5c817b9faf1102b678b4646a Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 26 Jan 2019 13:18:21 +0100 Subject: [PATCH 177/295] Renaming some more `I` identifiers --- Eigen/src/Core/arch/AVX512/PacketMath.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 4832f2a3b..a2c9599b6 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -263,18 +263,18 @@ EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, } #ifdef EIGEN_VECTORIZE_AVX512DQ -template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I); } -template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I); } +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); } +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); } EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } #else // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 -template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { - return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I)); +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { + return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_)); } // AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512 -template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { - return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I)); +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { + return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_)); } EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { From c9825b967e85df0d893bc899dc019579876b9ce8 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 26 Jan 2019 13:22:13 +0100 Subject: [PATCH 178/295] Renaming even more `I` identifiers --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff 
--git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index bda114751..6706b44ff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -402,25 +402,25 @@ struct OuterReducer { #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) -template -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); +template +__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); #if defined(EIGEN_HAS_GPU_FP16) -template -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); -template -__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); -template -__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); +template +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*); +template +__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, half2*); +template +__global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif -template -__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template +__global__ void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); -template -__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); +template +__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif template , template friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) - template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); #if defined(EIGEN_HAS_GPU_FP16) - template KERNEL_FRIEND void 
internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, half2*); + template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif - template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); - template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif #if defined(EIGEN_USE_SYCL) From 53560f9186b805b3a1e53296aa8b855d3d47e181 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 28 Jan 2019 13:47:28 +0100 Subject: [PATCH 179/295] bug #1672: fix unit test compilation with MSVC by adding overloads of test_is* for long long (and factorize copy/paste code through a macro) --- test/main.h | 62 ++++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/test/main.h b/test/main.h index 0b5821cd5..1fe631ca9 100644 --- a/test/main.h +++ b/test/main.h @@ -407,42 +407,29 @@ template<> inline float test_precision >() { return test_pre template<> inline double test_precision >() { return test_precision(); } template<> inline long double test_precision >() { return test_precision(); } -inline bool test_isApprox(const short& a, const short& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isApprox(const unsigned short& a, const 
unsigned short& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isApprox(const unsigned int& a, const unsigned int& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isApprox(const unsigned long& a, const unsigned long& b) -{ return internal::isApprox(a, b, test_precision()); } +#define EIGEN_TEST_SCALAR_TEST_OVERLOAD(TYPE) \ + inline bool test_isApprox(TYPE a, TYPE b) \ + { return internal::isApprox(a, b, test_precision()); } \ + inline bool test_isMuchSmallerThan(TYPE a, TYPE b) \ + { return internal::isMuchSmallerThan(a, b, test_precision()); } \ + inline bool test_isApproxOrLessThan(TYPE a, TYPE b) \ + { return internal::isApproxOrLessThan(a, b, test_precision()); } -inline bool test_isApprox(const int& a, const int& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isMuchSmallerThan(const int& a, const int& b) -{ return internal::isMuchSmallerThan(a, b, test_precision()); } -inline bool test_isApproxOrLessThan(const int& a, const int& b) -{ return internal::isApproxOrLessThan(a, b, test_precision()); } +EIGEN_TEST_SCALAR_TEST_OVERLOAD(short) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned short) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(int) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned int) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(long) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long) +#if EIGEN_HAS_CXX11 +EIGEN_TEST_SCALAR_TEST_OVERLOAD(long long) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long long) +#endif +EIGEN_TEST_SCALAR_TEST_OVERLOAD(float) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(double) +EIGEN_TEST_SCALAR_TEST_OVERLOAD(half) -inline bool test_isApprox(const long& a, const long& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isMuchSmallerThan(const long& a, const long b) -{ return internal::isMuchSmallerThan(a, b, test_precision()); } -inline bool test_isApproxOrLessThan(const long& a, const long& b) -{ return internal::isApproxOrLessThan(a, b, test_precision()); } - -inline 
bool test_isApprox(const float& a, const float& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isMuchSmallerThan(const float& a, const float& b) -{ return internal::isMuchSmallerThan(a, b, test_precision()); } -inline bool test_isApproxOrLessThan(const float& a, const float& b) -{ return internal::isApproxOrLessThan(a, b, test_precision()); } - -inline bool test_isApprox(const double& a, const double& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isMuchSmallerThan(const double& a, const double& b) -{ return internal::isMuchSmallerThan(a, b, test_precision()); } -inline bool test_isApproxOrLessThan(const double& a, const double& b) -{ return internal::isApproxOrLessThan(a, b, test_precision()); } +#undef EIGEN_TEST_SCALAR_TEST_OVERLOAD #ifndef EIGEN_TEST_NO_COMPLEX inline bool test_isApprox(const std::complex& a, const std::complex& b) @@ -479,13 +466,6 @@ inline bool test_isApproxOrLessThan(const long double& a, const long double& b) { return internal::isApproxOrLessThan(a, b, test_precision()); } #endif // EIGEN_TEST_NO_LONGDOUBLE -inline bool test_isApprox(const half& a, const half& b) -{ return internal::isApprox(a, b, test_precision()); } -inline bool test_isMuchSmallerThan(const half& a, const half& b) -{ return internal::isMuchSmallerThan(a, b, test_precision()); } -inline bool test_isApproxOrLessThan(const half& a, const half& b) -{ return internal::isApproxOrLessThan(a, b, test_precision()); } - // test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox. 
template typename NumTraits::NonInteger test_relative_error(const EigenBase &a, const EigenBase &b) From 803fa79767cfbf662be2f0bcd01a3422e65f11ef Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 28 Jan 2019 17:24:44 +0100 Subject: [PATCH 180/295] Move evaluator::find(i,j) to a more general and reusable SparseCompressedBase::lower_bound(i,j) function --- Eigen/src/SparseCore/SparseCompressedBase.h | 35 ++++++++++++++------- Eigen/src/SparseCore/SparseUtil.h | 8 +++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index e0b3c22b6..6a2c7a8ce 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -128,6 +128,28 @@ class SparseCompressedBase protected: /** Default constructor. Do nothing. */ SparseCompressedBase() {} + + /** \internal return the index of the coeff at (row,col) or just before if it does not exist. + * This is an analogue of std::lower_bound. + */ + internal::LowerBoundIndex lower_bound(Index row, Index col) const + { + eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = this->outerIndexPtr()[outer]; + Index end = this->isCompressed() ? 
this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer]; + eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + internal::LowerBoundIndex p; + p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr(); + p.found = (p.valueinnerIndexPtr()[p.value]==inner); + return p; + } + + friend struct internal::evaluator >; + private: template explicit SparseCompressedBase(const SparseCompressedBase&); }; @@ -333,17 +355,8 @@ protected: Index find(Index row, Index col) const { - eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - - const Index outer = Derived::IsRowMajor ? row : col; - const Index inner = Derived::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr(); - - return ((pinnerIndexPtr()[p]==inner)) ? p : Dynamic; + internal::LowerBoundIndex p = m_matrix->lower_bound(row,col); + return p.found ? 
p.value : Dynamic; } const Derived *m_matrix; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 74df0d496..ceb936887 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -140,6 +140,14 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS template<> struct glue_shapes { typedef SparseSelfAdjointShape type; }; template<> struct glue_shapes { typedef SparseTriangularShape type; }; +// return type of SparseCompressedBase::lower_bound; +struct LowerBoundIndex { + LowerBoundIndex() : value(-1), found(false) {} + LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {} + Index value; + bool found; +}; + } // end namespace internal /** \ingroup SparseCore_Module From f489f445193e21748fbfd304373eaf9b822691e3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 28 Jan 2019 17:29:50 +0100 Subject: [PATCH 181/295] bug #1574: implement "sparse_matrix =,+=,-= diagonal_matrix" with smart insertion strategies of missing diagonal coeffs. 
--- Eigen/src/SparseCore/CompressedStorage.h | 16 ++++ Eigen/src/SparseCore/SparseAssign.h | 33 +++---- Eigen/src/SparseCore/SparseMatrix.h | 107 +++++++++++++++++++++++ test/sparse_basic.cpp | 24 ++++- 4 files changed, 156 insertions(+), 24 deletions(-) diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index d89fa0dae..acd986fab 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -207,6 +207,22 @@ class CompressedStorage return m_values[id]; } + void moveChunk(Index from, Index to, Index chunkSize) + { + eigen_internal_assert(to+chunkSize <= m_size); + if(to>from && from+chunkSize>to) + { + // move backward + internal::smart_memmove(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + else + { + internal::smart_copy(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + } + void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { Index k = 0; diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index 71452e75e..19a3e8e8b 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -246,35 +246,22 @@ struct Assignment { typedef typename DstXprType::StorageIndex StorageIndex; typedef typename DstXprType::Scalar Scalar; - typedef Array ArrayXI; - typedef Array ArrayXS; - template - static void run(SparseMatrix &dst, const SrcXprType &src, const internal::assign_op &/*func*/) - { - Index dstRows = src.rows(); - Index dstCols = src.cols(); - if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) - dst.resize(dstRows, dstCols); - Index size = src.diagonal().size(); - dst.makeCompressed(); - dst.resizeNonZeros(size); - Map(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1); - 
Map(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size)); - Map(dst.valuePtr(), size) = src.diagonal(); - } + template + static void run(SparseMatrix &dst, const SrcXprType &src, const AssignFunc &func) + { dst._assignDiagonal(src.diagonal(), func); } template static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::assign_op &/*func*/) - { - dst.diagonal() = src.diagonal(); - } + { dst.derived().diagonal() = src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) - { dst.diagonal() += src.diagonal(); } + template + static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.derived().diagonal() += src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) - { dst.diagonal() -= src.diagonal(); } + template + static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.derived().diagonal() -= src.diagonal(); } }; } // end namespace internal diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index eedae47e8..2ff12a4a5 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -502,6 +502,113 @@ class SparseMatrix m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; } } + + /** \internal assign \a diagXpr to the diagonal of \c *this + * There are different strategies: + * 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression. + * 2 - otherwise, for each diagonal coeff, + * 2.a - if it already exists, then we update it, + * 2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion. 
+ * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector. + * 3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements. + * + * TODO: some piece of code could be isolated and reused for a general in-place update strategy. + * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once), + * then it *might* be better to disable case 2.b since they will have to be copied anyway. + */ + template + void _assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) + { + struct Record { + Record(Index a_i, Index a_p) : i(a_i), p(a_p) {} + Index i; + Index p; + }; + + Index n = diagXpr.size(); + + const bool overwrite = internal::is_same >::value; + if(overwrite) + { + if((this->rows()!=n) || (this->cols()!=n)) + this->resize(n, n); + } + + if(m_data.size()==0 || overwrite) + { + typedef Array ArrayXI; + this->makeCompressed(); + this->resizeNonZeros(n); + Eigen::Map(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1); + Eigen::Map(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n)); + Eigen::Map > values = this->coeffs(); + values.setZero(); + internal::call_assignment_no_alias(values, diagXpr, assignFunc); + } + else + { + bool isComp = isCompressed(); + internal::evaluator diaEval(diagXpr); + std::vector newEntries; + + // 1 - try in-place update and record insertion failures + for(Index i = 0; ilower_bound(i,i); + Index p = lb.value; + if(lb.found) + { + // the coeff already exists + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i])) + { + // non compressed mode with local room for inserting one element + m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p); + m_innerNonZeros[i]++; + m_data.value(p) = Scalar(0); + m_data.index(p) = 
StorageIndex(i); + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else + { + // defer insertion + newEntries.push_back(Record(i,p)); + } + } + // 2 - insert deferred entries + Index n_entries = Index(newEntries.size()); + if(n_entries>0) + { + Storage newData(m_data.size()+n_entries); + Index prev_p = 0; + Index prev_i = 0; + for(Index k=0; k::dummy_precision()) diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 30bba3f07..9e735b38b 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -546,7 +546,7 @@ template void sparse_basic(const SparseMatrixType& re { DenseVector d = DenseVector::Random(rows); DenseMatrix refMat2 = d.asDiagonal(); - SparseMatrixType m2(rows, rows); + SparseMatrixType m2; m2 = d.asDiagonal(); VERIFY_IS_APPROX(m2, refMat2); SparseMatrixType m3(d.asDiagonal()); @@ -554,6 +554,28 @@ template void sparse_basic(const SparseMatrixType& re refMat2 += d.asDiagonal(); m2 += d.asDiagonal(); VERIFY_IS_APPROX(m2, refMat2); + m2.setZero(); m2 += d.asDiagonal(); + refMat2.setZero(); refMat2 += d.asDiagonal(); + VERIFY_IS_APPROX(m2, refMat2); + m2.setZero(); m2 -= d.asDiagonal(); + refMat2.setZero(); refMat2 -= d.asDiagonal(); + VERIFY_IS_APPROX(m2, refMat2); + + initSparse(density, refMat2, m2); + m2.makeCompressed(); + m2 += d.asDiagonal(); + refMat2 += d.asDiagonal(); + VERIFY_IS_APPROX(m2, refMat2); + + initSparse(density, refMat2, m2); + m2.makeCompressed(); + VectorXi res(rows); + for(Index i=0; i(0,3); + m2.reserve(res); + m2 -= d.asDiagonal(); + refMat2 -= d.asDiagonal(); + VERIFY_IS_APPROX(m2, refMat2); } // test conservative resize From a2a07e62b951e6b2c8cb0e8283849c0ef54a5f72 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 Jan 2019 10:10:07 +0100 Subject: [PATCH 182/295] Fix compilation with c++03 (local class cannot be template arguments), and make SparseMatrix::assignDiagonal truly protected. 
--- Eigen/src/SparseCore/SparseAssign.h | 2 +- Eigen/src/SparseCore/SparseMatrix.h | 216 ++++++++++++++-------------- 2 files changed, 110 insertions(+), 108 deletions(-) diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index 19a3e8e8b..905485c88 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -249,7 +249,7 @@ struct Assignment template static void run(SparseMatrix &dst, const SrcXprType &src, const AssignFunc &func) - { dst._assignDiagonal(src.diagonal(), func); } + { dst.assignDiagonal(src.diagonal(), func); } template static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::assign_op &/*func*/) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 2ff12a4a5..63dd1cc32 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -99,6 +99,8 @@ class SparseMatrix typedef SparseCompressedBase Base; using Base::convert_index; friend class SparseVector<_Scalar,0,_StorageIndex>; + template + friend struct internal::Assignment; public: using Base::isCompressed; using Base::nonZeros; @@ -503,113 +505,6 @@ class SparseMatrix } } - /** \internal assign \a diagXpr to the diagonal of \c *this - * There are different strategies: - * 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression. - * 2 - otherwise, for each diagonal coeff, - * 2.a - if it already exists, then we update it, - * 2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion. - * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector. 
- * 3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements. - * - * TODO: some piece of code could be isolated and reused for a general in-place update strategy. - * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once), - * then it *might* be better to disable case 2.b since they will have to be copied anyway. - */ - template - void _assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) - { - struct Record { - Record(Index a_i, Index a_p) : i(a_i), p(a_p) {} - Index i; - Index p; - }; - - Index n = diagXpr.size(); - - const bool overwrite = internal::is_same >::value; - if(overwrite) - { - if((this->rows()!=n) || (this->cols()!=n)) - this->resize(n, n); - } - - if(m_data.size()==0 || overwrite) - { - typedef Array ArrayXI; - this->makeCompressed(); - this->resizeNonZeros(n); - Eigen::Map(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1); - Eigen::Map(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n)); - Eigen::Map > values = this->coeffs(); - values.setZero(); - internal::call_assignment_no_alias(values, diagXpr, assignFunc); - } - else - { - bool isComp = isCompressed(); - internal::evaluator diaEval(diagXpr); - std::vector newEntries; - - // 1 - try in-place update and record insertion failures - for(Index i = 0; ilower_bound(i,i); - Index p = lb.value; - if(lb.found) - { - // the coeff already exists - assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); - } - else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i])) - { - // non compressed mode with local room for inserting one element - m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p); - m_innerNonZeros[i]++; - m_data.value(p) = Scalar(0); - m_data.index(p) = StorageIndex(i); - assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); - } - else - { - // defer insertion - 
newEntries.push_back(Record(i,p)); - } - } - // 2 - insert deferred entries - Index n_entries = Index(newEntries.size()); - if(n_entries>0) - { - Storage newData(m_data.size()+n_entries); - Index prev_p = 0; - Index prev_i = 0; - for(Index k=0; k::dummy_precision()) { @@ -1002,6 +897,113 @@ public: m_data.index(p) = convert_index(inner); return (m_data.value(p) = Scalar(0)); } +protected: + struct IndexPosPair { + IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {} + Index i; + Index p; + }; + + /** \internal assign \a diagXpr to the diagonal of \c *this + * There are different strategies: + * 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression. + * 2 - otherwise, for each diagonal coeff, + * 2.a - if it already exists, then we update it, + * 2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion. + * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector. + * 3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements. + * + * TODO: some piece of code could be isolated and reused for a general in-place update strategy. + * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once), + * then it *might* be better to disable case 2.b since they will have to be copied anyway. 
+ */ + template + void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) + { + Index n = diagXpr.size(); + + const bool overwrite = internal::is_same >::value; + if(overwrite) + { + if((this->rows()!=n) || (this->cols()!=n)) + this->resize(n, n); + } + + if(m_data.size()==0 || overwrite) + { + typedef Array ArrayXI; + this->makeCompressed(); + this->resizeNonZeros(n); + Eigen::Map(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1); + Eigen::Map(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n)); + Eigen::Map > values = this->coeffs(); + values.setZero(); + internal::call_assignment_no_alias(values, diagXpr, assignFunc); + } + else + { + bool isComp = isCompressed(); + internal::evaluator diaEval(diagXpr); + std::vector newEntries; + + // 1 - try in-place update and record insertion failures + for(Index i = 0; ilower_bound(i,i); + Index p = lb.value; + if(lb.found) + { + // the coeff already exists + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i])) + { + // non compressed mode with local room for inserting one element + m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p); + m_innerNonZeros[i]++; + m_data.value(p) = Scalar(0); + m_data.index(p) = StorageIndex(i); + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else + { + // defer insertion + newEntries.push_back(IndexPosPair(i,p)); + } + } + // 2 - insert deferred entries + Index n_entries = Index(newEntries.size()); + if(n_entries>0) + { + Storage newData(m_data.size()+n_entries); + Index prev_p = 0; + Index prev_i = 0; + for(Index k=0; k Date: Tue, 29 Jan 2019 10:27:13 +0100 Subject: [PATCH 183/295] bug #1669: fix PartialPivLU/inverse with zero-sized matrices. 
--- Eigen/src/LU/PartialPivLU.h | 5 ++++- test/inverse.cpp | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 8726bf895..b414b5c46 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -511,7 +511,10 @@ void PartialPivLU::compute() // the row permutation is stored as int indices, so just to be sure: eigen_assert(m_lu.rows()::highest()); - m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + if(m_lu.cols()>0) + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + else + m_l1_norm = RealScalar(0); eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = m_lu.rows(); diff --git a/test/inverse.cpp b/test/inverse.cpp index 8754cb7e5..99f9e0c9b 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -105,6 +105,22 @@ template void inverse(const MatrixType& m) } } +template +void inverse_zerosized() +{ + Matrix A(0,0); + { + Matrix b, x; + x = A.inverse() * b; + } + { + Matrix b(0,1), x; + x = A.inverse() * b; + VERIFY_IS_EQUAL(x.rows(), 0); + VERIFY_IS_EQUAL(x.cols(), 1); + } +} + EIGEN_DECLARE_TEST(inverse) { int s = 0; @@ -118,6 +134,7 @@ EIGEN_DECLARE_TEST(inverse) s = internal::random(50,320); CALL_SUBTEST_5( inverse(MatrixXf(s,s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) + CALL_SUBTEST_5( inverse_zerosized() ); s = internal::random(25,100); CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) ); From efe02292a6f45a664d2012ee208b41b28b65cb14 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 Jan 2019 11:53:47 +0100 Subject: [PATCH 184/295] Add recent gemm related changesets and various cleanups in perf-monitoring --- bench/perf_monitoring/changesets.txt | 74 ++++++++++--------- bench/perf_monitoring/make_plot.sh | 18 ++++- .../resources/chart_footer.html | 14 ++-- bench/perf_monitoring/run.sh | 5 +- 4 files changed, 68 insertions(+), 43 deletions(-) diff --git 
a/bench/perf_monitoring/changesets.txt b/bench/perf_monitoring/changesets.txt index c970386c3..c6b364593 100644 --- a/bench/perf_monitoring/changesets.txt +++ b/bench/perf_monitoring/changesets.txt @@ -10,7 +10,7 @@ 5908:f8ee3c721251 # improve packing with ptranspose #5921:ca808bb456b0 # merge #5927:8b1001f9e3ac -5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks +5937:5a4ca1ad8c53 # New gebp kernel: up to 3 packets x 4 register-level blocks #5949:f3488f4e45b2 # merge #5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec #5992:4a429f5e0483 # merge @@ -24,7 +24,7 @@ before-evaluators #6726:ff2d2388e7b9 # merge default to tensors #6742:0cbd6195e829 # merge default to tensors #6747:853d2bafeb8f # Generalized the gebp apis -6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation +6765:71584fd55762 # Made the blocking computation aware of the l3 cache;
Also optimized the blocking parameters to take
into account the number of threads used for a computation. 6781:9cc5a931b2c6 # generalized gemv 6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product #6844:039efd86b75c # merge tensor @@ -38,50 +38,56 @@ before-evaluators 6933:52572e60b5d3 # blocking size strategy 6937:c8c042f286b2 # avoid redundant pack_rhs 6981:7e5d6f78da59 # dynamic loop swapping -6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache -6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. -7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) -7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables +6984:45f26866c091 # rm dynamic loop swapping,
adjust lhs's micro panel height to fully exploit L1 cache +6986:a675d05b6f8f # blocking heuristic:
block on the rhs in L1 if the lhs fit in L1. +7013:f875e75f07e5 # organize a little our default cache sizes,
and use a saner default L1 outside of x86 (10% faster on Nexus 5) +7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room
for the possibility of using lookup tables 7016:a58d253e8c91 # Polish lookup tables generation -7018:9b27294a8186 # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment -7019:c758b1e2c073 # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now. -7085:627e039fba68 # Bug 986: add support for coefficient-based product with 0 depth. -7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code +7018:9b27294a8186 # actual_panel_rows computation should always be resilient
to parameters not consistent with the known L1 cache size, see comment +7019:c758b1e2c073 # Provide an empirical lookup table for blocking sizes measured on a Nexus 5.
Only for float, only for Android on ARM 32bit for now. +7085:627e039fba68 # Bug 986: add support for coefficient-based
product with 0 depth. +7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-SIMD scalar types. 7591:09a8e2186610 # 3.3-alpha1 7650:b0f3c8f43025 # help clang inlining -7708:dfc6ab9d9458 # Improve numerical accuracy in LLT and triangular solve by using true scalar divisions (instead of x * (1/y)) -#8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs) -8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes -8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path +7708:dfc6ab9d9458 # Improve numerical accuracy in LLT and triangular solve
by using true scalar divisions (instead of x * (1/y)) +#8744:74b789ada92a # Improved the matrix multiplication blocking in the case
where mr is not a power of 2 (e.g on Haswell CPUs) +8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes.
Use numext::mini and numext::maxi instead of
std::min/std::max to compute blocking sizes. +8972:81d53c711775 # Don't optimize the processing of the last rows of
a matrix matrix product in cases that violate
the assumptions made by the optimized code path. 8985:d935df21a082 # Remove the rotating kernel. 8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores. -9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators +9148:b8b8c421e36c # Relax mixing-type constraints for binary coeff-wise operators 9174:d228bc282ac9 # merge 9175:abc7a3600098 # Include the cost of stores in unrolling -9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955 -9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775 +9212:c90098affa7b # Fix perf regression introduced in changeset 8aad8f35c955 +9213:9f1c14e4694b # Fix perf regression in dgemm introduced by changeset 81d53c711775 9361:69d418c06999 # 3.3-beta2 -9445:f27ff0ad77a3 # Optimize expression matching "d?=a-b*c" as "d?=a; d?=b*c;" +9445:f27ff0ad77a3 # Optimize expression matching 'd?=a-b*c' as 'd?=a; d?=b*c;' 9583:bef509908b9d # 3.3-rc1 -9593:2f24280cf59a # Bug 1311: fix alignment logic in some cases of (scalar*small).lazyProduct(small) -9722:040d861b88b5 # Disabled part of the matrix matrix peeling code that's incompatible with 512 bit registers +9593:2f24280cf59a # Bug 1311: fix alignment logic in some cases
of (scalar*small).lazyProduct(small) +9722:040d861b88b5 # Disabled part of the matrix matrix peeling code
that's incompatible with 512 bit registers 9792:26667be4f70b # 3.3.0 -9891:41260bdfc23b # Fix a performance regression in (mat*mat)*vec for which mat*mat was evaluated multiple times. +9891:41260bdfc23b # Fix a performance regression in (mat*mat)*vec
for which mat*mat was evaluated multiple times. 9942:b1d3eba60130 # Operators += and -= do not resize! -9943:79bb9887afd4 # Ease compiler job to generate clean and efficient code in mat*vec -9946:2213991340ea # Complete rewrite of column-major-matrix * vector product to deliver higher performance of modern CPU. -9955:630471c3298c # Improve performance of row-major-dense-matrix * vector products for recent CPUs. (this is the next changeset fixing a typo) +9943:79bb9887afd4 # Ease compiler generating clean and efficient code in mat*vec +9946:2213991340ea # Complete rewrite of column-major-matrix * vector product
to deliver higher performance on modern CPUs. +9955:630471c3298c # Improve performance of row-major-dense-matrix * vector products
for recent CPUs. 9975:2eeed9de710c # Revert vec/y to vec*(1/y) in row-major TRSM 10442:e3f17da72a40 # Bug 1435: fix aliasing issue in exressions like: A = C - B*A; -10735:6913f0cf7d06 # Adds missing EIGEN_STRONG_INLINE to support MSVC properly inlining small vector calculations -10943:4db388d946bd # Bug 1562: optimize evaluation of small products of the form s*A*B by rewriting them as: s*(A.lazyProduct(B)) to save a costly temporary. Measured speedup from 2x to 5x. -10961:5007ff66c9f6 # Introduce the macro ei_declare_local_nested_eval to help allocating on the stack local temporaries via alloca, and let outer-products makes a good use of it. +10735:6913f0cf7d06 # Adds missing EIGEN_STRONG_INLINE to support MSVC
properly inlining small vector calculations +10943:4db388d946bd # Bug 1562: optimize evaluation of small products
of the form s*A*B by rewriting them as: s*(A.lazyProduct(B))
to save a costly temporary.
Measured speedup from 2x to 5x. +10961:5007ff66c9f6 # Introduce the macro ei_declare_local_nested_eval to
help allocating on the stack local temporaries via alloca,
and let outer-products makes a good use of it. 11083:30a528a984bb # Bug 1578: Improve prefetching in matrix multiplication on MIPS. 11533:71609c41e9f8 # PR 526: Speed up multiplication of small, dynamically sized matrices -11535:6d348dc9b092 # Vectorize row-by-row gebp loop iterations on 16 packets as well -11568:efda481cbd7a # Bug 1624: improve matrix-matrix product on ARM 64, 20% speedup -11596:b8d3f548a9d9 # do not read buffers out of bounds -11628:22f9cc0079bd # Implement AVX512 vectorization of std::complex -11638:81172653b67b # Bug 1515: disable gebp's 3pX4 micro kernel for MSVC<=19.14 because of register spilling. -11659:b500fef42ced # Artificially increase l1-blocking size for AVX512. +10% speedup with current kernels. - +11545:6d348dc9b092 # Vectorize row-by-row gebp loop iterations on 16 packets as well +11579:efda481cbd7a # Bug 1624: improve matrix-matrix product on ARM 64, 20% speedup +11606:b8d3f548a9d9 # do not read buffers out of bounds +11638:22f9cc0079bd # Implement AVX512 vectorization of std::complex +11642:9f52fde03483 # Bug 1636: fix gemm performance issue with gcc>=6 and no FMA +11648:81172653b67b # Bug 1515: disable gebp's 3pX4 micro kernel
for MSVC<=19.14 because of register spilling. +11654:b81188e099f3 # fix EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
for non vectorized type, and non x86/64 target +11664:71546f1a9f0c # enable spilling workaround on architectures with SSE/AVX +11669:b500fef42ced # Artificially increase l1-blocking size for AVX512.
+10% speedup with current kernels. +11683:2ea2960f1c7f # Make code compile again for older compilers. +11753:556fb4ceb654 # Bug: 1633: refactor gebp kernel and optimize for neon +11761:cefc1ba05596 # Bug 1661: fix regression in GEBP and AVX512 +11763:1e41e70fe97b # GEBP: cleanup logic to choose between
a 4 packets of 1 packet (=209bf81aa3f3+fix) \ No newline at end of file diff --git a/bench/perf_monitoring/make_plot.sh b/bench/perf_monitoring/make_plot.sh index ca9fa9662..65aaf66f9 100755 --- a/bench/perf_monitoring/make_plot.sh +++ b/bench/perf_monitoring/make_plot.sh @@ -64,8 +64,11 @@ do i=0 while read line2 do - if [ ! -z '$line2' ]; then - echo '{"r":'$i',"v":'`echo $line2 | cut -f $col -d ' '`'},' >> $WHAT.html + if [ ! -z "$line2" ]; then + val=`echo $line2 | cut -s -f $col -d ' '` + if [ -n "$val" ]; then # skip build failures + echo '{"r":'$i',"v":'$val'},' >> $WHAT.html + fi fi ((i++)) done < $WHAT.out @@ -84,6 +87,17 @@ do done < $WHAT.out echo '];' >> $WHAT.html +echo 'var changesets_details = [' >> $WHAT.html +while read line2 +do + if [ ! -z '$line2' ]; then + num=`echo "$line2" | cut -f 1 -d ' '` + comment=`grep ":$num" changesets.txt | cut -f 2 -d '#'` + echo '"'"$comment"'",' >> $WHAT.html + fi +done < $WHAT.out +echo '];' >> $WHAT.html + echo 'var changesets_count = [' >> $WHAT.html i=0 while read line2 diff --git a/bench/perf_monitoring/resources/chart_footer.html b/bench/perf_monitoring/resources/chart_footer.html index 8acc69f14..e8ef0a270 100644 --- a/bench/perf_monitoring/resources/chart_footer.html +++ b/bench/perf_monitoring/resources/chart_footer.html @@ -14,12 +14,16 @@ .tickFormat(function(d){return changesets[d]}) .rotateLabels(-90); - chart.y(function(datum){ return datum.v; }) - .yAxis.options({ - axisLabel: customSettings.YLABEL || 'GFlops'/*, - tickFormat: function(val){ return d3.format('.0f')(val) + ' GFlops'; }*/ - }); + chart.y(function(datum){ return datum.v; }) + .yAxis.options({ + axisLabel: customSettings.YLABEL || 'GFlops'/*, + tickFormat: function(val){ return d3.format('.0f')(val) + ' GFlops'; }*/ + }); + chart.tooltip.headerFormatter(function(d) { return changesets[d] + + '

' + + changesets_details[d] + "

"; }); + //chart.useInteractiveGuideline(true); d3.select('#chart').datum(data).call(chart); var plot = d3.select('#chart > g'); diff --git a/bench/perf_monitoring/run.sh b/bench/perf_monitoring/run.sh index 4e8f73c7f..3022adfd1 100755 --- a/bench/perf_monitoring/run.sh +++ b/bench/perf_monitoring/run.sh @@ -148,9 +148,10 @@ make_backup $WORKING_DIR_PREFIX"c"$bench cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev do if [ ! -z '$rev' ]; then - echo "Testing rev $rev" + rev2=`echo $rev | cut -f 2 -d':'` + echo "Testing rev $rev, $rev2" cd eigen_src - hg up -C $rev > /dev/null + hg up -C $rev2 > /dev/null actual_rev=`hg identify | cut -f1 -d' '` cd .. From a7779a9b427ffe755fb163e7bfdb1eda4fb1f73e Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 29 Jan 2019 16:48:21 +0100 Subject: [PATCH 185/295] Hide some annoying unused variable warnings in g++8.1 --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 2d3b69128..64dfcd297 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -120,6 +120,7 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? 
row : col; Index linidx = 0; for (int i = static_cast(array_size::value) - 1; i > 0; i--) { @@ -158,6 +159,7 @@ class SimpleTensorContractionMapper { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size::value > array_size::value) { From be5b0f664ab1481e74d72e01d4f9172cf927b221 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 11:48:25 +0100 Subject: [PATCH 186/295] ARM64 & GEBP: Make use of vfmaq_laneq_f32 and workaround GCC's issue in generating good ASM --- .../Core/products/GeneralBlockPanelKernel.h | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index cc6f3f029..dea8c94eb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1008,17 +1008,17 @@ struct gebp_traits EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = *b; + dest = *b; } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - dest = vld1q_f32(b); + dest = vld1q_f32(b); } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = *b; + dest = *b; } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const @@ -1034,24 +1034,19 @@ struct gebp_traits c = vfmaq_n_f32(c, a, b); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const 
{ - c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const - { - c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const - { - c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const - { - c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); + #if EIGEN_COMP_GNUC_STRICT + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif } }; @@ -1260,7 +1255,14 @@ void gebp_kernel); \ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ From 3775926bbae69c23584dd9e6acdbe20ee6ac7050 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 11:49:06 +0100 Subject: [PATCH 187/295] ARM64 & GEBP: add specialization for double +30% speed up --- .../Core/products/GeneralBlockPanelKernel.h | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index dea8c94eb..c8c3d69cc 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1050,6 +1050,65 @@ struct gebp_traits } }; + +template<> +struct gebp_traits + : gebp_traits +{ + typedef double 
RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + { + #if EIGEN_COMP_GNUC_STRICT + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } +}; + #endif /* optimized General packed Block * packed Panel product kernel From df12fae8b8c0e10f671ffdded241be7a71684ffb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 11:52:28 +0100 Subject: [PATCH 188/295] According to 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101, the previous GCC issue is fixed in GCC trunk (will be gcc 9). --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c8c3d69cc..86767d0e9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1037,7 +1037,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { - #if EIGEN_COMP_GNUC_STRICT + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f32 is implemented through a costly dup if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); @@ -1093,7 +1093,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { - #if EIGEN_COMP_GNUC_STRICT + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f64 is implemented through a costly dup if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); @@ -1314,7 +1314,7 @@ void gebp_kernel Date: Wed, 30 Jan 2019 13:09:21 +0100 Subject: [PATCH 189/295] Slightly extend discussions on auto and move the content of the Pit falls wiki page here. 
http://eigen.tuxfamily.org/index.php?title=Pit_Falls --- doc/Pitfalls.dox | 84 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/doc/Pitfalls.dox b/doc/Pitfalls.dox index 3f395053d..fda402572 100644 --- a/doc/Pitfalls.dox +++ b/doc/Pitfalls.dox @@ -7,14 +7,30 @@ namespace Eigen { See this \link TopicTemplateKeyword page \endlink. + \section TopicPitfalls_aliasing Aliasing Don't miss this \link TopicAliasing page \endlink on aliasing, especially if you got wrong results in statements where the destination appears on the right hand side of the expression. + +\section TopicPitfalls_alignment_issue Alignment Issues (runtime assertion) + +%Eigen does explicit vectorization, and while that is appreciated by many users, that also leads to some issues in special situations where data alignment is compromised. +Indeed, prior to C++17, C++ did not have quite good enough support for explicit data alignment. +In that case your program hits an assertion failure (that is, a "controlled crash") with a message that tells you to consult this page: +\code +http://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html +\endcode +Have a look at \link TopicUnalignedArrayAssert it \endlink and see for yourself if that's something that you can cope with. +It contains detailed information about how to deal with each known cause for that issue. + +Now what if you don't care about vectorization and so don't want to be annoyed with these alignment issues? Then read \link getrid how to get rid of them \endlink. + + \section TopicPitfalls_auto_keyword C++11 and the auto keyword -In short: do not use the auto keywords with Eigen's expressions, unless you are 100% sure about what you are doing. In particular, do not use the auto keyword as a replacement for a Matrix<> type. Here is an example: +In short: do not use the auto keywords with %Eigen's expressions, unless you are 100% sure about what you are doing.
In particular, do not use the auto keyword as a replacement for a \c Matrix<> type. Here is an example: \code MatrixXd A, B; @@ -22,23 +38,81 @@ auto C = A*B; for(...) { ... w = C * v; ...} \endcode -In this example, the type of C is not a MatrixXd but an abstract expression representing a matrix product and storing references to A and B. Therefore, the product of A*B will be carried out multiple times, once per iteration of the for loop. Moreover, if the coefficients of A or B change during the iteration, then C will evaluate to different values. +In this example, the type of C is not a \c MatrixXd but an abstract expression representing a matrix product and storing references to \c A and \c B. +Therefore, the product of \c A*B will be carried out multiple times, once per iteration of the for loop. +Moreover, if the coefficients of A or B change during the iteration, then C will evaluate to different values. Here is another example leading to a segfault: \code auto C = ((A+B).eval()).transpose(); // do something with C \endcode -The problem is that eval() returns a temporary object (in this case a MatrixXd) which is then referenced by the Transpose<> expression. However, this temporary is deleted right after the first line, and there the C expression reference a dead object. The same issue might occur when sub expressions are automatically evaluated by Eigen as in the following example: +The problem is that \c eval() returns a temporary object (in this case a \c MatrixXd) which is then referenced by the \c Transpose<> expression. +However, this temporary is deleted right after the first line, and then the \c C expression references a dead object. 
+One possible fix consists in applying \c eval() on the whole expression: +\code +auto C = (A+B).transpose().eval(); +\endcode + +The same issue might occur when sub expressions are automatically evaluated by %Eigen as in the following example: \code VectorXd u, v; auto C = u + (A*v).normalized(); // do something with C \endcode -where the normalized() method has to evaluate the expensive product A*v to avoid evaluating it twice. On the other hand, the following example is perfectly fine: +Here the \c normalized() method has to evaluate the expensive product \c A*v to avoid evaluating it twice. +Again, one possible fix is to call \c .eval() on the whole expression: \code auto C = (u + (A*v).normalized()).eval(); \endcode -In this case, C will be a regular VectorXd object. +In this case, \c C will be a regular \c VectorXd object. +Note that DenseBase::eval() is smart enough to avoid copies when the underlying expression is already a plain \c Matrix<>. + + +\section TopicPitfalls_header_issues Header Issues (failure to compile) + +With all libraries, one must check the documentation for which header to include. +The same is true with %Eigen, but slightly worse: with %Eigen, a method in a class may require an additional #include over what the class itself requires! +For example, if you want to use the \c cross() method on a vector (it computes a cross-product) then you need to: +\code +#include <Eigen/Geometry> +\endcode +We try to always document this, but do tell us if we forgot an occurrence. + + +\section TopicPitfalls_ternary_operator Ternary operator + +In short: avoid the use of the ternary operator (COND ? THEN : ELSE) with %Eigen's expressions for the \c THEN and \c ELSE statements. +To see why, let's consider the following example: +\code +Vector3f A; +A << 1, 2, 3; +Vector3f B = ((1 < 0) ? (A.reverse()) : A); +\endcode +This example will return B = 3, 2, 1. Do you see why?
+The reason is that in c++ the type of the \c ELSE statement is inferred from the type of the \c THEN expression such that both match. +Since \c THEN is a Reverse, the \c ELSE statement A is converted to a Reverse, and the compiler thus generates: +\code +Vector3f B = ((1 < 0) ? (A.reverse()) : Reverse<Vector3f>(A)); +\endcode +In this very particular case, a workaround would be to call A.reverse().eval() for the \c THEN statement, but the safest and fastest is really to avoid this ternary operator with %Eigen's expressions and use an if/else construct. + + +\section TopicPitfalls_pass_by_value Pass-by-value + +If you don't know why passing-by-value is wrong with %Eigen, read this \link TopicPassingByValue page \endlink first. + +While you may be extremely careful and use care to make sure that all of your code that explicitly uses %Eigen types is pass-by-reference you have to watch out for templates which define the argument types at compile time. + +If a template has a function that takes arguments pass-by-value, and the relevant template parameter ends up being an %Eigen type, then you will of course have the same alignment problems that you would in an explicitly defined function passing %Eigen types by value. + +Using %Eigen types with other third party libraries or even the STL can present the same problem. +boost::bind for example uses pass-by-value to store arguments in the returned functor. +This will of course be a problem. + +There are at least two ways around this: + - If the value you are passing is guaranteed to be around for the life of the functor, you can use boost::ref() to wrap the value as you pass it to boost::bind. Generally this is not a solution for values on the stack as if the functor ever gets passed to a lower or independent scope, the object may be gone by the time it's attempted to be used. + - The other option is to make your functions take a reference counted pointer like boost::shared_ptr as the argument.
This avoids needing to worry about managing the lifetime of the object being passed. + */ } From d586686924c2783f56bd514c9365afeecc3e84f6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 16:48:01 +0100 Subject: [PATCH 190/295] Workaround lack of support for arbitrary packet-type in Tensor by manually loading half/quarter packets in tensor contraction mapper. --- .../src/Tensor/TensorContractionMapper.h | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 64dfcd297..142492603 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -241,8 +241,10 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size==packet_size,PacketT>::type + load(Index i, Index j) const + { // whole method makes column major assumption // don't need to add offsets for now (because operator handles that) @@ -283,6 +285,29 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper(data); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size!=packet_size,PacketT>::type + load(Index i, Index j) const + { + const Index requested_packet_size = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; + + const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < requested_packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = 
this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload(data); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { From de77bf5d6c4fb63a07a7bf7201b26f435d9b19b5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 16:48:20 +0100 Subject: [PATCH 191/295] Fix compilation with ARM64. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 030c7740a..f07746fdb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1053,9 +1053,9 @@ protected: #if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON -template -struct gebp_traits - : gebp_traits +template<> +struct gebp_traits + : gebp_traits { typedef float RhsPacket; From 7ef879f6bfa465a80109216e6d0b18266ef97321 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 23:45:12 +0100 Subject: [PATCH 192/295] GEBP: improves pipelining in the 1pX4 path with FMA. Prior to this change, a product with a LHS having 8 rows was faster with AVX-only than with AVX+FMA. With AVX+FMA I measured a speed up of about x1.25 in such cases. 
--- .../Core/products/GeneralBlockPanelKernel.h | 45 +++++++++++++------ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index f07746fdb..e55c2ade8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1313,15 +1313,18 @@ struct lhs_process_one_packet EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) { - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); - traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); - traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); - traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); - traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); - traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); - traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); } EIGEN_STRONG_INLINE void operator()( @@ -1350,6 +1353,16 @@ struct lhs_process_one_packet traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + // To improve instruction pipelining, let's double the accumulation registers: + // 
even k will accumulate in C*, while odd k will accumulate in D*. + // This trick is crucial to get good performance with FMA, otherwise it is + // actually faster to perform separated MUL+ADD because of a naturally + // better instruction-level parallelism. + AccPacket D0, D1, D2, D3; + traits.initAcc(D0); + traits.initAcc(D1); + traits.initAcc(D2); + traits.initAcc(D3); LinearMapper r0 = res.getLinearMapper(i, j2 + 0); LinearMapper r1 = res.getLinearMapper(i, j2 + 1); @@ -1364,7 +1377,7 @@ struct lhs_process_one_packet // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; prefetch(&blB[0]); - LhsPacket A0; + LhsPacket A0, A1; for(Index k=0; k Date: Thu, 31 Jan 2019 14:24:08 -0800 Subject: [PATCH 193/295] Speed up Eigen matrix*vector and vector*matrix multiplication. This change speeds up Eigen matrix * vector and vector * matrix multiplication for dynamic matrices when it is known at runtime that one of the factors is a vector. The benchmarks below test c.noalias()= n_by_n_matrix * n_by_1_matrix; c.noalias()= 1_by_n_matrix * n_by_n_matrix; respectively.
Benchmark measurements: SSE: Run on *** (72 X 2992 MHz CPUs); 2019-01-28T17:51:44.452697457-08:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Base (ns) New (ns) Improvement ------------------------------------------------------------------ BM_MatVec/64 1096 312 +71.5% BM_MatVec/128 4581 1464 +68.0% BM_MatVec/256 18534 5710 +69.2% BM_MatVec/512 118083 24162 +79.5% BM_MatVec/1k 704106 173346 +75.4% BM_MatVec/2k 3080828 742728 +75.9% BM_MatVec/4k 25421512 4530117 +82.2% BM_VecMat/32 352 130 +63.1% BM_VecMat/64 1213 425 +65.0% BM_VecMat/128 4640 1564 +66.3% BM_VecMat/256 17902 5884 +67.1% BM_VecMat/512 70466 24000 +65.9% BM_VecMat/1k 340150 161263 +52.6% BM_VecMat/2k 1420590 645576 +54.6% BM_VecMat/4k 8083859 4364327 +46.0% AVX2: Run on *** (72 X 2993 MHz CPUs); 2019-01-28T17:45:11.508545307-08:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Base (ns) New (ns) Improvement ------------------------------------------------------------------ BM_MatVec/64 619 120 +80.6% BM_MatVec/128 9693 752 +92.2% BM_MatVec/256 38356 2773 +92.8% BM_MatVec/512 69006 12803 +81.4% BM_MatVec/1k 443810 160378 +63.9% BM_MatVec/2k 2633553 646594 +75.4% BM_MatVec/4k 16211095 4327148 +73.3% BM_VecMat/64 925 227 +75.5% BM_VecMat/128 3438 830 +75.9% BM_VecMat/256 13427 2936 +78.1% BM_VecMat/512 53944 12473 +76.9% BM_VecMat/1k 302264 157076 +48.0% BM_VecMat/2k 1396811 675778 +51.6% BM_VecMat/4k 8962246 4459010 +50.2% AVX512: Run on *** (72 X 2993 MHz CPUs); 2019-01-28T17:35:17.239329863-08:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Base (ns) New (ns) Improvement ------------------------------------------------------------------ BM_MatVec/64 401 111 +72.3% BM_MatVec/128 1846 513 +72.2% BM_MatVec/256 36739 1927 +94.8% BM_MatVec/512 54490 9227 +83.1% BM_MatVec/1k 487374 161457 +66.9% BM_MatVec/2k 2016270 643824 +68.1% BM_MatVec/4k 13204300 4077412 
+69.1% BM_VecMat/32 324 106 +67.3% BM_VecMat/64 1034 246 +76.2% BM_VecMat/128 3576 802 +77.6% BM_VecMat/256 13411 2561 +80.9% BM_VecMat/512 58686 10037 +82.9% BM_VecMat/1k 320862 163750 +49.0% BM_VecMat/2k 1406719 651397 +53.7% BM_VecMat/4k 7785179 4124677 +47.0% --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 158 ++++++++++++++---- 1 file changed, 129 insertions(+), 29 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index f49abcad5..4bcccd326 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -404,13 +404,13 @@ class gemm_blocking_space -struct generic_product_impl - : generic_product_impl_base > -{ +template 1 || Dest::RowsAtCompileTime > 1), + bool MultipleColsAtCompileTime = + (Rhs::ColsAtCompileTime > 1 || Dest::ColsAtCompileTime > 1)> +struct gemm_selector { typedef typename Product::Scalar Scalar; - typedef typename Lhs::Scalar LhsScalar; - typedef typename Rhs::Scalar RhsScalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -420,10 +420,130 @@ struct generic_product_impl typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; typedef typename internal::remove_all::type ActualRhsTypeCleaned; + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { + gemm_selector::run(dst, a_lhs, a_rhs, alpha); + } else if (a_rhs.cols() == 1) { + // matrix * vector. + internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs, a_rhs.col(0), dst, alpha); + } else { + // vector * matrix.
+ internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs.row(0), a_rhs, dst, alpha); + } + } +}; + +template +struct gemm_selector { + typedef typename Product::Scalar Scalar; + + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all::type ActualLhsTypeCleaned; + + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { + gemm_selector::run(dst, a_lhs, a_rhs, alpha); + } else { + // matrix * vector. + internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs, a_rhs.col(0), dst, alpha); + } + } +}; + +template +struct gemm_selector { + typedef typename Product::Scalar Scalar; + + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all::type ActualRhsTypeCleaned; + + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { + gemm_selector::run(dst, a_lhs, a_rhs, alpha); + } else { + // vector * matrix. 
+ internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs.row(0), a_rhs, dst, alpha); + } + } +}; + +template +struct gemm_selector { + typedef typename Product::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef + typename internal::remove_all::type ActualLhsTypeCleaned; + + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef + typename internal::remove_all::type ActualRhsTypeCleaned; + enum { - MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED( + Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) }; + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, + const Scalar& alpha) { + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * + RhsBlasTraits::extractScalarFactor(a_rhs); + typename internal::add_const_on_value_type::type lhs = + LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type::type rhs = + RhsBlasTraits::extract(a_rhs); + typedef internal::gemm_blocking_space< + (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar, RhsScalar, + Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime, + MaxDepthAtCompileTime> + BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, LhsScalar, + (ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, + bool(LhsBlasTraits::NeedToConjugate), RhsScalar, + (ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, + bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags & RowMajorBit) ? 
RowMajor : ColMajor>, + ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> + GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 || + Dest::MaxRowsAtCompileTime == Dynamic)>( + GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), + a_rhs.cols(), a_lhs.cols(), Dest::Flags & RowMajorBit); + } +}; + +template +struct generic_product_impl + : generic_product_impl_base > +{ + typedef typename Product::Scalar Scalar; typedef generic_product_impl lazyproduct; template @@ -450,7 +570,7 @@ struct generic_product_impl if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else - scaleAndAddTo(dst,lhs, rhs, Scalar(1)); + scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template @@ -469,27 +589,7 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; - typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); - - typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, - Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; - - typedef internal::gemm_functor< - Scalar, Index, - internal::general_matrix_matrix_product< - Index, - LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), - RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, - ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; - - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); - internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> - (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit); + gemm_selector::run(dst, a_lhs, a_rhs, alpha); } }; From b55b5c7280a0481f01fe5ec764d55c443a8b6496 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Fri, 1 Feb 2019 15:23:53 -0800 Subject: [PATCH 194/295] Speed up row-major matrix-vector product on ARM The row-major matrix-vector multiplication code uses a threshold to check if processing 8 rows at a time would thrash the cache. This change introduces two modifications to this logic. 1. A smaller threshold for ARM and ARM64 devices. The value of this threshold was determined empirically using a Pixel2 phone, by benchmarking a large number of matrix-vector products in the range [1..4096]x[1..4096] and measuring performance separately on big and little cores with frequency pinning. On big (out-of-order) cores, this change has little to no impact. But on the small (in-order) cores, the matrix-vector products are up to 700% faster, especially on large matrices. The motivation for this change was some internal code at Google which was using hand-written NEON for implementing similar functionality, processing the matrix one row at a time, which exhibited substantially better performance than Eigen. With the current change, Eigen handily beats that code. 2. Make the logic for choosing number of simultaneous rows apply uniformly to 8, 4 and 2 rows instead of just 8 rows. Since the default threshold for non-ARM devices is essentially unchanged (32000 -> 32 * 1024), this change has no impact on non-ARM performance. This was verified by running the same set of benchmarks on a Xeon desktop. 
--- Eigen/src/Core/products/GeneralMatrixVector.h | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 767feb99d..e7dc25478 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -255,11 +255,20 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cj; conj_helper pcj; - // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, - // processing 8 rows at once might be counter productive wrt cache. - const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; - const Index n4 = rows-3; - const Index n2 = rows-1; + // TODO: fine tune the following heuristic. The rationale is that if the + // matrix is very large, processing multiple rows at once might be counter + // productive wrt cache. +#if EIGEN_ARCH_ARM_OR_ARM64 + // This threshold was empirically determined using a Pixel2. + // The little cores are a lot more sensitive to this number + // than the big cores. + const Index cache_threshold = 1024; +#else + const Index cache_threshold = 1024 * 256; +#endif + + const Index row_bytes = lhs.stride() * sizeof(LhsScalar); + const Index n8 = (8 * row_bytes > cache_threshold) ? 0 : (rows - 7); // TODO: for padded aligned inputs, we could enable aligned reads enum { LhsAlignment = Unaligned }; @@ -320,6 +329,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 3); for(; i(ResScalar(0)), @@ -355,6 +367,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 
0 : (rows - 1); for(; i(ResScalar(0)), From 871e2e5339476ae3f7efe63a0156507fd10c73d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 3 Feb 2019 08:54:47 +0100 Subject: [PATCH 195/295] bug #1674: disable GCC's unsafe-math-optimizations in sin/cos vectorization (results are completely wrong otherwise) --- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index ce3f0fc68..693dd55ea 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -319,6 +319,9 @@ inline float trig_reduce_huge (float xf, int *quadrant) template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +#if EIGEN_GNUC_AT_LEAST(4,4) +__attribute__((optimize("-fno-unsafe-math-optimizations"))) +#endif Packet psincos_float(const Packet& _x) { typedef typename unpacket_traits::integer_packet PacketI; From 690b2c45b1101e9661305e6a728ffe2279974fc6 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Feb 2019 10:29:15 -0800 Subject: [PATCH 196/295] Fix GeneralBlockPanelKernel Android compilation --- .../Core/products/GeneralBlockPanelKernel.h | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e55c2ade8..a70a06e57 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1089,8 +1089,21 @@ struct gebp_traits c = vfmaq_n_f32(c, a, b); } + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". 
+ + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 @@ -1145,11 +1158,25 @@ struct gebp_traits c = vfmaq_n_f64(c, a, b); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". 
+ + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f64 is implemented through a costly dup if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); From 6d0f6265a95757208d089b3c9705281b68fd66b4 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Feb 2019 10:30:25 -0800 Subject: [PATCH 197/295] Remove duplicated comment line --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index a70a06e57..fdd0ec0e9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1176,7 +1176,6 @@ struct gebp_traits { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // workaround gcc issue 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f64 is implemented through a costly dup if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); From eb21bab769b11546d08f7db0b5bb78bfde6cdbae Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Feb 2019 10:43:16 -0800 Subject: [PATCH 198/295] Parallelize tensor contraction only by sharding dimension and use 'thread-local' memory for packing --- .../src/Tensor/TensorContractionThreadPool.h | 223 ++++++++++++++++-- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 6 + 2 files changed, 212 insertions(+), 17 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index e06099957..4932514c7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -335,6 +335,47 @@ struct TensorEvaluator= device_.numThreadsInPool()) { + parallelize_by_sharding_dim_only_ = true; + + int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic[nn_]; + for (int i = 0; i < nn_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_packed_mem_ = device_.allocate(num_blocks * rhs_size); + mem = static_cast(thread_local_packed_mem_); + + thread_local_packed_rhs_.resize(num_blocks, nullptr); + for (Index i = 0; i < num_blocks; ++i) { + thread_local_packed_rhs_[i] = reinterpret_cast(mem); + mem += rhs_size; + } + } else { + can_use_thread_local_packed_ = new std::atomic[nm_]; + for (int i = 0; i < nm_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_packed_mem_ 
= device_.allocate(num_blocks * lhs_size); + mem = static_cast(thread_local_packed_mem_); + + thread_local_packed_lhs_.resize(num_blocks, nullptr); + for (Index i = 0; i < num_blocks; ++i) { + thread_local_packed_lhs_[i] = reinterpret_cast(mem); + mem += lhs_size; + } + } + } } ~Context() { @@ -343,6 +384,10 @@ struct TensorEvaluator packed_lhs_[P - 1]; std::vector packed_rhs_[P - 1]; + + // If there is enough concurrency in the sharding dimension, we choose not + // to paralellize by the other dimension, and execute all kernels in sync + // mode. This reduces parallelism from the nm_ x nn_ down to nn_ + // (shard_by_col==true) or nm_ (shard_by_col==false). + bool parallelize_by_sharding_dim_only_ = false; + + // If we choose to parallelize only by the sharding dimension, each thread + // will have it's own "thead local" (not a c++ thread local storage) memory + // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory + // can't be passed to a kernel that might execute on a different thread. + // + // In practice when we are ready to pack memory for the sharding dimension + // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice + // already computed (99% of the time), and we can pack data into the thread + // local storage, and guarantee that all the kernels will be executed + // immediately in the same thread. This significantly increases L1 cache hit + // ratio and reduces pressure on the memory bus. + // + // It's still possible that kernel for the K-th slice will be ready before + // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ + // and packed_rhs_ to allow kernels to be executed later on a thread + // different from the thread that was used for packing. + void* thread_local_packed_mem_; + + // Only one of these will beinitialized depending on shard_by_col value. 
+ std::vector thread_local_packed_lhs_; + std::vector thread_local_packed_rhs_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic* can_use_thread_local_packed_; + std::atomic** state_kernel_[P]; // state_switch_ is frequently modified by worker threads, while other // fields are read-only after constructor. Let's move it to a separate cache @@ -434,22 +515,96 @@ struct TensorEvaluator state_packing_ready_[P]; std::atomic state_switch_[P]; + LhsScalar* packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + + Index base_idx = gm_ * device_.currentThreadId(); + Index grain_idx = m1 - m * gm_; + Index block_idx = base_idx + grain_idx; + + return thread_local_packed_lhs_[block_idx]; + } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsScalar* packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + + Index base_idx = gn_ * device_.currentThreadId(); + Index grain_idx = n1 - n * gn_; + Index block_idx = base_idx + grain_idx; + + return thread_local_packed_rhs_[block_idx]; + } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). 
If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in followig slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, + std::memory_order_relaxed); + } + } + const Index mend = m * gm_ + gm(m); for (Index m1 = m * gm_; m1 < mend; m1++) - TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1], + TensorContractionKernel::packLhs(packed_lhs(m, k, m1, use_thread_local), lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); if (!parallel_pack_ && shard_by_col_) { + assert(!use_thread_local); signal_packing(k); } else { signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } } void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in followig slices along the k dimensions. 
+ eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, + std::memory_order_relaxed); + } + } + const Index nend = n * gn_ + gn(n); for (Index n1 = n * gn_; n1 < nend; n1++) { if (k == 0) { @@ -462,20 +617,24 @@ struct TensorEvaluator= 0; m--) signal_kernel(m, n, k, m == 0); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } } else { + assert(!use_thread_local); signal_packing(k); } } - void kernel(Index m, Index n, Index k) { + void kernel(Index m, Index n, Index k, bool use_thread_local) { // Note: order of iteration matters here. Iteration over m is innermost // because we want to reuse the same packed rhs in consecutive tasks // (rhs fits into L2$ while lhs only into L3$). @@ -486,8 +645,10 @@ struct TensorEvaluator* state = &state_kernel_[k % P][m][n]; Index s = state->load(); eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) return; + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } state->store(parallel_pack_ ? 
3 : 2, std::memory_order_relaxed); - if (sync) - kernel(m, n, k); - else - device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification( + [=]() { kernel(m, n, k, use_thread_local); }); + } } void signal_switch(Index k, Index v = 1) { @@ -589,7 +759,26 @@ struct TensorEvaluator 0 || device_.currentThreadId() < 0); + + if (pack_async) { + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, end, k, rhs); }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index e03735611..fb34cd75e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -122,6 +122,12 @@ struct ThreadPoolDevice { return num_threads_; } + // Number of theads available in the underlying thread pool. This number can + // be different from the value returned by numThreads(). 
+ EIGEN_STRONG_INLINE int numThreadsInPool() const { + return pool_->NumThreads(); + } + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); } From 8491127082e5f6568983255a459ca737271aaf3f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Feb 2019 12:59:33 -0800 Subject: [PATCH 199/295] Do not reduce parallelism too much in contractions with small number of threads --- .../src/Tensor/TensorContractionThreadPool.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 4932514c7..4af8d3b18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -339,10 +339,19 @@ struct TensorEvaluator= device_.numThreadsInPool()) { - parallelize_by_sharding_dim_only_ = true; - int num_worker_threads = device_.numThreadsInPool(); + const int num_worker_threads = device_.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. + const int oversharding_factor = + num_worker_threads <= 4 ? 8 : + num_worker_threads <= 8 ? 4 : + num_worker_threads <= 16 ? 2 : 1; + + if (!parallel_pack_ && + sharding_dim_tasks >= oversharding_factor * num_worker_threads) { + parallelize_by_sharding_dim_only_ = true; if (shard_by_col) { can_use_thread_local_packed_ = new std::atomic[nn_]; From b3c4344a6852e55c849976dd46ec4e861399bf16 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Feb 2019 15:21:35 +0100 Subject: [PATCH 200/295] bug #1676: workaround GCC's bug in c++17 mode. 
--- Eigen/src/Core/DenseBase.h | 4 ++-- Eigen/src/Core/util/ForwardDeclarations.h | 6 +----- bench/bench_gemm.cpp | 5 +++-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 65ec1f54b..406e66013 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() { */ template class DenseBase #ifndef EIGEN_PARSED_BY_DOXYGEN - : public DenseCoeffsBase + : public DenseCoeffsBase::value> #else : public DenseCoeffsBase #endif // not EIGEN_PARSED_BY_DOXYGEN @@ -71,7 +71,7 @@ template class DenseBase typedef Scalar value_type; typedef typename NumTraits::Real RealScalar; - typedef DenseCoeffsBase Base; + typedef DenseCoeffsBase::value> Base; using Base::derived; using Base::const_cast_derived; diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 5d86a51ac..050d15e96 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -47,11 +47,7 @@ template struct NumTraits; template struct EigenBase; template class DenseBase; template class PlainObjectBase; - - -template::value > -class DenseCoeffsBase; +template class DenseCoeffsBase; template() .lazyProduct( b.cast() ); if(!r.isApprox(c)) { - std::cout << r - c << "\n"; + std::cout << (r - c).norm() << "\n"; std::cerr << "Warning, your product is crap!\n\n"; } } From fa2fcb4895a4ae12cb28003e646c736d013e68e8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Feb 2019 16:07:08 +0100 Subject: [PATCH 201/295] Backed out changeset 4c0fa6ce0f81ce67dd6723528ddf72f66ae92ba2 --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 174 ++++-------------- 1 file changed, 37 insertions(+), 137 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 4bcccd326..f49abcad5 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h 
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -404,146 +404,26 @@ class gemm_blocking_space 1 || Dest::RowsAtCompileTime > 1), - bool MultipleColsAtCompileTime = - (Rhs::ColsAtCompileTime > 1 || Dest::ColsAtCompileTime > 1)> -struct gemm_selector { - typedef typename Product::Scalar Scalar; - - typedef internal::blas_traits LhsBlasTraits; - typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef typename internal::remove_all::type ActualLhsTypeCleaned; - - typedef internal::blas_traits RhsBlasTraits; - typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; - - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) - { - if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { - gemm_selector::run(dst, a_lhs, a_rhs, alpha); - } else if (a_rhs.cols() == 1) { - // matrix * vector. - internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs, a_rhs.col(0), dst, alpha); - } else { - // vector * matrix. - internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs.row(0), a_rhs, dst, alpha); - } - } -}; - -template -struct gemm_selector { - typedef typename Product::Scalar Scalar; - - typedef internal::blas_traits LhsBlasTraits; - typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef typename internal::remove_all::type ActualLhsTypeCleaned; - - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) - { - if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { - gemm_selector::run(dst, a_lhs, a_rhs, alpha); - } else { - // matrix * vector. 
- internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs, a_rhs.col(0), dst, alpha); - } - } -}; - -template -struct gemm_selector { - typedef typename Product::Scalar Scalar; - - typedef internal::blas_traits RhsBlasTraits; - typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; - - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) - { - if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { - gemm_selector::run(dst, a_lhs, a_rhs, alpha); - } else { - // vector * matrix. - internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs.row(0), a_rhs, dst, alpha); - } - } -}; - -template -struct gemm_selector { - typedef typename Product::Scalar Scalar; - typedef typename Lhs::Scalar LhsScalar; - typedef typename Rhs::Scalar RhsScalar; - - typedef internal::blas_traits LhsBlasTraits; - typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef - typename internal::remove_all::type ActualLhsTypeCleaned; - - typedef internal::blas_traits RhsBlasTraits; - typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef - typename internal::remove_all::type ActualRhsTypeCleaned; - - enum { - MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED( - Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) - }; - - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, - const Scalar& alpha) { - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * - RhsBlasTraits::extractScalarFactor(a_rhs); - typename internal::add_const_on_value_type::type lhs = - LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = - RhsBlasTraits::extract(a_rhs); - typedef internal::gemm_blocking_space< - (Dest::Flags & RowMajorBit) ? 
RowMajor : ColMajor, LhsScalar, RhsScalar, - Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime, - MaxDepthAtCompileTime> - BlockingType; - - typedef internal::gemm_functor< - Scalar, Index, - internal::general_matrix_matrix_product< - Index, LhsScalar, - (ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, - bool(LhsBlasTraits::NeedToConjugate), RhsScalar, - (ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, - bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor>, - ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> - GemmFunctor; - - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); - internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 || - Dest::MaxRowsAtCompileTime == Dynamic)>( - GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), - a_rhs.cols(), a_lhs.cols(), Dest::Flags & RowMajorBit); - } -}; - template struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all::type ActualLhsTypeCleaned; + + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all::type ActualRhsTypeCleaned; + + enum { + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + }; + typedef generic_product_impl lazyproduct; template @@ -570,7 +450,7 @@ struct generic_product_impl if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else - scaleAndAddTo(dst, lhs, rhs, Scalar(1)); + scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template @@ -589,7 +469,27 @@ struct generic_product_impl 
if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; - gemm_selector::run(dst, a_lhs, a_rhs, alpha); + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, + Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, + LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> + (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit); } }; From 013cc3a6b39c5962a3261a063d2a4ab4810cb757 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Feb 2019 16:24:09 +0100 Subject: [PATCH 202/295] Make GEMM fallback to GEMV for runtime vectors. 
This is a more general and simpler version of changeset 4c0fa6ce0f81ce67dd6723528ddf72f66ae92ba2 --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index f49abcad5..90c9c4647 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -469,6 +469,20 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; + // Fallback to GEMV if either the lhs or rhs is a runtime vector + if (dst.cols() == 1) + { + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); From 59998117bb0e4e0dc4b37b062f02ea5e6aab711e Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 7 Feb 2019 09:21:25 -0800 Subject: [PATCH 203/295] Don't do parallel_pack if we can use thread_local memory in tensor contractions --- .../src/Tensor/TensorContractionThreadPool.h | 55 ++++++++++--------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 4af8d3b18..d7cd995fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -208,6 +208,23 @@ struct TensorEvaluatorm_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism 
too much. + const int oversharding_factor = + num_worker_threads <= 4 ? 8 : + num_worker_threads <= 8 ? 4 : + num_worker_threads <= 16 ? 2 : 1; + + const bool parallelize_by_sharding_dim_only = + sharding_dim_tasks >= oversharding_factor * num_worker_threads; + // Last by not least, decide whether we want to issue both lhs and rhs // packing in parallel; or issue lhs packing first, and then issue rhs // packing when lhs packing completes (for !shard_by_col lhs and rhs are @@ -223,10 +240,13 @@ struct TensorEvaluatorm_device), lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides, @@ -275,6 +295,7 @@ struct TensorEvaluator= oversharding_factor * num_worker_threads) { - parallelize_by_sharding_dim_only_ = true; + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); if (shard_by_col) { can_use_thread_local_packed_ = new std::atomic[nn_]; @@ -422,6 +432,7 @@ struct TensorEvaluator packed_lhs_[P - 1]; std::vector packed_rhs_[P - 1]; - // If there is enough concurrency in the sharding dimension, we choose not - // to paralellize by the other dimension, and execute all kernels in sync - // mode. This reduces parallelism from the nm_ x nn_ down to nn_ - // (shard_by_col==true) or nm_ (shard_by_col==false). - bool parallelize_by_sharding_dim_only_ = false; - // If we choose to parallelize only by the sharding dimension, each thread // will have it's own "thead local" (not a c++ thread local storage) memory // for packed_lhs or packed_rhs (shard_by_col = false of true). 
This memory From 953ca5ba2f007650944017bff423582afeaf0696 Mon Sep 17 00:00:00 2001 From: Steven Peters Date: Fri, 8 Feb 2019 06:23:24 +0000 Subject: [PATCH 204/295] Spline.h: fix spelling "spang" -> "span" --- unsupported/Eigen/src/Splines/Spline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h index c1cf5b7e4..79edd52ce 100644 --- a/unsupported/Eigen/src/Splines/Spline.h +++ b/unsupported/Eigen/src/Splines/Spline.h @@ -191,7 +191,7 @@ namespace Eigen DenseIndex span(Scalar u) const; /** - * \brief Computes the spang within the provided knot vector in which u is falling. + * \brief Computes the span within the provided knot vector in which u is falling. **/ static DenseIndex Span(typename SplineTraits::Scalar u, DenseIndex degree, const typename SplineTraits::KnotVectorType& knots); From 1e36166ed1cd9a2e6fd5a946e2ec418406963a1a Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 8 Feb 2019 15:13:24 -0800 Subject: [PATCH 205/295] Optimize TensorConversion evaluator: do not convert same type --- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 99 +++++++++++++------ 1 file changed, 69 insertions(+), 30 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 1f613d3c7..27c9d4a20 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,7 +32,7 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; - typedef typename TypeConversion::PointerType>::type PointerType; + typedef typename TypeConversion::PointerType>::type PointerType; }; template @@ -177,6 +177,68 @@ template struct ConversionSubExprEval +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index 
index) { + internal::scalar_cast_op converter; + return converter(impl.coeff(index)); + } +}; + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { + return impl.coeff(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + TargetPacket rslt = internal::pload(values); + return rslt; + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, SrcPacket, TargetPacket, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet(index); + } +}; + +template +struct PacketConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + return impl.template packet(index); + } +}; + +} // namespace internal // Eval as rvalue template @@ -191,6 +253,7 @@ struct TensorEvaluator, Device> typedef typename PacketType::type PacketReturnType; typedef typename PacketType::type PacketSourceType; static const int PacketSize = PacketType::size; + static const bool IsSameType = internal::is_same::value; enum { IsAligned 
= false, @@ -210,7 +273,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); + return ConversionSubExprEval, Scalar>::run(m_impl, data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() @@ -220,8 +283,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - internal::scalar_cast_op converter; - return converter(m_impl.coeff(index)); + return internal::CoeffConv::run(m_impl,index); } template @@ -229,7 +291,8 @@ struct TensorEvaluator, Device> { const bool Vectorizable = TensorEvaluator::PacketAccess & internal::type_casting_traits::VectorizedCast; - return PacketConv::run(m_impl, index); + return internal::PacketConv::run(m_impl, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -252,31 +315,7 @@ struct TensorEvaluator, Device> /// required by sycl in order to extract the sycl accessor const TensorEvaluator& impl() const { return m_impl; } - protected: - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - }; - - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; - PacketConverter, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet(index); - } - }; - + protected: 
TensorEvaluator m_impl; }; From 21eb97d3e07ca3e314f36c1511a3669d7a2f1ed3 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 8 Feb 2019 15:47:25 -0800 Subject: [PATCH 206/295] Add PacketConv implementation for non-vectorizable src expressions --- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 27c9d4a20..938fd0f34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -230,8 +230,21 @@ struct PacketConv { } }; -template -struct PacketConv { +template +struct PacketConv { + typedef typename internal::unpacket_traits::type TargetType; + static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i); + return internal::pload(values); + } +}; + +template +struct PacketConv { template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { return impl.template packet(index); @@ -287,10 +300,17 @@ struct TensorEvaluator, Device> } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const bool Vectorizable = TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType + packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = + IsSameType + ? 
TensorEvaluator::PacketAccess + : TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast; + return internal::PacketConv::run(m_impl, index); } From ab6e6edc328368889c265a82cbb62f00be1e86ff Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Feb 2019 13:58:24 +0100 Subject: [PATCH 207/295] Speedup PartialPivLU for small matrices by passing compile-time sizes when available. This change set also makes a better use of Map<>+OuterStride and Ref<> yielding surprising speed up for small dynamic sizes as well. The table below reports times in micro seconds for 10 random matrices: | ------ float --------- | ------- double ------- | size | before after ratio | before after ratio | fixed 1 | 0.34 0.11 2.93 | 0.35 0.11 3.06 | fixed 2 | 0.81 0.24 3.38 | 0.91 0.25 3.60 | fixed 3 | 1.49 0.49 3.04 | 1.68 0.55 3.01 | fixed 4 | 2.31 0.70 3.28 | 2.45 1.08 2.27 | fixed 5 | 3.49 1.11 3.13 | 3.84 2.24 1.71 | fixed 6 | 4.76 1.64 2.88 | 4.87 2.84 1.71 | dyn 1 | 0.50 0.40 1.23 | 0.51 0.40 1.26 | dyn 2 | 1.08 0.85 1.27 | 1.04 0.69 1.49 | dyn 3 | 1.76 1.26 1.40 | 1.84 1.14 1.60 | dyn 4 | 2.57 1.75 1.46 | 2.67 1.66 1.60 | dyn 5 | 3.80 2.64 1.43 | 4.00 2.48 1.61 | dyn 6 | 5.06 3.43 1.47 | 5.15 3.21 1.60 | --- Eigen/src/LU/PartialPivLU.h | 39 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index b414b5c46..94c30616a 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -331,17 +331,15 @@ PartialPivLU::PartialPivLU(EigenBase& matrix) namespace internal { /** \internal This is the blocked version of fullpivlu_unblocked() */ -template +template struct partial_lu_impl { - // FIXME add a stride to Map, so that the following mapping becomes easier, - // another option would be to create an expression being able to automatically - // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly - // a Map + 
stride, why not adding a stride to Map, and convenient ctors from a Matrix, - // and Block. - typedef Map > MapLU; - typedef Block MatrixType; - typedef Block BlockType; + static const int UnBlockedBound = 16; + static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; + static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; + typedef Matrix MatrixType; + typedef Ref MatrixTypeRef; + typedef Ref > BlockType; typedef typename MatrixType::RealScalar RealScalar; /** \internal performs the LU decomposition in-place of the matrix \a lu @@ -354,7 +352,7 @@ struct partial_lu_impl * * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. */ - static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) + static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { typedef scalar_score_coeff_op Scoring; typedef typename Scoring::result_type Score; @@ -417,13 +415,12 @@ struct partial_lu_impl */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { - MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); - MatrixType lu(lu1,0,0,rows,cols); + MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride)); const Index size = (std::min)(rows,cols); // if the matrix is too small, no blocking: - if(size<=16) + if(UnBlockedAtCompileTime || size<=UnBlockedBound) { return unblocked_lu(lu, row_transpositions, nb_transpositions); } @@ -449,12 +446,12 @@ struct partial_lu_impl // A00 | A01 | A02 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // A20 | A21 | A22 - BlockType A_0(lu,0,0,rows,k); - BlockType A_2(lu,0,k+bs,rows,tsize); - BlockType A11(lu,k,k,bs,bs); - BlockType A12(lu,k,k+bs,bs,tsize); - 
BlockType A21(lu,k+bs,k,trows,bs); - BlockType A22(lu,k+bs,k+bs,trows,tsize); + BlockType A_0 = lu.block(0,0,rows,k); + BlockType A_2 = lu.block(0,k+bs,rows,tsize); + BlockType A11 = lu.block(k,k,bs,bs); + BlockType A12 = lu.block(k,k+bs,bs,tsize); + BlockType A21 = lu.block(k+bs,k,trows,bs); + BlockType A22 = lu.block(k+bs,k+bs,trows,tsize); PivIndex nb_transpositions_in_panel; // recursively call the blocked LU algorithm on [A11^T A21^T]^T @@ -497,7 +494,9 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl - + < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, + typename TranspositionType::StorageIndex, + EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)> ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } From dada863d2388079a91c413a109a0317a5814d2e7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Feb 2019 17:56:20 +0100 Subject: [PATCH 208/295] Enable unit tests of PartialPivLU on fixed size matrices, and increase tested matrix size (blocking was not tested!) 
--- test/lu.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/lu.cpp b/test/lu.cpp index bb6ae124b..1bbadcbf0 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -110,7 +110,7 @@ template void lu_non_invertible() template void lu_invertible() { /* this test covers the following files: - LU.h + FullPivLU.h */ typedef typename NumTraits::Real RealScalar; Index size = MatrixType::RowsAtCompileTime; @@ -152,13 +152,12 @@ template void lu_invertible() VERIFY_IS_APPROX(lu.solve(m3*m4), lu.solve(m3)*m4); } -template void lu_partial_piv() +template void lu_partial_piv(Index size = MatrixType::ColsAtCompileTime) { /* this test covers the following files: PartialPivLU.h */ typedef typename NumTraits::Real RealScalar; - Index size = internal::random(1,4); MatrixType m1(size, size), m2(size, size), m3(size, size); m1.setRandom(); @@ -218,9 +217,13 @@ EIGEN_DECLARE_TEST(lu) CALL_SUBTEST_1( lu_non_invertible() ); CALL_SUBTEST_1( lu_invertible() ); CALL_SUBTEST_1( lu_verify_assert() ); + CALL_SUBTEST_1( lu_partial_piv() ); CALL_SUBTEST_2( (lu_non_invertible >()) ); CALL_SUBTEST_2( (lu_verify_assert >()) ); + CALL_SUBTEST_2( lu_partial_piv() ); + CALL_SUBTEST_2( lu_partial_piv() ); + CALL_SUBTEST_2( (lu_partial_piv >()) ); CALL_SUBTEST_3( lu_non_invertible() ); CALL_SUBTEST_3( lu_invertible() ); @@ -228,7 +231,7 @@ EIGEN_DECLARE_TEST(lu) CALL_SUBTEST_4( lu_non_invertible() ); CALL_SUBTEST_4( lu_invertible() ); - CALL_SUBTEST_4( lu_partial_piv() ); + CALL_SUBTEST_4( lu_partial_piv(internal::random(1,EIGEN_TEST_MAX_SIZE)) ); CALL_SUBTEST_4( lu_verify_assert() ); CALL_SUBTEST_5( lu_non_invertible() ); @@ -237,7 +240,7 @@ EIGEN_DECLARE_TEST(lu) CALL_SUBTEST_6( lu_non_invertible() ); CALL_SUBTEST_6( lu_invertible() ); - CALL_SUBTEST_6( lu_partial_piv() ); + CALL_SUBTEST_6( lu_partial_piv(internal::random(1,EIGEN_TEST_MAX_SIZE)) ); CALL_SUBTEST_6( lu_verify_assert() ); CALL_SUBTEST_7(( lu_non_invertible >() )); From 
eb46f34a8caff181eb0a25e47eda214ede884b1c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Feb 2019 17:59:35 +0100 Subject: [PATCH 209/295] Speed up 2x2 LU by a factor 2, and other small fixed sizes by about 10%. Not sure that's so critical, but this does not complexify the code base much. --- Eigen/src/LU/PartialPivLU.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 94c30616a..12e72880d 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -337,6 +337,9 @@ struct partial_lu_impl static const int UnBlockedBound = 16; static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; + // Remaining rows and columns at compile-time: + static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; + static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic; typedef Matrix MatrixType; typedef Ref MatrixTypeRef; typedef Ref > BlockType; @@ -359,9 +362,12 @@ struct partial_lu_impl const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); + // For small compile-time matrices it is worth processing the last row separately: + // speedup: +100% for 2x2, +10% for others. + const Index endk = UnBlockedAtCompileTime ? size-1 : size; nb_transpositions = 0; Index first_zero_pivot = -1; - for(Index k = 0; k < size; ++k) + for(Index k = 0; k < endk; ++k) { Index rrows = rows-k-1; Index rcols = cols-k-1; @@ -383,7 +389,7 @@ struct partial_lu_impl // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k) // overflow but not the actual quotient? 
- lu.col(k).tail(rrows) /= lu.coeff(k,k); + lu.col(k).tail(fix(rrows)) /= lu.coeff(k,k); } else if(first_zero_pivot==-1) { @@ -393,8 +399,18 @@ struct partial_lu_impl } if(k(rrows),fix(rcols)).noalias() -= lu.col(k).tail(fix(rrows)) * lu.row(k).tail(fix(rcols)); } + + // special handling of the last entry + if(UnBlockedAtCompileTime) + { + Index k = endk; + row_transpositions[k] = PivIndex(k); + if (std::abs(lu(k, k)) == 0 && first_zero_pivot == -1) + first_zero_pivot = k; + } + return first_zero_pivot; } From 2edfc6807d84c3eff5abfeca809eca676c29ed7a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Feb 2019 18:24:07 +0100 Subject: [PATCH 210/295] Fix compilation of empty products of the form: Mx0 * 0xN --- Eigen/src/Core/GeneralProduct.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 43f3b84c8..5bfcd6db8 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -239,7 +239,7 @@ template<> struct gemv_dense_selector // on, the other hand it is good for the cache to pack the vector anyways... EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), - MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal + MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0) }; typedef const_blas_data_mapper LhsMapper; @@ -326,7 +326,7 @@ template<> struct gemv_dense_selector enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 // on, the other hand it is good for the cache to pack the vector anyways... 
- DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 + DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0 }; gemv_static_vector_if static_rhs; From bdcb5f33043c559c7100f8fd5eb55fbbd0cdfc69 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 11 Feb 2019 22:56:19 +0100 Subject: [PATCH 211/295] Let's properly use Score instead of std::abs, and remove deprecated FIXME ( a /= b does a/b and not a * (1/b) as it was a long time ago...) --- Eigen/src/LU/PartialPivLU.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 12e72880d..cba7a0fcf 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -387,8 +387,6 @@ struct partial_lu_impl ++nb_transpositions; } - // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k) - // overflow but not the actual quotient? lu.col(k).tail(fix(rrows)) /= lu.coeff(k,k); } else if(first_zero_pivot==-1) @@ -407,7 +405,7 @@ struct partial_lu_impl { Index k = endk; row_transpositions[k] = PivIndex(k); - if (std::abs(lu(k, k)) == 0 && first_zero_pivot == -1) + if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1) first_zero_pivot = k; } From 8c2f30c7900b8e91df6d044431f4b5cc667993c3 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 13 Feb 2019 10:20:53 -0800 Subject: [PATCH 212/295] Speedup Tensor ThreadPool RunQueue::Empty() --- unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index 73928c1d4..993d9e066 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -148,32 +148,46 @@ class RunQueue { return n; } - // Size returns current queue size.
+ // Size returns current queue size; if NeedSizeEstimate is false, only whether + // the size is 0 is guaranteed to be correct. // Can be called by any thread at any time. - unsigned Size() const { + template + unsigned SizeOrNotEmpty() const { // Emptiness plays critical role in thread pool blocking. So we go to great // effort to not produce false positives (claim non-empty queue as empty). + unsigned front = front_.load(std::memory_order_acquire); for (;;) { // Capture a consistent snapshot of front/tail. - unsigned front = front_.load(std::memory_order_acquire); unsigned back = back_.load(std::memory_order_acquire); unsigned front1 = front_.load(std::memory_order_relaxed); - if (front != front1) continue; - int size = (front & kMask2) - (back & kMask2); - // Fix overflow. - if (size < 0) size += 2 * kSize; - // Order of modification in push/pop is crafted to make the queue look - // larger than it is during concurrent modifications. E.g. pop can - // decrement size before the corresponding push has incremented it. - // So the computed size can be up to kSize + 1, fix it. - if (size > static_cast(kSize)) size = kSize; - return size; + if (front != front1) { + front = front1; + std::atomic_thread_fence(std::memory_order_acquire); + continue; + } + if (NeedSizeEstimate) { + int size = (front & kMask2) - (back & kMask2); + // Fix overflow. + if (size < 0) size += 2 * kSize; + // Order of modification in push/pop is crafted to make the queue look + // larger than it is during concurrent modifications. E.g. pop can + // decrement size before the corresponding push has incremented it. + // So the computed size can be up to kSize + 1, fix it. + if (size > kSize) size = kSize; + return size; + } else { + return ((front ^ back) & kMask2); + } } } + // Size returns current queue size. + // Can be called by any thread at any time. + unsigned Size() const { return SizeOrNotEmpty(); } + // Empty tests whether container is empty. // Can be called by any thread at any time. 
- bool Empty() const { return Size() == 0; } + bool Empty() const { return SizeOrNotEmpty() == 0; } // Delete all the elements from the queue. void Flush() { From 106ba7bb1a042f7d02bafad7c820da6955d66e09 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 14 Feb 2019 09:51:51 -0800 Subject: [PATCH 213/295] Do not generate no-op cast() and conjugate() expressions --- .../Eigen/CXX11/src/Tensor/TensorBase.h | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 7d9afa685..dd008fe05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -244,9 +244,11 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const typename internal::conditional::IsComplex, + TensorCwiseUnaryOp, const Derived>, + Derived>::type conjugate() const { - return unaryExpr(internal::scalar_conjugate_op()); + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); } EIGEN_DEVICE_FUNC @@ -339,10 +341,13 @@ class TensorBase return cwiseMin(constant(threshold)); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorConversionOp + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional::value, + Derived, + TensorConversionOp >::type cast() const { - return TensorConversionOp(derived()); + return choose(Cond::value>(), derived(), TensorConversionOp(derived())); } EIGEN_DEVICE_FUNC @@ -628,26 +633,26 @@ class TensorBase } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > + const TensorReductionOp::value, Derived, TensorConversionOp >::type > all(const Dims& dims) const { return cast().reduce(dims, internal::AndReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > + const 
TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > all() const { DimensionList in_dims; return cast().reduce(in_dims, internal::AndReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > + const TensorReductionOp::value, Derived, TensorConversionOp >::type > any(const Dims& dims) const { return cast().reduce(dims, internal::OrReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > + const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > any() const { DimensionList in_dims; return cast().reduce(in_dims, internal::OrReducer()); From f0d42d22656486812c1621b2426256765e1924fb Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 14 Feb 2019 10:27:28 -0800 Subject: [PATCH 214/295] Fix signed-unsigned comparison warning in RunQueue --- .../Eigen/CXX11/src/ThreadPool/RunQueue.h | 74 ++++++++++--------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index 993d9e066..7a3be25e7 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -148,39 +148,6 @@ class RunQueue { return n; } - // Size returns current queue size; if NeedSizeEstimate is false, only whether - // the size is 0 is guaranteed to be correct. - // Can be called by any thread at any time. - template - unsigned SizeOrNotEmpty() const { - // Emptiness plays critical role in thread pool blocking. So we go to great - // effort to not produce false positives (claim non-empty queue as empty). - unsigned front = front_.load(std::memory_order_acquire); - for (;;) { - // Capture a consistent snapshot of front/tail. 
- unsigned back = back_.load(std::memory_order_acquire); - unsigned front1 = front_.load(std::memory_order_relaxed); - if (front != front1) { - front = front1; - std::atomic_thread_fence(std::memory_order_acquire); - continue; - } - if (NeedSizeEstimate) { - int size = (front & kMask2) - (back & kMask2); - // Fix overflow. - if (size < 0) size += 2 * kSize; - // Order of modification in push/pop is crafted to make the queue look - // larger than it is during concurrent modifications. E.g. pop can - // decrement size before the corresponding push has incremented it. - // So the computed size can be up to kSize + 1, fix it. - if (size > kSize) size = kSize; - return size; - } else { - return ((front ^ back) & kMask2); - } - } - } - // Size returns current queue size. // Can be called by any thread at any time. unsigned Size() const { return SizeOrNotEmpty(); } @@ -220,6 +187,47 @@ class RunQueue { std::atomic back_; Elem array_[kSize]; + // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, + // only whether the size is 0 is guaranteed to be correct. + // Can be called by any thread at any time. + template + unsigned SizeOrNotEmpty() const { + // Emptiness plays critical role in thread pool blocking. So we go to great + // effort to not produce false positives (claim non-empty queue as empty). + unsigned front = front_.load(std::memory_order_acquire); + for (;;) { + // Capture a consistent snapshot of front/tail. + unsigned back = back_.load(std::memory_order_acquire); + unsigned front1 = front_.load(std::memory_order_relaxed); + if (front != front1) { + front = front1; + std::atomic_thread_fence(std::memory_order_acquire); + continue; + } + if (NeedSizeEstimate) { + return CalculateSize(front, back); + } else { + // This value will be 0 if the queue is empty, and undefined otherwise. + int maybe_zero = ((front ^ back) & kMask2); + eigen_assert(maybe_zero == 0 ? 
CalculateSize(front, back) == 0 : true); + return maybe_zero; + } + } + } + + EIGEN_ALWAYS_INLINE + unsigned CalculateSize(unsigned front, unsigned back) const { + int size = (front & kMask2) - (back & kMask2); + // Fix overflow. + if (size < 0) size += 2 * kSize; + // Order of modification in push/pop is crafted to make the queue look + // larger than it is during concurrent modifications. E.g. push can + // increment size before the corresponding pop has decremented it. + // So the computed size can be up to kSize + 1, fix it. + if (size > static_cast(kSize)) size = kSize; + return size; + } + RunQueue(const RunQueue&) = delete; void operator=(const RunQueue&) = delete; }; From 7b837559a76171a6cd2b9341fdfaec75f8aaf6c3 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 14 Feb 2019 10:40:21 -0800 Subject: [PATCH 215/295] Fix signed-unsigned return in RuqQueue --- unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index 7a3be25e7..ecdc35f81 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -208,7 +208,7 @@ class RunQueue { return CalculateSize(front, back); } else { // This value will be 0 if the queue is empty, and undefined otherwise. - int maybe_zero = ((front ^ back) & kMask2); + unsigned maybe_zero = ((front ^ back) & kMask2); eigen_assert(maybe_zero == 0 ? CalculateSize(front, back) == 0 : true); return maybe_zero; } @@ -225,7 +225,7 @@ class RunQueue { // increment size before the corresponding pop has decremented it. // So the computed size can be up to kSize + 1, fix it. 
if (size > static_cast(kSize)) size = kSize; - return size; + return static_cast(size); } RunQueue(const RunQueue&) = delete; From 65e23ca7e93b6836cb197adcb1e832ae94203b35 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Feb 2019 13:46:13 -0800 Subject: [PATCH 216/295] Revert https://bitbucket.org/eigen/eigen/commits/b55b5c7280a0481f01fe5ec764d55c443a8b6496 . --- Eigen/src/Core/products/GeneralMatrixVector.h | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index e7dc25478..767feb99d 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -255,20 +255,11 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cj; conj_helper pcj; - // TODO: fine tune the following heuristic. The rationale is that if the - // matrix is very large, processing multiple rows at once might be counter - // productive wrt cache. -#if EIGEN_ARCH_ARM_OR_ARM64 - // This threshold was empirically determined using a Pixel2. - // The little cores are a lot more sensitive to this number - // than the big cores. - const Index cache_threshold = 1024; -#else - const Index cache_threshold = 1024 * 256; -#endif - - const Index row_bytes = lhs.stride() * sizeof(LhsScalar); - const Index n8 = (8 * row_bytes > cache_threshold) ? 0 : (rows - 7); + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. + const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; + const Index n4 = rows-3; + const Index n2 = rows-1; // TODO: for padded aligned inputs, we could enable aligned reads enum { LhsAlignment = Unaligned }; @@ -329,9 +320,6 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 
0 : (rows - 3); for(; i(ResScalar(0)), @@ -367,9 +355,6 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 1); for(; i(ResScalar(0)), From f2970819a26bcc5370c88838c740d507583d9184 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Feb 2019 09:39:25 +0100 Subject: [PATCH 217/295] bug #1679: avoid possible division by 0 in complex-schur --- Eigen/src/Eigenvalues/ComplexSchur.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index b8b3490c6..fc71468f8 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -300,10 +300,13 @@ typename ComplexSchur::ComplexScalar ComplexSchur::compu ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1); ComplexScalar eival1 = (trace + disc) / RealScalar(2); ComplexScalar eival2 = (trace - disc) / RealScalar(2); - - if(numext::norm1(eival1) > numext::norm1(eival2)) + RealScalar eival1_norm = numext::norm1(eival1); + RealScalar eival2_norm = numext::norm1(eival2); + // A division by zero can only occur if eival1==eival2==0. 
+ // In this case, det==0, and all we have to do is checking that eival2_norm!=0 + if(eival1_norm > eival2_norm) eival2 = det / eival1; - else + else if(eival2_norm!=RealScalar(0)) eival1 = det / eival2; // choose the eigenvalue closest to the bottom entry of the diagonal From d85ae650bf0baa55956e11489888f62b60530e70 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Feb 2019 10:24:17 +0100 Subject: [PATCH 218/295] bug #1678: workaround MSVC compilation issues with AVX512 --- Eigen/src/Core/arch/AVX512/PacketMath.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 2199970ad..3842f576b 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -141,7 +141,7 @@ EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { } template <> EIGEN_STRONG_INLINE Packet8d pload1(const double* from) { - return _mm512_broadcastsd_pd(_mm_load_pd1(from)); + return _mm512_set1_pd(*from); } template <> @@ -167,6 +167,11 @@ EIGEN_STRONG_INLINE Packet8d padd(const Packet8d& a, const Packet8d& b) { return _mm512_add_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i padd(const Packet16i& a, + const Packet16i& b) { + return _mm512_add_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f psub(const Packet16f& a, @@ -178,6 +183,11 @@ EIGEN_STRONG_INLINE Packet8d psub(const Packet8d& a, const Packet8d& b) { return _mm512_sub_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, + const Packet16i& b) { + return _mm512_sub_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { @@ -211,6 +221,11 @@ EIGEN_STRONG_INLINE Packet8d pmul(const Packet8d& a, const Packet8d& b) { return _mm512_mul_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i pmul(const Packet16i& a, + const Packet16i& b) { + return _mm512_mul_epi32(a, b); +} template <> 
EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, @@ -522,10 +537,8 @@ EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { // {a0, a0 a0, a0, a1, a1, a1, a1} template <> EIGEN_STRONG_INLINE Packet8d ploadquad(const double* from) { - __m128d tmp0 = _mm_load_pd1(from); - __m256d lane0 = _mm256_broadcastsd_pd(tmp0); - __m128d tmp1 = _mm_load_pd1(from + 1); - __m256d lane1 = _mm256_broadcastsd_pd(tmp1); + __m256d lane0 = _mm256_set1_pd(*from); + __m256d lane1 = _mm256_set1_pd(*(from+1)); __m512d tmp = _mm512_undefined_pd(); tmp = _mm512_insertf64x4(tmp, lane0, 0); return _mm512_insertf64x4(tmp, lane1, 1); From 559320745e87984fd863b535932105177e5ad795 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Feb 2019 10:30:28 +0100 Subject: [PATCH 219/295] bug #1678: Fix lack of __FMA__ macro on MSVC with AVX512 --- Eigen/src/Core/util/ConfigureVectorization.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 68765d4b2..b00d8b038 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -261,7 +261,7 @@ #define EIGEN_VECTORIZE_FMA #endif #if defined(__AVX512F__) - #ifndef __FMA__ + #ifndef EIGEN_VECTORIZE_FMA #if EIGEN_COMP_GNUC #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). 
#else From 0505248f250f0d1665d51e2e8f0775531ce40382 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Feb 2019 16:33:56 +0100 Subject: [PATCH 220/295] bug #1680: make all "block" methods strong-inline and device-functions (some were missing EIGEN_DEVICE_FUNC) --- Eigen/src/plugins/BlockMethods.h | 337 ++++++++++++++++--------------- 1 file changed, 175 insertions(+), 162 deletions(-) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index ef620ab7a..935a604b6 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -87,11 +87,11 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block, fix, fix(int) /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline typename FixedBlockXpr<...,...>::Type +typename FixedBlockXpr<...,...>::Type #endif block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) { @@ -101,11 +101,11 @@ block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) /// This is the const version of block(Index,Index,NRowsType,NColsType) template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline const typename ConstFixedBlockXpr<...,...>::Type +const typename ConstFixedBlockXpr<...,...>::Type #endif block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const { @@ -133,11 +133,11 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN 
-inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline typename FixedBlockXpr<...,...>::Type +typename FixedBlockXpr<...,...>::Type #endif topRightCorner(NRowsType cRows, NColsType cCols) { @@ -147,11 +147,11 @@ topRightCorner(NRowsType cRows, NColsType cCols) /// This is the const version of topRightCorner(NRowsType, NColsType). template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline const typename ConstFixedBlockXpr<...,...>::Type +const typename ConstFixedBlockXpr<...,...>::Type #endif topRightCorner(NRowsType cRows, NColsType cCols) const { @@ -172,16 +172,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block, block(Index,Index) /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner() { return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); } /// This is the const version of topRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); } @@ -206,14 +206,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } /// This is the const version of topRightCorner(Index, Index). 
template -inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } @@ -238,11 +240,11 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline typename FixedBlockXpr<...,...>::Type +typename FixedBlockXpr<...,...>::Type #endif topLeftCorner(NRowsType cRows, NColsType cCols) { @@ -252,11 +254,11 @@ topLeftCorner(NRowsType cRows, NColsType cCols) /// This is the const version of topLeftCorner(Index, Index). template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline const typename ConstFixedBlockXpr<...,...>::Type +const typename ConstFixedBlockXpr<...,...>::Type #endif topLeftCorner(NRowsType cRows, NColsType cCols) const { @@ -276,16 +278,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner() { return typename FixedBlockXpr::Type(derived(), 0, 0); } /// This is the const version of topLeftCorner(). 
template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0); } @@ -310,14 +312,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } /// This is the const version of topLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } @@ -342,11 +346,11 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline typename FixedBlockXpr<...,...>::Type +typename FixedBlockXpr<...,...>::Type #endif bottomRightCorner(NRowsType cRows, NColsType cCols) { @@ -357,11 +361,11 @@ bottomRightCorner(NRowsType cRows, NColsType cCols) /// This is the const version of bottomRightCorner(NRowsType, NColsType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline const typename ConstFixedBlockXpr<...,...>::Type +const typename ConstFixedBlockXpr<...,...>::Type #endif bottomRightCorner(NRowsType cRows, NColsType cCols) const { @@ -382,16 +386,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } /// This is the const version of bottomRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } @@ -416,14 +420,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /// This is the const version of bottomRightCorner(Index, Index). 
template -inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } @@ -448,11 +454,11 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline typename FixedBlockXpr<...,...>::Type +typename FixedBlockXpr<...,...>::Type #endif bottomLeftCorner(NRowsType cRows, NColsType cCols) { @@ -463,11 +469,11 @@ bottomLeftCorner(NRowsType cRows, NColsType cCols) /// This is the const version of bottomLeftCorner(NRowsType, NColsType). template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else -inline typename ConstFixedBlockXpr<...,...>::Type +typename ConstFixedBlockXpr<...,...>::Type #endif bottomLeftCorner(NRowsType cRows, NColsType cCols) const { @@ -488,16 +494,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); } /// This is the const version of bottomLeftCorner(). 
template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); } @@ -522,14 +528,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) +EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } /// This is the const version of bottomLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const +EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } @@ -553,11 +561,11 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NRowsBlockXpr::value>::Type +typename NRowsBlockXpr::value>::Type #else -inline typename NRowsBlockXpr<...>::Type +typename NRowsBlockXpr<...>::Type #endif topRows(NRowsType n) { @@ -567,11 +575,11 @@ topRows(NRowsType n) /// This is the const version of topRows(NRowsType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstNRowsBlockXpr::value>::Type +const typename ConstNRowsBlockXpr::value>::Type #else -inline const typename ConstNRowsBlockXpr<...>::Type +const typename ConstNRowsBlockXpr<...>::Type #endif topRows(NRowsType n) const { @@ -595,16 +603,16 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type topRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type topRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } /// This is the const version of topRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type topRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type topRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } @@ -628,11 +636,11 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NRowsBlockXpr::value>::Type +typename NRowsBlockXpr::value>::Type #else -inline typename NRowsBlockXpr<...>::Type +typename NRowsBlockXpr<...>::Type #endif bottomRows(NRowsType n) { @@ -642,11 +650,11 @@ bottomRows(NRowsType n) /// This is the const version of bottomRows(NRowsType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstNRowsBlockXpr::value>::Type +const typename ConstNRowsBlockXpr::value>::Type #else -inline const typename ConstNRowsBlockXpr<...>::Type +const typename ConstNRowsBlockXpr<...>::Type #endif bottomRows(NRowsType n) const { @@ -670,16 +678,16 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type bottomRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type bottomRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } /// This is the const version of bottomRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } @@ -704,11 +712,11 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NRowsBlockXpr::value>::Type +typename NRowsBlockXpr::value>::Type #else -inline typename NRowsBlockXpr<...>::Type +typename NRowsBlockXpr<...>::Type #endif middleRows(Index startRow, NRowsType n) { @@ -718,11 +726,11 @@ middleRows(Index startRow, NRowsType n) /// This is the const version of middleRows(Index,NRowsType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstNRowsBlockXpr::value>::Type +const typename ConstNRowsBlockXpr::value>::Type #else -inline const typename ConstNRowsBlockXpr<...>::Type +const typename ConstNRowsBlockXpr<...>::Type #endif middleRows(Index startRow, NRowsType n) const { @@ -747,16 +755,16 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) { return typename NRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } /// This is the const version of middleRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } @@ -780,11 +788,11 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NColsBlockXpr::value>::Type +typename NColsBlockXpr::value>::Type #else -inline typename NColsBlockXpr<...>::Type +typename NColsBlockXpr<...>::Type #endif leftCols(NColsType n) { @@ -794,11 +802,11 @@ leftCols(NColsType n) /// This is the const version of leftCols(NColsType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstNColsBlockXpr::value>::Type +const typename ConstNColsBlockXpr::value>::Type #else -inline const typename ConstNColsBlockXpr<...>::Type +const typename ConstNColsBlockXpr<...>::Type #endif leftCols(NColsType n) const { @@ -822,16 +830,16 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type leftCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type leftCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, 0, rows(), n); } /// This is the const version of leftCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type leftCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type leftCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, 0, rows(), n); } @@ -855,11 +863,11 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NColsBlockXpr::value>::Type +typename NColsBlockXpr::value>::Type #else -inline typename NColsBlockXpr<...>::Type +typename NColsBlockXpr<...>::Type #endif rightCols(NColsType n) { @@ -869,11 +877,11 @@ rightCols(NColsType n) /// This is the const version of rightCols(NColsType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstNColsBlockXpr::value>::Type +const typename ConstNColsBlockXpr::value>::Type #else -inline const typename ConstNColsBlockXpr<...>::Type +const typename ConstNColsBlockXpr<...>::Type #endif rightCols(NColsType n) const { @@ -897,16 +905,16 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type rightCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type rightCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } /// This is the const version of rightCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type rightCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type rightCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } @@ -931,11 +939,11 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NColsBlockXpr::value>::Type +typename NColsBlockXpr::value>::Type #else -inline typename NColsBlockXpr<...>::Type +typename NColsBlockXpr<...>::Type #endif middleCols(Index startCol, NColsType numCols) { @@ -945,11 +953,11 @@ middleCols(Index startCol, NColsType numCols) /// This is the const version of middleCols(Index,NColsType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstNColsBlockXpr::value>::Type +const typename ConstNColsBlockXpr::value>::Type #else -inline const typename ConstNColsBlockXpr<...>::Type +const typename ConstNColsBlockXpr<...>::Type #endif middleCols(Index startCol, NColsType numCols) const { @@ -974,16 +982,16 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } /// This is the const version of middleCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } @@ -1015,16 +1023,16 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol) { return typename FixedBlockXpr::Type(derived(), startRow, startCol); } /// This is the const version of block<>(Index, Index). 
*/ template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); } @@ -1061,8 +1069,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); @@ -1070,7 +1078,8 @@ inline typename FixedBlockXpr::Type block(Index startRow, Index sta /// This is the const version of block<>(Index, Index, Index, Index). template -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); @@ -1084,15 +1093,15 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /** * \sa row(), class Block */ -EIGEN_DEVICE_FUNC -inline ColXpr col(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ColXpr col(Index i) { return ColXpr(derived(), i); } /// This is the const version of col(). 
-EIGEN_DEVICE_FUNC -inline ConstColXpr col(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); } @@ -1105,15 +1114,15 @@ inline ConstColXpr col(Index i) const EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /** * \sa col(), class Block */ -EIGEN_DEVICE_FUNC -inline RowXpr row(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +RowXpr row(Index i) { return RowXpr(derived(), i); } /// This is the const version of row(). */ -EIGEN_DEVICE_FUNC -inline ConstRowXpr row(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); } @@ -1140,11 +1149,11 @@ inline ConstRowXpr row(Index i) const /// \sa block(Index,Index,NRowsType,NColsType), fix, fix(int), class Block /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename FixedSegmentReturnType::value>::Type +typename FixedSegmentReturnType::value>::Type #else -inline typename FixedSegmentReturnType<...>::Type +typename FixedSegmentReturnType<...>::Type #endif segment(Index start, NType n) { @@ -1156,11 +1165,11 @@ segment(Index start, NType n) /// This is the const version of segment(Index,NType). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstFixedSegmentReturnType::value>::Type +const typename ConstFixedSegmentReturnType::value>::Type #else -inline const typename ConstFixedSegmentReturnType<...>::Type +const typename ConstFixedSegmentReturnType<...>::Type #endif segment(Index start, NType n) const { @@ -1190,11 +1199,11 @@ segment(Index start, NType n) const /// \sa class Block, block(Index,Index) /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename FixedSegmentReturnType::value>::Type +typename FixedSegmentReturnType::value>::Type #else -inline typename FixedSegmentReturnType<...>::Type +typename FixedSegmentReturnType<...>::Type #endif head(NType n) { @@ -1205,11 +1214,11 @@ head(NType n) /// This is the const version of head(NType). template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstFixedSegmentReturnType::value>::Type +const typename ConstFixedSegmentReturnType::value>::Type #else -inline const typename ConstFixedSegmentReturnType<...>::Type +const typename ConstFixedSegmentReturnType<...>::Type #endif head(NType n) const { @@ -1239,11 +1248,11 @@ head(NType n) const /// \sa class Block, block(Index,Index) /// template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename FixedSegmentReturnType::value>::Type +typename FixedSegmentReturnType::value>::Type #else -inline typename FixedSegmentReturnType<...>::Type +typename FixedSegmentReturnType<...>::Type #endif tail(NType n) { @@ -1254,11 +1263,11 @@ tail(NType n) /// This is the const version of tail(Index). 
template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE #ifndef EIGEN_PARSED_BY_DOXYGEN -inline const typename ConstFixedSegmentReturnType::value>::Type +const typename ConstFixedSegmentReturnType::value>::Type #else -inline const typename ConstFixedSegmentReturnType<...>::Type +const typename ConstFixedSegmentReturnType<...>::Type #endif tail(NType n) const { @@ -1284,8 +1293,8 @@ tail(NType n) const /// \sa segment(Index,NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type segment(Index start, Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), start, n); @@ -1293,8 +1302,8 @@ inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N /// This is the const version of segment(Index). template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), start, n); @@ -1316,8 +1325,8 @@ inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index /// \sa head(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type head(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type head(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), 0, n); @@ -1325,8 +1334,8 @@ inline typename FixedSegmentReturnType::Type head(Index n = N) /// This is the const version of head(). 
template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type head(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), 0, n); @@ -1348,8 +1357,8 @@ inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const /// \sa tail(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type tail(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type tail(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), size() - n); @@ -1357,8 +1366,8 @@ inline typename FixedSegmentReturnType::Type tail(Index n = N) /// This is the const version of tail. template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type tail(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); @@ -1367,18 +1376,21 @@ inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this /// is col-major (resp. row-major). /// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InnerVectorReturnType innerVector(Index outer) { return InnerVectorReturnType(derived(), outer); } /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this /// is col-major (resp. row-major). Read-only. /// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ConstInnerVectorReturnType innerVector(Index outer) const { return ConstInnerVectorReturnType(derived(), outer); } /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this /// is col-major (resp. row-major). 
/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) { @@ -1391,6 +1403,7 @@ innerVectors(Index outerStart, Index outerSize) /// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this /// is col-major (resp. row-major). Read-only. /// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const { @@ -1404,7 +1417,7 @@ innerVectors(Index outerStart, Index outerSize) const * \sa subVectors() */ template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::conditional::type subVector(Index i) { @@ -1413,7 +1426,7 @@ subVector(Index i) /** This is the const version of subVector(Index) */ template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::conditional::type subVector(Index i) const { @@ -1424,7 +1437,7 @@ subVector(Index i) const * \sa subVector(Index) */ template -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index subVectors() const { return (Direction==Vertical)?cols():rows(); } From 83309068b42c85d919aff53eb3652fa7896ed020 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Feb 2019 16:35:35 +0100 Subject: [PATCH 221/295] bug #1680: improve MSVC inlining by declaring many trivial constructors and accessors as STRONG_INLINE.
--- Eigen/src/Core/AssignEvaluator.h | 3 +- Eigen/src/Core/Block.h | 60 ++++++++++---------- Eigen/src/Core/CoreEvaluators.h | 91 +++++++++++++++++++----------- Eigen/src/Core/CwiseBinaryOp.h | 22 +++++--- Eigen/src/Core/DenseBase.h | 4 +- Eigen/src/Core/GeneralProduct.h | 7 ++- Eigen/src/Core/Matrix.h | 18 +++--- Eigen/src/Core/PlainObjectBase.h | 6 +- Eigen/src/Core/Product.h | 15 +++-- Eigen/src/Core/ProductEvaluators.h | 8 ++- Eigen/src/Core/Redux.h | 6 +- Eigen/src/Core/Swap.h | 9 +-- Eigen/src/Core/Transpose.h | 40 +++++++------ Eigen/src/Core/VectorBlock.h | 8 +-- Eigen/src/Core/util/Macros.h | 6 +- Eigen/src/Geometry/OrthoMethods.h | 5 +- 16 files changed, 181 insertions(+), 127 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 79575e1b4..229e25854 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -611,7 +611,8 @@ public: typedef typename AssignmentTraits::PacketType PacketType; - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) { #ifdef EIGEN_DEBUG_ASSIGN diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 11de45c2e..6e938ea58 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -114,8 +114,8 @@ template class /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index i) : Impl(xpr,i) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index i) : Impl(xpr,i) { eigen_assert( (i>=0) && ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i class /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) @@ -135,8 +135,8 @@ template class /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) @@ -159,10 +159,10 @@ class BlockImpl public: typedef Impl Base; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} EIGEN_DEVICE_FUNC - inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) {} }; @@ -294,22 +294,22 @@ template::type& nestedExpression() const { return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex startRow() const { return m_startRow.value(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex startCol() const { return m_startCol.value(); @@ -342,8 +342,8 @@ class BlockImpl_dense /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index i) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index i) : 
Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), BlockRows==1 ? 1 : xpr.rows(), @@ -357,8 +357,8 @@ class BlockImpl_dense /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { @@ -367,8 +367,8 @@ class BlockImpl_dense /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), @@ -377,18 +377,18 @@ class BlockImpl_dense init(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index innerStride() const { return internal::traits::HasSameStorageOrderAsXprType ? 
m_xpr.innerStride() @@ -396,19 +396,19 @@ class BlockImpl_dense } /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return m_outerStride; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex startRow() const { return m_startRow.value(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex startCol() const { return m_startCol.value(); @@ -422,8 +422,8 @@ class BlockImpl_dense #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal used by allowAligned() */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) : Base(data, blockRows, blockCols), m_xpr(xpr) { init(); @@ -431,7 +431,7 @@ class BlockImpl_dense #endif protected: - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void init() { m_outerStride = internal::traits::HasSameStorageOrderAsXprType diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index d5da5cdec..670fa77b5 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -90,7 +90,8 @@ template struct evaluator : public unary_evaluator { typedef unary_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const T& xpr) : Base(xpr) {} }; @@ -99,7 +100,7 @@ template struct evaluator : evaluator { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : evaluator(xpr) {} }; @@ -134,21 +135,25 @@ private: // this helper permits to completely eliminate m_outerStride if it is known at compiletime. 
template class plainobjectbase_evaluator_data { public: - EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { #ifndef EIGEN_INTERNAL_DEBUGGING EIGEN_UNUSED_VARIABLE(outerStride); #endif eigen_internal_assert(outerStride==OuterStride); } - EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return OuterStride; } const Scalar *data; }; template class plainobjectbase_evaluator_data { public: - EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} - EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return m_outerStride; } const Scalar *data; protected: Index m_outerStride; @@ -179,13 +184,15 @@ struct evaluator > : RowsAtCompileTime }; - EIGEN_DEVICE_FUNC evaluator() + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() : m_d(0,OuterStrideAtCompileTime) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const PlainObjectType& m) : m_d(m.data(),IsVectorAtCompileTime ? 
0 : m.outerStride()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -268,9 +275,11 @@ struct evaluator > { typedef Matrix XprType; - EIGEN_DEVICE_FUNC evaluator() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) : evaluator >(m) { } }; @@ -281,9 +290,11 @@ struct evaluator > { typedef Array XprType; - EIGEN_DEVICE_FUNC evaluator() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) : evaluator >(m) { } }; @@ -302,7 +313,8 @@ struct unary_evaluator, IndexBased> Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -712,7 +724,8 @@ struct evaluator > typedef CwiseBinaryOp XprType; typedef binary_evaluator > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& xpr) : Base(xpr) {} }; template @@ -740,7 +753,8 @@ struct binary_evaluator, IndexBased, IndexBase Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment,evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr) : m_d(xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit binary_evaluator(const XprType& xpr) : m_d(xpr) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -877,7 +891,8 @@ struct mapbase_evaluator : evaluator_base CoeffReadCost = NumTraits::ReadCost }; - EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map) 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit mapbase_evaluator(const XprType& map) : m_data(const_cast(map.data())), m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) @@ -941,10 +956,10 @@ struct mapbase_evaluator : evaluator_base internal::pstoret(m_data + index * m_innerStride.value(), x); } protected: - EIGEN_DEVICE_FUNC - inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } - EIGEN_DEVICE_FUNC - inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } PointerType m_data; const internal::variable_if_dynamic m_innerStride; @@ -997,7 +1012,8 @@ struct evaluator > Alignment = evaluator >::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& ref) : mapbase_evaluator(ref) { } }; @@ -1052,7 +1068,8 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& block) : block_evaluator_type(block) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1065,7 +1082,8 @@ struct block_evaluator XprType; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) : unary_evaluator(block) {} }; @@ -1076,7 +1094,8 @@ struct unary_evaluator, IndexBa { typedef Block XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const 
XprType& block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& block) : m_argImpl(block.nestedExpression()), m_startRow(block.startRow()), m_startCol(block.startCol()), @@ -1176,7 +1195,8 @@ struct block_evaluator XprType; typedef typename XprType::Scalar Scalar; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) : mapbase_evaluator(block) { // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime @@ -1204,7 +1224,8 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) @@ -1261,7 +1282,8 @@ struct unary_evaluator > Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& replicate) : m_arg(replicate.nestedExpression()), m_argImpl(m_arg), m_rows(replicate.nestedExpression().rows()), @@ -1341,7 +1363,8 @@ struct evaluator_wrapper_base Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; @@ -1408,7 +1431,8 @@ struct unary_evaluator > { typedef MatrixWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base 
>(wrapper.nestedExpression()) { } }; @@ -1419,7 +1443,8 @@ struct unary_evaluator > { typedef ArrayWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1461,7 +1486,8 @@ struct unary_evaluator > Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f. }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& reverse) : m_argImpl(reverse.nestedExpression()), m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1), m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) @@ -1568,7 +1594,8 @@ struct evaluator > Alignment = 0 }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& diagonal) : m_argImpl(diagonal.nestedExpression()), m_index(diagonal.index()) { } diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index bf2632d9e..a20cd4710 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -100,8 +100,12 @@ class CwiseBinaryOp : typedef typename internal::remove_reference::type _LhsNested; typedef typename internal::remove_reference::type _RhsNested; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) + //Required for Visual Studio or the Copy constructor will probably not get inlined! 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CwiseBinaryOp(const CwiseBinaryOp&) = default; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) { EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); @@ -110,16 +114,16 @@ class CwiseBinaryOp : eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rows() const { // return the fixed size type if available to enable compile time optimizations if (internal::traits::type>::RowsAtCompileTime==Dynamic) return m_rhs.rows(); else return m_lhs.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index cols() const { // return the fixed size type if available to enable compile time optimizations if (internal::traits::type>::ColsAtCompileTime==Dynamic) return m_rhs.cols(); @@ -128,13 +132,13 @@ class CwiseBinaryOp : } /** \returns the left hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; } /** \returns the right hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; } /** \returns the functor representing the binary operation */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; } protected: diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 406e66013..2289fe41f 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -415,7 +415,7 @@ template class DenseBase * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(const DenseBase& other) { 
EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); @@ -427,7 +427,7 @@ template class DenseBase * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(PlainObjectBase& other) { eigen_assert(rows()==other.rows() && cols()==other.cols()); diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 5bfcd6db8..bf7ef54b5 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -396,8 +396,8 @@ template<> struct gemv_dense_selector */ template template -EIGEN_DEVICE_FUNC -inline const Product +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const Product MatrixBase::operator*(const MatrixBase &other) const { // A note regarding the function declaration: In MSVC, this function will sometimes @@ -439,8 +439,9 @@ MatrixBase::operator*(const MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Product -EIGEN_DEVICE_FUNC MatrixBase::lazyProduct(const MatrixBase &other) const +MatrixBase::lazyProduct(const MatrixBase &other) const { enum { ProductIsValid = Derived::ColsAtCompileTime==Dynamic diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 32269ed2e..02c70803f 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -255,27 +255,27 @@ class Matrix * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix() : Base() + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix() : Base() { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } // FIXME is it still needed - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(internal::constructor_without_unaligned_array_assert) : Base(internal::constructor_without_unaligned_array_assert()) { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } #if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) : Base(std::move(other)) { Base::_check_template_params(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { other.swap(*this); @@ -287,16 +287,16 @@ class Matrix // This constructor is for both 1x1 matrices and dynamic vectors template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit Matrix(const T& x) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit Matrix(const T& x) { Base::_check_template_params(); Base::template _init1(x); } template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const T0& x, const T1& y) { Base::_check_template_params(); Base::template _init2(x, y); diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 2deaa5aab..bae186ecb 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -947,7 +947,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of same type it is enough to swap the data pointers. 
*/ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase & other) { enum { SwapPointers = internal::is_same::value && Base::SizeAtCompileTime==Dynamic }; @@ -958,7 +958,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \brief const version forwarded to DenseBase::swap */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase const & other) { Base::swap(other.derived()); } @@ -1092,7 +1092,7 @@ template struct matrix_swap_impl { EIGEN_DEVICE_FUNC - static inline void run(MatrixTypeA& a, MatrixTypeB& b) + static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b) { a.base().swap(b); } diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 70790dbd4..13d5662df 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -90,18 +90,23 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; - EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) { eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rows() const { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index cols() const { return m_rhs.cols(); } - EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } - EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const LhsNestedCleaned& lhs() const { return m_lhs; } + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE + const RhsNestedCleaned& rhs() const { return m_rhs; } protected: diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 246bca3e5..27796315d 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -582,7 +582,8 @@ struct product_evaluator, ProductTag, DenseShape, * which is why we don't set the LinearAccessBit. * TODO: this seems possible when the result is a vector */ - EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const CoeffReturnType coeff(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; @@ -590,6 +591,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const { PacketType res; @@ -601,6 +603,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 
0 : index; @@ -629,7 +632,8 @@ struct product_evaluator, LazyCoeffBasedProduc enum { Flags = Base::Flags | EvalBeforeNestingBit }; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : Base(BaseProduct(xpr.lhs(),xpr.rhs())) {} }; diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index e231a7d7d..2eef5abc5 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -359,7 +359,8 @@ class redux_evaluator : public internal::evaluator<_XprType> typedef internal::evaluator<_XprType> Base; public: typedef _XprType XprType; - EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -375,11 +376,12 @@ public: InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime }; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? 
inner : outer); } diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index d70200918..180a4e5ad 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -30,12 +30,13 @@ public: typedef typename Base::DstXprType DstXprType; typedef swap_assign_op Functor; - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} template - void assignPacket(Index row, Index col) + EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) { PacketType tmp = m_src.template packet(row,col); const_cast(m_src).template writePacket(row,col, m_dst.template packet(row,col)); @@ -43,7 +44,7 @@ public: } template - void assignPacket(Index index) + EIGEN_STRONG_INLINE void assignPacket(Index index) { PacketType tmp = m_src.template packet(index); const_cast(m_src).template writePacket(index, m_dst.template packet(index)); @@ -52,7 +53,7 @@ public: // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael) template - void assignPacketByOuterInner(Index outer, Index inner) + EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { Index row = Base::rowIndexByOuterInner(outer, inner); Index col = Base::colIndexByOuterInner(outer, inner); diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 91a9ab1b9..c513f7f7c 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -61,25 +61,27 @@ template class Transpose typedef typename internal::remove_all::type NestedExpression; EIGEN_DEVICE_FUNC - explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {} + explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {} 
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose) - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rows() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index cols() const { return m_matrix.rows(); } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::remove_reference::type& nestedExpression() { return m_matrix; } /** \internal */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index nrows, Index ncols) { m_matrix.resize(ncols,nrows); } @@ -123,8 +125,10 @@ template class TransposeImpl EIGEN_DENSE_PUBLIC_INTERFACE(Transpose) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl) - EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index innerStride() const { return derived().nestedExpression().innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return derived().nestedExpression().outerStride(); } typedef typename internal::conditional< internal::is_lvalue::value, @@ -132,18 +136,20 @@ template class TransposeImpl const Scalar >::type ScalarWithConstIfNotLvalue; - EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } - EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar* data() const { return derived().nestedExpression().data(); } // FIXME: shall we keep the const version of coeffRef? - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index rowId, Index colId) const { return derived().nestedExpression().coeffRef(colId, rowId); } - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index index) const { return derived().nestedExpression().coeffRef(index); } @@ -169,7 +175,8 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -EIGEN_DEVICE_FUNC inline Transpose +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -181,7 +188,8 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -EIGEN_DEVICE_FUNC inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index 0ede5d58e..71c5b95ee 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -71,8 +71,8 @@ template class VectorBlock /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start, Index size) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start, Index size) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 
1 : size) @@ -82,8 +82,8 @@ template class VectorBlock /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 3a8001e8f..df88aa2c0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -866,7 +866,7 @@ // Suppresses 'unused variable' warnings. namespace Eigen { namespace internal { - template EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {} } } #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); @@ -1047,7 +1047,7 @@ namespace Eigen { #endif #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME))\ (METHOD)(const T& scalar) const { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -1056,7 +1056,7 @@ namespace Eigen { } #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline friend \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME)) \ (METHOD)(const T& scalar, const StorageBaseType& matrix) { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ diff --git a/Eigen/src/Geometry/OrthoMethods.h b/Eigen/src/Geometry/OrthoMethods.h index a035e6310..524aebe1b 100644 --- a/Eigen/src/Geometry/OrthoMethods.h +++ 
b/Eigen/src/Geometry/OrthoMethods.h @@ -27,9 +27,10 @@ namespace Eigen { template template #ifndef EIGEN_PARSED_BY_DOXYGEN -EIGEN_DEVICE_FUNC inline typename MatrixBase::template cross_product_return_type::type +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename MatrixBase::template cross_product_return_type::type #else -inline typename MatrixBase::PlainObject +typename MatrixBase::PlainObject #endif MatrixBase::cross(const MatrixBase& other) const { From 902a7793f79d7638d8de05a091682dddf34530d1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Feb 2019 16:52:34 +0100 Subject: [PATCH 222/295] Add possibility to bench row-major lhs and rhs --- bench/bench_gemm.cpp | 82 +++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 7c6dbea61..78ca1cd13 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -11,8 +11,9 @@ // #include -#include #include +#include + using namespace std; using namespace Eigen; @@ -30,10 +31,22 @@ using namespace Eigen; #define SCALARB SCALAR #endif +#ifdef ROWMAJ_A +const int opt_A = RowMajor; +#else +const int opt_A = ColMajor; +#endif + +#ifdef ROWMAJ_B +const int opt_B = RowMajor; +#else +const int opt_B = ColMajor; +#endif + typedef SCALAR Scalar; typedef NumTraits::Real RealScalar; -typedef Matrix A; -typedef Matrix B; +typedef Matrix A; +typedef Matrix B; typedef Matrix C; typedef Matrix M; @@ -58,45 +71,61 @@ static char lower = 'L'; static char right = 'R'; static int intone = 1; -void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) +#ifdef ROWMAJ_A +const char transA = trans; +#else +const char transA = notrans; +#endif + +#ifdef ROWMAJ_B +const char transB = trans; +#else +const char transB = notrans; +#endif + +template +void blas_gemm(const A& a, const B& b, MatrixXf& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = 
a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - sgemm_(&notrans,&notrans,&M,&N,&K,&fone, + sgemm_(&transA,&transB,&M,&N,&K,&fone, const_cast(a.data()),&lda, const_cast(b.data()),&ldb,&fone, c.data(),&ldc); } -EIGEN_DONT_INLINE void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) +template +void blas_gemm(const A& a, const B& b, MatrixXd& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - dgemm_(&notrans,&notrans,&M,&N,&K,&done, + dgemm_(&transA,&transB,&M,&N,&K,&done, const_cast(a.data()),&lda, const_cast(b.data()),&ldb,&done, c.data(),&ldc); } -void blas_gemm(const MatrixXcf& a, const MatrixXcf& b, MatrixXcf& c) +template +void blas_gemm(const A& a, const B& b, MatrixXcf& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - cgemm_(&notrans,&notrans,&M,&N,&K,(float*)&cfone, + cgemm_(&transA,&transB,&M,&N,&K,(float*)&cfone, const_cast((const float*)a.data()),&lda, const_cast((const float*)b.data()),&ldb,(float*)&cfone, (float*)c.data(),&ldc); } -void blas_gemm(const MatrixXcd& a, const MatrixXcd& b, MatrixXcd& c) +template +void blas_gemm(const A& a, const B& b, MatrixXcd& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - zgemm_(&notrans,&notrans,&M,&N,&K,(double*)&cdone, + zgemm_(&transA,&transB,&M,&N,&K,(double*)&cdone, const_cast((const double*)a.data()),&lda, const_cast((const double*)b.data()),&ldb,(double*)&cdone, (double*)c.data(),&ldc); @@ -127,6 +156,8 @@ void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci) ci.noalias() += ai * b; } + + template EIGEN_DONT_INLINE void gemm(const A& a, const B& 
b, C& c) { @@ -180,8 +211,8 @@ int main(int argc, char ** argv) } else if(argv[i][1]=='t') { + tries = atoi(argv[++i]); ++i; - tries = atoi(argv[i++]); } else if(argv[i][1]=='p') { @@ -217,7 +248,7 @@ int main(int argc, char ** argv) std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n"; std::ptrdiff_t mc(m), nc(n), kc(p); internal::computeProductBlockingSizes(kc, mc, nc); - std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << "\n"; + std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << " x " << nc << "\n"; C r = c; @@ -241,7 +272,7 @@ int main(int argc, char ** argv) blas_gemm(a,b,r); c.noalias() += a * b; if(!r.isApprox(c)) { - std::cout << (r - c).norm() << "\n"; + std::cout << (r - c).norm()/r.norm() << "\n"; std::cerr << "Warning, your product is crap!\n\n"; } #else @@ -250,7 +281,7 @@ int main(int argc, char ** argv) gemm(a,b,c); r.noalias() += a.cast() .lazyProduct( b.cast() ); if(!r.isApprox(c)) { - std::cout << (r - c).norm() << "\n"; + std::cout << (r - c).norm()/r.norm() << "\n"; std::cerr << "Warning, your product is crap!\n\n"; } } @@ -264,6 +295,9 @@ int main(int argc, char ** argv) std::cout << "blas real " << tblas.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n"; #endif + // warm start + if(b.norm()+a.norm()==123.554) std::cout << "\n"; + BenchTimer tmt; c = rc; BENCH(tmt, tries, rep, gemm(a,b,c)); @@ -286,11 +320,11 @@ int main(int argc, char ** argv) if(1.*m*n*p<30*30*30) { - BenchTimer tmt; - c = rc; - BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); - std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; - std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; + BenchTimer tmt; + 
c = rc; + BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); + std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; + std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; } #ifdef DECOUPLED From ec032ac03b90dc6c58680a4dc858133e9a72fd1f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 16 Feb 2019 09:44:05 +0100 Subject: [PATCH 223/295] Guard C++11-style default constructor. Also, this is only needed for MSVC --- Eigen/src/Core/CwiseBinaryOp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index a20cd4710..8b8de8382 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -100,9 +100,11 @@ class CwiseBinaryOp : typedef typename internal::remove_reference::type _LhsNested; typedef typename internal::remove_reference::type _RhsNested; +#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 //Required for Visual Studio or the Copy constructor will probably not get inlined! EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CwiseBinaryOp(const CwiseBinaryOp&) = default; +#endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) From 512b74aaa19fa12a05774dd30205d2c97e8bdef9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 18 Feb 2019 11:47:54 +0100 Subject: [PATCH 224/295] GEMM: catch all scalar-multiple variants when falling-back to a coeff-based product. Before only s*A*B was caught which was both inconsistent with GEMM, sub-optimal, and could even lead to compilation-errors (https://stackoverflow.com/questions/54738495). 
--- Eigen/src/Core/ProductEvaluators.h | 63 ++++++++++++++++++++---------- Eigen/src/Core/util/BlasUtil.h | 14 ++++++- test/product_notemporary.cpp | 38 ++++++++++++++++++ 3 files changed, 92 insertions(+), 23 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 27796315d..60b79b855 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -411,35 +411,56 @@ struct generic_product_impl call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } - // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: - // dst {,+,-}= s * (A.lazyProduct(B)) - // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. - // For them, this strategy is also faster than simply by-passing the heap allocation through - // stack allocation. - // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, - // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only, - // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h - template + // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance: + // dst {,+,-}= (s1*A)*(B*s2) + // will be rewritten as: + // dst {,+,-}= (s1*s2) * (A.lazyProduct(B)) + // There are at least four benefits of doing so: + // 1 - huge performance gain for heap-allocated matrix types as it save costly allocations. + // 2 - it is faster than simply by-passing the heap allocation through stack allocation. + // 3 - it makes this fallback consistent with the heavy GEMM routine. + // 4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices. 
+ // (see https://stackoverflow.com/questions/54738495) + // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently + // enabled only when falling back from the main GEMM. + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic(Dst& dst, const CwiseBinaryOp, - const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func) { - call_restricted_packet_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + enum { + HasScalarFactor = blas_traits::HasScalarFactor || blas_traits::HasScalarFactor + }; + // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto + // this is important for real*complex_mat + Scalar actualAlpha = blas_traits::extractScalarFactor(lhs) + * blas_traits::extractScalarFactor(rhs); + eval_dynamic_impl(dst, + blas_traits::extract(lhs), + blas_traits::extract(rhs), + func, + actualAlpha, + typename conditional::type()); + + } - // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above - // overload more specialized. 
- template +protected: + + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& /* s == 1 */, false_type) { call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); } - - -// template -// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) -// { dst.noalias() += alpha * lhs.lazyProduct(rhs); } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type) + { + call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func); + } }; // This specialization enforces the use of a coefficient-based evaluation strategy diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index a32630ed7..bc0a01540 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -274,7 +274,8 @@ template struct blas_traits HasUsableDirectAccess = ( (int(XprType::Flags)&DirectAccessBit) && ( bool(XprType::IsVectorAtCompileTime) || int(inner_stride_at_compile_time::ret) == 1) - ) ? 1 : 0 + ) ? 
1 : 0, + HasScalarFactor = false }; typedef typename conditional struct blas_traits, const CwiseNullaryOp,Plain>, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; @@ -317,6 +321,9 @@ template struct blas_traits, NestedXpr, const CwiseNullaryOp,Plain> > > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, NestedXpr, const CwiseNullaryOp,Plain> > XprType; typedef typename Base::ExtractType ExtractType; @@ -335,6 +342,9 @@ template struct blas_traits, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseUnaryOp, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; @@ -358,7 +368,7 @@ struct blas_traits > typename ExtractType::PlainObject >::type DirectLinearAccessType; enum { - IsTransposed = Base::IsTransposed ? 0 : 1 + IsTransposed = Base::IsTransposed ? 
0 : 1, }; static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); } static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); } diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index dffb07608..7f169e6ae 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -11,6 +11,35 @@ #include "main.h" +template +void check_scalar_multiple3(Dst &dst, const Lhs& A, const Rhs& B) +{ + VERIFY_EVALUATION_COUNT( (dst.noalias() = A * B), 0); + VERIFY_IS_APPROX( dst, (A.eval() * B.eval()).eval() ); + VERIFY_EVALUATION_COUNT( (dst.noalias() += A * B), 0); + VERIFY_IS_APPROX( dst, 2*(A.eval() * B.eval()).eval() ); + VERIFY_EVALUATION_COUNT( (dst.noalias() -= A * B), 0); + VERIFY_IS_APPROX( dst, (A.eval() * B.eval()).eval() ); +} + +template +void check_scalar_multiple2(Dst &dst, const Lhs& A, const Rhs& B, S2 s2) +{ + CALL_SUBTEST( check_scalar_multiple3(dst, A, B) ); + CALL_SUBTEST( check_scalar_multiple3(dst, A, -B) ); + CALL_SUBTEST( check_scalar_multiple3(dst, A, s2*B) ); + CALL_SUBTEST( check_scalar_multiple3(dst, A, B*s2) ); +} + +template +void check_scalar_multiple1(Dst &dst, const Lhs& A, const Rhs& B, S1 s1, S2 s2) +{ + CALL_SUBTEST( check_scalar_multiple2(dst, A, B, s2) ); + CALL_SUBTEST( check_scalar_multiple2(dst, -A, B, s2) ); + CALL_SUBTEST( check_scalar_multiple2(dst, s1*A, B, s2) ); + CALL_SUBTEST( check_scalar_multiple2(dst, A*s1, B, s2) ); +} + template void product_notemporary(const MatrixType& m) { /* This test checks the number of temporaries created @@ -148,6 +177,15 @@ template void product_notemporary(const MatrixType& m) // Check nested products VERIFY_EVALUATION_COUNT( cvres.noalias() = m1.adjoint() * m1 * cv1, 1 ); VERIFY_EVALUATION_COUNT( rvres.noalias() = rv1 * (m1 * m2.adjoint()), 1 ); + + // exhaustively check all scalar multiple combinations: + { + // Generic path: + 
check_scalar_multiple1(m3, m1, m2, s1, s2); + // Force fall back to coeff-based: + typename ColMajorMatrixType::BlockXpr m3_blck = m3.block(r0,r0,1,1); + check_scalar_multiple1(m3_blck, m1.block(r0,c0,1,1), m2.block(c0,r0,1,1), s1, s2); + } } EIGEN_DECLARE_TEST(product_notemporary) From c69d0d08d0d71c779a245babe80342f0cf1ea985 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 18 Feb 2019 14:43:07 +0100 Subject: [PATCH 225/295] Set cost of conjugate to 0 (in practice it boils down to a no-op). This is also important to make sure that A.conjugate() * B.conjugate() does not evaluate its arguments into temporaries (e.g., if A and B are fixed and small, or * fall back to lazyProduct) --- Eigen/src/Core/functors/UnaryFunctors.h | 10 +++++++++- test/product_notemporary.cpp | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 55994047e..1d5eb3678 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -117,7 +117,15 @@ template struct functor_traits > { enum { - Cost = NumTraits::IsComplex ? NumTraits::AddCost : 0, + Cost = 0, + // Yes the cost is zero even for complexes because in most cases for which + // the cost is used, conjugation turns to be a no-op. Some examples: + // cost(a*conj(b)) == cost(a*b) + // cost(a+conj(b)) == cost(a+b) + // ::HasConj }; }; diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index 7f169e6ae..8b6419d0c 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -134,7 +134,9 @@ template void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( m3.noalias() = m1.block(r0,r0,r1,r1).template triangularView() * m2.block(r0,c0,r1,c1), 1); // Zero temporaries for lazy products ... 
+ m3.setRandom(rows,cols); VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) / (m3.transpose().lazyProduct(m3)).diagonal().sum(), 0 ); + VERIFY_EVALUATION_COUNT( m3.noalias() = m1.conjugate().lazyProduct(m2.conjugate()), 0); // ... and even no temporary for even deeply (>=2) nested products VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) / (m3.transpose() * m3).diagonal().sum(), 0 ); From 31b6e080a9235f6275012ac69f0938b211444edf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 18 Feb 2019 14:45:55 +0100 Subject: [PATCH 226/295] Fix regression: .conjugate() was popped out but not re-introduced. --- Eigen/src/Core/ProductEvaluators.h | 14 ++++++++------ test/product_notemporary.cpp | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 60b79b855..d53dc30a3 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -430,28 +430,30 @@ struct generic_product_impl void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func) { enum { - HasScalarFactor = blas_traits::HasScalarFactor || blas_traits::HasScalarFactor + HasScalarFactor = blas_traits::HasScalarFactor || blas_traits::HasScalarFactor, + ConjLhs = blas_traits::NeedToConjugate, + ConjRhs = blas_traits::NeedToConjugate }; // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto // this is important for real*complex_mat Scalar actualAlpha = blas_traits::extractScalarFactor(lhs) * blas_traits::extractScalarFactor(rhs); eval_dynamic_impl(dst, - blas_traits::extract(lhs), - blas_traits::extract(rhs), + blas_traits::extract(lhs).template conjugateIf(), + blas_traits::extract(rhs).template conjugateIf(), func, actualAlpha, typename conditional::type()); - - } protected: template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func 
&func, const Scalar& /* s == 1 */, false_type) + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type) { + EIGEN_UNUSED_VARIABLE(s); + eigen_internal_assert(s==Scalar(1)); call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); } diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index 8b6419d0c..20cb7c080 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -29,6 +29,7 @@ void check_scalar_multiple2(Dst &dst, const Lhs& A, const Rhs& B, S2 s2) CALL_SUBTEST( check_scalar_multiple3(dst, A, -B) ); CALL_SUBTEST( check_scalar_multiple3(dst, A, s2*B) ); CALL_SUBTEST( check_scalar_multiple3(dst, A, B*s2) ); + CALL_SUBTEST( check_scalar_multiple3(dst, A, (B*s2).conjugate()) ); } template @@ -38,6 +39,7 @@ void check_scalar_multiple1(Dst &dst, const Lhs& A, const Rhs& B, S1 s1, S2 s2) CALL_SUBTEST( check_scalar_multiple2(dst, -A, B, s2) ); CALL_SUBTEST( check_scalar_multiple2(dst, s1*A, B, s2) ); CALL_SUBTEST( check_scalar_multiple2(dst, A*s1, B, s2) ); + CALL_SUBTEST( check_scalar_multiple2(dst, (A*s1).conjugate(), B, s2) ); } template void product_notemporary(const MatrixType& m) From 796db94e6e82548a7594c00b4ae83efbe76baffc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 18 Feb 2019 16:21:27 +0100 Subject: [PATCH 227/295] bug #1194: implement slightly faster and SIMD friendly 4x4 determinant. 
--- Eigen/src/LU/Determinant.h | 43 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index 6af63a6e7..3a41e6fcb 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -23,15 +23,6 @@ inline const typename Derived::Scalar bruteforce_det3_helper * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b)); } -template -EIGEN_DEVICE_FUNC -const typename Derived::Scalar bruteforce_det4_helper -(const MatrixBase& matrix, int j, int k, int m, int n) -{ - return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1)) - * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3)); -} - template struct determinant_impl @@ -75,16 +66,34 @@ template struct determinant_impl template struct determinant_impl { + typedef typename traits::Scalar Scalar; static EIGEN_DEVICE_FUNC - typename traits::Scalar run(const Derived& m) + Scalar run(const Derived& m) { - // trick by Martin Costabel to compute 4x4 det with only 30 muls - return bruteforce_det4_helper(m,0,1,2,3) - - bruteforce_det4_helper(m,0,2,1,3) - + bruteforce_det4_helper(m,0,3,1,2) - + bruteforce_det4_helper(m,1,2,0,3) - - bruteforce_det4_helper(m,1,3,0,2) - + bruteforce_det4_helper(m,2,3,0,1); + Scalar d2_01 = det2(m, 0, 1); + Scalar d2_02 = det2(m, 0, 2); + Scalar d2_03 = det2(m, 0, 3); + Scalar d2_12 = det2(m, 1, 2); + Scalar d2_13 = det2(m, 1, 3); + Scalar d2_23 = det2(m, 2, 3); + Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12); + Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02); + Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01); + Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01); + return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) + + internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3); + } +protected: + static EIGEN_DEVICE_FUNC + Scalar det2(const Derived& m, Index i0, Index i1) + { + return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1); + 
} + + static EIGEN_DEVICE_FUNC + Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2) + { + return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2)); } }; From e23bf40dc24132f74745ccf4439efcc770daadd3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 18 Feb 2019 22:03:47 +0100 Subject: [PATCH 228/295] Add unit test for LinSpaced and complex numbers. --- test/nullary.cpp | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/test/nullary.cpp b/test/nullary.cpp index 1df3693d6..9b25ea4f3 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -70,7 +70,7 @@ void testVectorType(const VectorType& base) Scalar high = internal::random(-500,500); Scalar low = (size == 1 ? high : internal::random(-500,500)); - if (low>high) std::swap(low,high); + if (numext::real(low)>numext::real(high)) std::swap(low,high); // check low==high if(internal::random(0.f,1.f)<0.05f) @@ -79,7 +79,7 @@ void testVectorType(const VectorType& base) else if(size>2 && std::numeric_limits::max_exponent10>0 && internal::random(0.f,1.f)<0.1f) low = -internal::random(1,2) * RealScalar(std::pow(RealScalar(10),std::numeric_limits::max_exponent10/2)); - const Scalar step = ((size == 1) ? 1 : (high-low)/(size-1)); + const Scalar step = ((size == 1) ? 
1 : (high-low)/RealScalar(size-1)); // check whether the result yields what we expect it to do VectorType m(base); @@ -89,21 +89,22 @@ void testVectorType(const VectorType& base) { VectorType n(size); for (int i=0; i::IsInteger) || ((high-low)>=size && (Index(high-low)%(size-1))==0) || (Index(high-low+1)::IsInteger) || (range_length>=size && (Index(range_length)%(size-1))==0) || (Index(range_length+1)::IsInteger) || (high-low>=size)) + if((!NumTraits::IsInteger) || (range_length>=size)) for (int i=0; i= low).all() ); + VERIFY( numext::real(m(m.size()-1)) <= numext::real(high) ); + VERIFY( (m.array().real() <= numext::real(high)).all() ); + VERIFY( (m.array().real() >= numext::real(low)).all() ); - VERIFY( m(m.size()-1) >= low ); + VERIFY( numext::real(m(m.size()-1)) >= numext::real(low) ); if(size>=1) { VERIFY( internal::isApprox(m(0),low) ); @@ -135,7 +136,7 @@ void testVectorType(const VectorType& base) col_vector.setLinSpaced(size,low,high); // when using the extended precision (e.g., FPU) the relative error might exceed 1 bit // when computing the squared sum in isApprox, thus the 2x factor. - VERIFY( row_vector.isApprox(col_vector.transpose(), Scalar(2)*NumTraits::epsilon())); + VERIFY( row_vector.isApprox(col_vector.transpose(), RealScalar(2)*NumTraits::epsilon())); Matrix size_changer(size+50); size_changer.setLinSpaced(size,low,high); @@ -157,18 +158,18 @@ void testVectorType(const VectorType& base) { Index n0 = VectorType::SizeAtCompileTime==Dynamic ? 
0 : VectorType::SizeAtCompileTime; low = internal::random(); - m = VectorType::LinSpaced(n0,low,low-1); + m = VectorType::LinSpaced(n0,low,low-RealScalar(1)); VERIFY(m.size()==n0); if(VectorType::SizeAtCompileTime==Dynamic) { VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,0,Scalar(n0-1)).sum(),Scalar(0)); - VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,low,low-1).sum(),Scalar(0)); + VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,low,low-RealScalar(1)).sum(),Scalar(0)); } m.setLinSpaced(n0,0,Scalar(n0-1)); VERIFY(m.size()==n0); - m.setLinSpaced(n0,low,low-1); + m.setLinSpaced(n0,low,low-RealScalar(1)); VERIFY(m.size()==n0); // empty range only: @@ -178,16 +179,16 @@ void testVectorType(const VectorType& base) if(NumTraits::IsInteger) { - VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+size-1)), VectorType::LinSpaced(size,Scalar(low+size-1),low).reverse() ); + VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,low+Scalar(size-1)), VectorType::LinSpaced(size,low+Scalar(size-1),low).reverse() ); if(VectorType::SizeAtCompileTime==Dynamic) { // Check negative multiplicator path: for(Index k=1; k<5; ++k) - VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+(size-1)*k)), VectorType::LinSpaced(size,Scalar(low+(size-1)*k),low).reverse() ); + VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,low+Scalar((size-1)*k)), VectorType::LinSpaced(size,low+Scalar((size-1)*k),low).reverse() ); // Check negative divisor path: for(Index k=1; k<5; ++k) - VERIFY_IS_APPROX( VectorType::LinSpaced(size*k,low,Scalar(low+size-1)), VectorType::LinSpaced(size*k,Scalar(low+size-1),low).reverse() ); + VERIFY_IS_APPROX( VectorType::LinSpaced(size*k,low,low+Scalar(size-1)), VectorType::LinSpaced(size*k,low+Scalar(size-1),low).reverse() ); } } } @@ -318,6 +319,7 @@ EIGEN_DECLARE_TEST(nullary) CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random(1,300),internal::random(1,300))) ); for(int i = 0; i < g_repeat*10; i++) { + CALL_SUBTEST_3( testVectorType(VectorXcd(internal::random(1,30000))) ); 
CALL_SUBTEST_4( testVectorType(VectorXd(internal::random(1,30000))) ); CALL_SUBTEST_5( testVectorType(Vector4d()) ); // regression test for bug 232 CALL_SUBTEST_6( testVectorType(Vector3d()) ); From 7580112c3168e6e436204ac4e584a86c3ab53498 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 18 Feb 2019 22:12:28 +0100 Subject: [PATCH 229/295] Fix harmless Scalar vs RealScalar cast. --- Eigen/src/Core/functors/NullaryFunctors.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index f5888abf0..16b645f91 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -42,14 +42,15 @@ template struct linspaced_op_impl; template struct linspaced_op_impl { + typedef typename NumTraits::Real RealScalar; + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : - m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), + m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/RealScalar(num_steps-1)), m_flip(numext::abs(high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { - typedef typename NumTraits::Real RealScalar; if(m_flip) return (i==0)? 
m_low : (m_high - RealScalar(m_size1-i)*m_step); else From 7d10c787386c8b789d219392028ad38d36d5f82a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 10:31:56 +0100 Subject: [PATCH 230/295] bug #1046: add unit tests for correct propagation of alignment through std::alignment_of --- test/dense_storage.cpp | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp index 1150ec52b..7fa25859d 100644 --- a/test/dense_storage.cpp +++ b/test/dense_storage.cpp @@ -52,6 +52,32 @@ void dense_storage_assignment() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } +template +void dense_storage_alignment() +{ + #if EIGEN_HAS_ALIGNAS + + struct alignas(Alignment) Empty1 {}; + VERIFY_IS_EQUAL(std::alignment_of::value, Alignment); + + struct EIGEN_ALIGN_TO_BOUNDARY(Alignment) Empty2 {}; + VERIFY_IS_EQUAL(std::alignment_of::value, Alignment); + + struct Nested1 { EIGEN_ALIGN_TO_BOUNDARY(Alignment) T data[Size]; }; + VERIFY_IS_EQUAL(std::alignment_of::value, Alignment); + + VERIFY_IS_EQUAL( (std::alignment_of >::value), Alignment); + + const std::size_t default_alignment = internal::compute_default_alignment::value; + + VERIFY_IS_EQUAL( (std::alignment_of >::value), default_alignment); + VERIFY_IS_EQUAL( (std::alignment_of >::value), default_alignment); + struct Nested2 { Matrix mat; }; + VERIFY_IS_EQUAL(std::alignment_of::value, default_alignment); + + #endif +} + EIGEN_DECLARE_TEST(dense_storage) { dense_storage_copy(); @@ -72,5 +98,10 @@ EIGEN_DECLARE_TEST(dense_storage) dense_storage_assignment(); dense_storage_assignment(); dense_storage_assignment(); - dense_storage_assignment(); + dense_storage_assignment(); + + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); } From 115da6a1ea6d29fbe432af9486090a74bc7de9b8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 14:00:15 +0100 
Subject: [PATCH 231/295] Fix conversion warnings --- Eigen/src/LU/PartialPivLU.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index cba7a0fcf..b8938013a 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -369,8 +369,8 @@ struct partial_lu_impl Index first_zero_pivot = -1; for(Index k = 0; k < endk; ++k) { - Index rrows = rows-k-1; - Index rcols = cols-k-1; + int rrows = internal::convert_index(rows-k-1); + int rcols = internal::convert_index(cols-k-1); Index row_of_biggest_in_col; Score biggest_in_corner From ab78cabd39a09dc8e30b1d522fae67ab90c6802d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 14:04:35 +0100 Subject: [PATCH 232/295] Add C++17 detection macro, and make sure throw(xpr) is not used if the compiler is in c++17 mode. --- Eigen/src/Core/util/Macros.h | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index df88aa2c0..fd3e43fa0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -508,13 +508,33 @@ #define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 #endif -#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) + +// The macro EIGEN_COMP_CXXVER defines the c++ version expected by the compiler. +// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER +// is defined to 17. +#if (defined(__cplusplus) && (__cplusplus > 201402L) || EIGEN_COMP_MSVC >= 1914) +#define EIGEN_COMP_CXXVER 17 +#elif (defined(__cplusplus) && (__cplusplus > 201103L) || EIGEN_COMP_MSVC >= 1910) +#define EIGEN_COMP_CXXVER 14 +#elif (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) +#define EIGEN_COMP_CXXVER 11 +#else +#define EIGEN_COMP_CXXVER 03 +#endif + + +// The macros EIGEN_HAS_CXX??
define a rough estimate of available c++ features +// but in practice we should not rely on them but rather on the availability of +// individual features as defined later. +// This is why there is no EIGEN_HAS_CXX17. +// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11. +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11 #define EIGEN_HAS_CXX11 1 #else #define EIGEN_HAS_CXX11 0 #endif -#if EIGEN_MAX_CPP_VER>=14 && (defined(__cplusplus) && (__cplusplus > 201103L) || EIGEN_COMP_MSVC >= 1910) +#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14 #define EIGEN_HAS_CXX14 1 #else #define EIGEN_HAS_CXX14 0 @@ -1105,9 +1125,9 @@ namespace Eigen { # define EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NO_THROW throw() -# if EIGEN_COMP_MSVC +# if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17 // MSVC does not support exception specifications (warning C4290), - // and they are deprecated in c++11 anyway. + // and they are deprecated in c++11 anyway. This is even an error in c++17. # define EIGEN_EXCEPTION_SPEC(X) throw() # else # define EIGEN_EXCEPTION_SPEC(X) throw(X) From 2cfc025bdaf36f3e4693a48db48ba2366b281cfc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 14:05:22 +0100 Subject: [PATCH 233/295] fix unit compilation in c++17: std::ptr_fun has been removed.
--- test/indexed_view.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 6d6ef0cd4..5f1e01fc8 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -95,7 +95,11 @@ void check_indexed_view() ArrayXd a = ArrayXd::LinSpaced(n,0,n-1); Array b = a.transpose(); - ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ptr_fun(encode)); + #if EIGEN_COMP_CXXVER>=14 + ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ref(encode)); + #else + ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ptr_fun(&encode)); + #endif for(Index i=0; i Date: Tue, 19 Feb 2019 14:32:25 +0100 Subject: [PATCH 234/295] Commas at the end of enumerator lists are not allowed in C++03 --- Eigen/src/Core/util/BlasUtil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index bc0a01540..e6689c656 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -368,7 +368,7 @@ struct blas_traits > typename ExtractType::PlainObject >::type DirectLinearAccessType; enum { - IsTransposed = Base::IsTransposed ? 0 : 1, + IsTransposed = Base::IsTransposed ? 0 : 1 }; static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); } static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); } From 071629a4403b33072ee9121d762cf0f8ca7a593a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 19 Feb 2019 10:49:54 -0800 Subject: [PATCH 235/295] Fix incorrect value of NumDimensions in TensorContraction traits. 
Reported here: #1671 --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 61a4e1a3a..6ca881f27 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -102,7 +102,7 @@ struct traits::type _RhsNested; // From NumDims below. - static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; typedef typename conditional::val, typename traits::PointerType, typename traits::PointerType>::type PointerType; From 292d61970a025f08daac53a20f6ac8fc7c00574e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 21:59:41 +0100 Subject: [PATCH 236/295] Fix C++17 compilation --- test/sparse.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/sparse.h b/test/sparse.h index 9912e1e24..7d753fdba 100644 --- a/test/sparse.h +++ b/test/sparse.h @@ -14,7 +14,12 @@ #include "main.h" -#if EIGEN_GNUC_AT_LEAST(4,0) && !defined __ICC && !defined(__clang__) +#if EIGEN_HAS_CXX11 + +#include +#define EIGEN_UNORDERED_MAP_SUPPORT + +#elif EIGEN_GNUC_AT_LEAST(4,0) && !defined __ICC && !defined(__clang__) #ifdef min #undef min From 9ac1634fdff94bf18b534066eb0e3029ac182fe2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 21:59:53 +0100 Subject: [PATCH 237/295] Fix conversion warnings --- unsupported/Eigen/src/SparseExtra/RandomSetter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h index ee97299af..7542cf764 100644 --- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h 
+++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h @@ -249,10 +249,10 @@ class RandomSetter } } // prefix sum - Index count = 0; + StorageIndex count = 0; for (Index j=0; jouterSize(); ++j) { - Index tmp = positions[j]; + StorageIndex tmp = positions[j]; mp_target->outerIndexPtr()[j] = count; positions[j] = count; count += tmp; @@ -281,7 +281,7 @@ class RandomSetter mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i]; --i; } - mp_target->innerIndexPtr()[i+1] = inner; + mp_target->innerIndexPtr()[i+1] = internal::convert_index(inner); mp_target->valuePtr()[i+1] = it->second.value; } } From 482c5fb321695f7992d3bb718b7f64f2feaf61d5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 22:52:15 +0100 Subject: [PATCH 238/295] bug #899: remove "rank-revealing" qualifier for SparseQR and warn that it is not always rank-revealing. --- Eigen/src/SparseQR/SparseQR.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 1a28389e8..d1fb96f5c 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -41,15 +41,16 @@ namespace internal { /** * \ingroup SparseQR_Module * \class SparseQR - * \brief Sparse left-looking rank-revealing QR factorization + * \brief Sparse left-looking QR factorization with numerical column pivoting * - * This class implements a left-looking rank-revealing QR decomposition - * of sparse matrices. When a column has a norm less than a given tolerance + * This class implements a left-looking QR decomposition of sparse matrices + * with numerical column pivoting. + * When a column has a norm less than a given tolerance * it is implicitly permuted to the end. The QR factorization thus obtained is * given by A*P = Q*R where R is upper triangular or trapezoidal. * * P is the column permutation which is the product of the fill-reducing and the - * rank-revealing permutations. 
Use colsPermutation() to get it. + * numerical permutations. Use colsPermutation() to get it. * * Q is the orthogonal matrix represented as products of Householder reflectors. * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint. @@ -64,6 +65,17 @@ namespace internal { * * \implsparsesolverconcept * + * The numerical pivoting strategy and default threshold are the same as in SuiteSparse QR, and + * detailed in the following paper: + * + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011. + * + * Even though it is qualified as "rank-revealing", this strategy might fail for some + * rank deficient problems. When this class is used to solve linear or least-squares problems + * it is thus strongly recommended to check the accuracy of the computed solution. If it + * failed, it usually helps to increase the threshold with setPivotThreshold. + * * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix. * @@ -331,7 +343,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) m_R.resize(m, n); m_Q.resize(m, diagSize); - // Allocate space for nonzero elements : rough estimation + // Allocate space for nonzero elements: rough estimation m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree m_Q.reserve(2*mat.nonZeros()); m_hcoeffs.resize(diagSize); From 3b5deeb546d4017b24846f5b0dc3296a50a039fe Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 19 Feb 2019 22:57:51 +0100 Subject: [PATCH 239/295] bug #899: make sparseqr unit test more stable by 1) trying with larger threshold and 2) relax rank computation for rank-deficient problems.
--- test/sparseqr.cpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp index 3ffe62314..3576cc626 100644 --- a/test/sparseqr.cpp +++ b/test/sparseqr.cpp @@ -43,6 +43,7 @@ int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows template void test_sparseqr_scalar() { + typedef typename NumTraits::Real RealScalar; typedef SparseMatrix MatrixType; typedef Matrix DenseMat; typedef Matrix DenseVector; @@ -91,14 +92,34 @@ template void test_sparseqr_scalar() exit(0); return; } - - VERIFY_IS_APPROX(A * x, b); - - //Compare with a dense QR solver + + // Compare with a dense QR solver ColPivHouseholderQR dqr(dA); refX = dqr.solve(b); - VERIFY_IS_EQUAL(dqr.rank(), solver.rank()); + bool rank_deficient = A.cols()>A.rows() || dqr.rank() we might have to increase the threshold + // to get a correct solution. + RealScalar th = RealScalar(20)*dA.colwise().norm().maxCoeff()*(A.rows()+A.cols()) * NumTraits::epsilon(); + for(Index k=0; (k<16) && !test_isApprox(A*x,b); ++k) + { + th *= RealScalar(10); + solver.setPivotThreshold(th); + solver.compute(A); + x = solver.solve(b); + } + } + + VERIFY_IS_APPROX(A * x, b); + + // For rank deficient problem, the estimated rank might + // be slightly off, so let's only raise a warning in such cases. + if(rank_deficient) ++g_test_level; + VERIFY_IS_EQUAL(solver.rank(), dqr.rank()); + if(rank_deficient) --g_test_level; + if(solver.rank()==A.cols()) // full rank VERIFY_IS_APPROX(x, refX); // else From edd413c184325eab482a82f68b4308eb2b4f4f9f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Feb 2019 13:52:11 +0100 Subject: [PATCH 240/295] bug #1409: make EIGEN_MAKE_ALIGNED_OPERATOR_NEW* macros empty in c++17 mode: - this helps clang 5 and 6 to support alignas in STL's containers. 
- this makes the public API of our (and users) classes cleaner --- Eigen/src/Core/util/Macros.h | 17 +++++++++++++++++ Eigen/src/Core/util/Memory.h | 13 +++++++++++++ test/dynalloc.cpp | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index fd3e43fa0..ffd6a00ca 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -698,6 +698,23 @@ #endif #endif +// NOTE: the required Apple's clang version is very conservative +// and it could be that XCode 9 works just fine. +// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support +// and not tested. +#ifndef EIGEN_HAS_CXX17_OVERALIGN +#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \ + (EIGEN_COMP_MSVC >= 1912) \ + || (EIGEN_GNUC_AT_LEAST(7,0)) \ + || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500)) \ + || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \ + ) +#define EIGEN_HAS_CXX17_OVERALIGN 1 +#else +#define EIGEN_HAS_CXX17_OVERALIGN 0 +#endif +#endif + #if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules #if defined(__NVCC__) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a135761d6..efd747217 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -789,6 +789,17 @@ template void swap(scoped_array &a,scoped_array &b) *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** *****************************************************************************/ +#if EIGEN_HAS_CXX17_OVERALIGN + +// C++17 -> no need to bother about alignment anymore :) + +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW +#define 
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) + +#else + #if EIGEN_MAX_ALIGN_BYTES!=0 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ @@ -831,6 +842,8 @@ template void swap(scoped_array &a,scoped_array &b) ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \ ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0)) ))) +#endif + /****************************************************************************/ /** \class aligned_allocator diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index 1c74866ba..23c90a7b5 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -107,7 +107,7 @@ template void check_custom_new_delete() delete[] t; } -#if EIGEN_MAX_ALIGN_BYTES>0 +#if EIGEN_MAX_ALIGN_BYTES>0 && (!EIGEN_HAS_CXX17_OVERALIGN) { T* t = static_cast((T::operator new)(sizeof(T))); (T::operator delete)(t, sizeof(T)); From 844e5447f8dd28989a58cd33e357bb68bc175c2a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Feb 2019 13:54:04 +0100 Subject: [PATCH 241/295] Update documentation regarding alignment issue. 
--- doc/StlContainers.dox | 41 ++++++++++------ doc/StructHavingEigenMembers.dox | 81 ++++++++++++++++++-------------- doc/UnalignedArrayAssert.dox | 40 +++++++++------- 3 files changed, 95 insertions(+), 67 deletions(-) diff --git a/doc/StlContainers.dox b/doc/StlContainers.dox index e0f8714a9..665a54793 100644 --- a/doc/StlContainers.dox +++ b/doc/StlContainers.dox @@ -6,31 +6,39 @@ namespace Eigen { \section StlContainers_summary Executive summary -Using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires taking the following two steps: +If you're compiling in \cpp17 mode only with a sufficiently recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then everything is taken care of by the compiler and you can stop reading. -\li A 16-byte-aligned allocator must be used. Eigen does provide one ready for use: aligned_allocator. -\li If you want to use the std::vector container, you need to \#include <Eigen/StdVector>. +Otherwise, using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires the use of an over-aligned allocator. +That is, an allocator capable of allocating buffers with 16, 32, or even 64 bytes alignment. +%Eigen does provide one ready for use: aligned_allocator. -These issues arise only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member". For other Eigen types, such as Vector3f or MatrixXd, no special care is needed when using STL containers. +Prior to \cpp11, if you want to use the `std::vector` container, then you also have to `#include <Eigen/StdVector>`. + +These issues arise only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".
+For other %Eigen types, such as Vector3f or MatrixXd, no special care is needed when using STL containers. \section allocator Using an aligned allocator -STL containers take an optional template parameter, the allocator type. When using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you need tell the container to use an allocator that will always allocate memory at 16-byte-aligned locations. Fortunately, Eigen does provide such an allocator: Eigen::aligned_allocator. +STL containers take an optional template parameter, the allocator type. When using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you need to tell the container to use an allocator that will always allocate memory at 16-byte-aligned (or more) locations. Fortunately, %Eigen does provide such an allocator: Eigen::aligned_allocator. For example, instead of \code -std::map<int, Eigen::Vector4f> +std::map<int, Eigen::Vector4d> \endcode you need to use \code -std::map<int, Eigen::Vector4f, std::less<int>, - Eigen::aligned_allocator<std::pair<const int, Eigen::Vector4f> > > +std::map<int, Eigen::Vector4d, std::less<int>, + Eigen::aligned_allocator<std::pair<const int, Eigen::Vector4d> > > \endcode -Note that the third parameter "std::less<int>" is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type. +Note that the third parameter `std::less<int>` is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type. \section StlContainers_vector The case of std::vector -The situation with std::vector was even worse (explanation below) so we had to specialize it for the Eigen::aligned_allocator type. In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>. +This section is for c++98/03 users only. \cpp11 (or above) users can stop reading here. + +So in c++98/03, the situation with `std::vector` is more complicated because of a bug in the standard (explanation below).
+To work around the issue, we had to specialize it for the Eigen::aligned_allocator type. +In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>. Here is an example: \code @@ -39,12 +47,16 @@ Here is an example: std::vector<Eigen::Vector4f,Eigen::aligned_allocator<Eigen::Vector4f> > \endcode +\b Explanation: The `resize()` method of `std::vector` takes a `value_type` argument (defaulting to `value_type()`). So with `std::vector`, some Eigen::Vector4d objects will be passed by value, which discards any alignment modifiers, so an Eigen::Vector4d can be created at an unaligned location. +In order to avoid that, the only solution we saw was to specialize `std::vector` to make it work on a slight modification of, here, Eigen::Vector4d, that is able to deal properly with this situation. + + \subsection vector_spec An alternative - specializing std::vector for Eigen types As an alternative to the recommended approach described above, you have the option to specialize std::vector for Eigen types requiring alignment. -The advantage is that you won't need to declare std::vector all over with Eigen::allocator. One drawback on the other hand side is that -the specialization needs to be defined before all code pieces in which e.g. std::vector is used. Otherwise, without knowing the specialization -the compiler will compile that particular instance with the default std::allocator and you program is most likely to crash. +The advantage is that you won't need to declare std::vector all over with Eigen::aligned_allocator. One drawback, on the other hand, is that +the specialization needs to be defined before all code pieces in which e.g. `std::vector` is used. Otherwise, without knowing the specialization +the compiler will compile that particular instance with the default `std::allocator` and your program is most likely to crash.
Here is an example: \code @@ -54,8 +66,7 @@ EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Matrix2d) std::vector \endcode -\b Explanation: The resize() method of std::vector takes a value_type argument (defaulting to value_type()). So with std::vector, some Eigen::Vector4f objects will be passed by value, which discards any alignment modifiers, so a Eigen::Vector4f can be created at an unaligned location. In order to avoid that, the only solution we saw was to specialize std::vector to make it work on a slight modification of, here, Eigen::Vector4f, that is able to deal properly with this situation. - + */ diff --git a/doc/StructHavingEigenMembers.dox b/doc/StructHavingEigenMembers.dox index 7fbed0eb0..87016cdc9 100644 --- a/doc/StructHavingEigenMembers.dox +++ b/doc/StructHavingEigenMembers.dox @@ -6,7 +6,12 @@ namespace Eigen { \section StructHavingEigenMembers_summary Executive Summary -If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-bytes-aligned pointers. Fortunately, %Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you. + +If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must ensure that calling operator new on it allocates properly aligned buffers. +If you're compiling in \cpp17 mode only with a sufficiently recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then everything is taken care of by the compiler and you can stop reading. + +Otherwise, you have to overload its `operator new` so that it generates properly aligned pointers (e.g., 32-byte-aligned for Vector4d and AVX). +Fortunately, %Eigen provides you with a macro `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` that does that for you. \section StructHavingEigenMembers_what What kind of code needs to be changed?
@@ -29,13 +34,13 @@ In other words: you have a class that has as a member a \ref TopicFixedSizeVecto \section StructHavingEigenMembers_how How should such code be modified? -Very easy, you just need to put a EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro in a public part of your class, like this: +Very easy, you just need to put a `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` macro in a public part of your class, like this: \code class Foo { ... - Eigen::Vector2d v; + Eigen::Vector4d v; ... public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW @@ -46,7 +51,9 @@ public: Foo *foo = new Foo; \endcode -This macro makes "new Foo" always return an aligned pointer. +This macro makes `new Foo` always return an aligned pointer. + +In \cpp17, this macro is empty. If this approach is too intrusive, see also the \ref StructHavingEigenMembers_othersolutions "other solutions". @@ -58,7 +65,7 @@ OK let's say that your code looks like this: class Foo { ... - Eigen::Vector2d v; + Eigen::Vector4d v; ... }; @@ -67,45 +74,59 @@ class Foo Foo *foo = new Foo; \endcode -A Eigen::Vector2d consists of 2 doubles, which is 128 bits. Which is exactly the size of a SSE packet, which makes it possible to use SSE for all sorts of operations on this vector. But SSE instructions (at least the ones that %Eigen uses, which are the fast ones) require 128-bit alignment. Otherwise you get a segmentation fault. +A Eigen::Vector4d consists of 4 doubles, which is 256 bits. +This is exactly the size of an AVX register, which makes it possible to use AVX for all sorts of operations on this vector. +But AVX instructions (at least the ones that %Eigen uses, which are the fast ones) require 256-bit alignment. +Otherwise you get a segmentation fault. -For this reason, Eigen takes care by itself to require 128-bit alignment for Eigen::Vector2d, by doing two things: -\li Eigen requires 128-bit alignment for the Eigen::Vector2d's array (of 2 doubles). With GCC, this is done with a __attribute__ ((aligned(16))). 
-\li Eigen overloads the "operator new" of Eigen::Vector2d so it will always return 128-bit aligned pointers. +For this reason, %Eigen takes care by itself to require 256-bit alignment for Eigen::Vector4d, by doing two things: +\li %Eigen requires 256-bit alignment for the Eigen::Vector4d's array (of 4 doubles). With \cpp11 this is done with the alignas keyword, or compiler's extensions for c++98/03. +\li %Eigen overloads the `operator new` of Eigen::Vector4d so it will always return 256-bit aligned pointers. (removed in \cpp17) -Thus, normally, you don't have to worry about anything, Eigen handles alignment for you... +Thus, normally, you don't have to worry about anything, %Eigen handles alignment of operator new for you... -... except in one case. When you have a class Foo like above, and you dynamically allocate a new Foo as above, then, since Foo doesn't have aligned "operator new", the returned pointer foo is not necessarily 128-bit aligned. +... except in one case. When you have a `class Foo` like above, and you dynamically allocate a new `Foo` as above, then, since `Foo` doesn't have aligned `operator new`, the returned pointer foo is not necessarily 256-bit aligned. -The alignment attribute of the member v is then relative to the start of the class, foo. If the foo pointer wasn't aligned, then foo->v won't be aligned either! +The alignment attribute of the member `v` is then relative to the start of the class `Foo`. If the `foo` pointer wasn't aligned, then `foo->v` won't be aligned either! -The solution is to let class Foo have an aligned "operator new", as we showed in the previous section. +The solution is to let `class Foo` have an aligned `operator new`, as we showed in the previous section. + +This explanation also holds for SSE/NEON/MSA/Altivec/VSX targets, which require 16-bytes alignment, and AVX512 which requires 64-bytes alignment for fixed-size objects multiple of 64 bytes (e.g., Eigen::Matrix4d). 
\section StructHavingEigenMembers_movetotop Should I then put all the members of Eigen types at the beginning of my class? -That's not required. Since Eigen takes care of declaring 128-bit alignment, all members that need it are automatically 128-bit aligned relatively to the class. So code like this works fine: +That's not required. Since %Eigen takes care of declaring adequate alignment, all members that need it are automatically aligned relatively to the class. So code like this works fine: \code class Foo { double x; - Eigen::Vector2d v; + Eigen::Vector4d v; public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW }; \endcode +That said, as usual, it is recommended to sort the members so that alignment does not waste memory. +In the above example, with AVX, the compiler will have to reserve 24 empty bytes between `x` and `v`. + + \section StructHavingEigenMembers_dynamicsize What about dynamic-size matrices and vectors? Dynamic-size matrices and vectors, such as Eigen::VectorXd, allocate dynamically their own array of coefficients, so they take care of requiring absolute alignment automatically. So they don't cause this issue. The issue discussed here is only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable matrices and vectors". + \section StructHavingEigenMembers_bugineigen So is this a bug in Eigen? -No, it's not our bug. It's more like an inherent problem of the C++98 language specification, and seems to be taken care of in the upcoming language revision: see this document. +No, it's not our bug. It's more like an inherent problem of the c++ language specification that has been solved in c++17 through the feature known as dynamic memory allocation for over-aligned data. -\section StructHavingEigenMembers_conditional What if I want to do this conditionnally (depending on template parameters) ? -For this situation, we offer the macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign). 
It will generate aligned operators like EIGEN_MAKE_ALIGNED_OPERATOR_NEW if NeedsToAlign is true. It will generate operators with the default alignment if NeedsToAlign is false. +\section StructHavingEigenMembers_conditional What if I want to do this conditionally (depending on template parameters) ? + +For this situation, we offer the macro `EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)`. +It will generate aligned operators like `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` if `NeedsToAlign` is true. +It will generate operators with the default alignment if `NeedsToAlign` is false. +In \cpp17, this macro is empty. Example: @@ -130,7 +151,7 @@ Foo<3> *foo3 = new Foo<3>; // foo3 has only the system default alignment guarant \section StructHavingEigenMembers_othersolutions Other solutions -In case putting the EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro everywhere is too intrusive, there exists at least two other solutions. +In case putting the `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` macro everywhere is too intrusive, there exists at least two other solutions. \subsection othersolutions1 Disabling alignment @@ -139,22 +160,13 @@ The first is to disable alignment requirement for the fixed size members: class Foo { ... - Eigen::Matrix v; + Eigen::Matrix v; ... }; \endcode -This has for effect to disable vectorization when using \c v. -If a function of Foo uses it several times, then it still possible to re-enable vectorization by copying it into an aligned temporary vector: -\code -void Foo::bar() -{ - Eigen::Vector2d av(v); - // use av instead of v - ... - // if av changed, then do: - v = av; -} -\endcode +This `v` is fully compatible with aligned Eigen::Vector4d. +This has only for effect to make load/stores to `v` more expensive (usually slightly, but that's hardware dependent). 
+ \subsection othersolutions2 Private structure @@ -164,7 +176,7 @@ The second consist in storing the fixed-size objects into a private struct which struct Foo_d { EIGEN_MAKE_ALIGNED_OPERATOR_NEW - Vector2d v; + Vector4d v; ... }; @@ -183,7 +195,8 @@ private: }; \endcode -The clear advantage here is that the class Foo remains unchanged regarding alignment issues. The drawback is that a heap allocation will be required whatsoever. +The clear advantage here is that the class `Foo` remains unchanged regarding alignment issues. +The drawback is that an additional heap allocation will be required whatsoever. */ diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox index 8676faa1b..410c8a58f 100644 --- a/doc/UnalignedArrayAssert.dox +++ b/doc/UnalignedArrayAssert.dox @@ -12,7 +12,9 @@ is explained here: http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArr **** READ THIS WEB PAGE !!! ****"' failed. -There are 4 known causes for this issue. Please read on to understand them and learn how to fix them. +There are 4 known causes for this issue. +If you can target \cpp17 only with a recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then you're lucky: enabling c++17 should be enough (if not, please report to us). +Otherwise, please read on to understand those issues and learn how to fix them. \eigenAutoToc @@ -35,7 +37,7 @@ If you have code like this, class Foo { //... - Eigen::Vector2d v; + Eigen::Vector4d v; //... }; //... @@ -44,27 +46,27 @@ Foo *foo = new Foo; then you need to read this separate page: \ref TopicStructHavingEigenMembers "Structures Having Eigen Members". -Note that here, Eigen::Vector2d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types". +Note that here, Eigen::Vector4d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types". 
\section c2 Cause 2: STL Containers or manual memory allocation If you use STL Containers such as std::vector, std::map, ..., with %Eigen objects, or with classes containing %Eigen objects, like this, \code -std::vector my_vector; -struct my_class { ... Eigen::Matrix2f m; ... }; +std::vector my_vector; +struct my_class { ... Eigen::Matrix2d m; ... }; std::map my_map; \endcode then you need to read this separate page: \ref TopicStlContainers "Using STL Containers with Eigen". -Note that here, Eigen::Matrix2f is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member". +Note that here, Eigen::Matrix2d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member". -The same issue will be exhibited by any classes/functions by-passing operator new to allocate memory, that is, by performing custom memory allocation followed by calls to the placement new operator. This is for instance typically the case of \c std::make_shared or \c std::allocate_shared for which is the solution is to use an \ref aligned_allocator "aligned allocator" as detailed in the \ref TopicStlContainers "solution for STL containers". +The same issue will be exhibited by any classes/functions by-passing operator new to allocate memory, that is, by performing custom memory allocation followed by calls to the placement new operator. This is for instance typically the case of \c `std::make_shared` or `std::allocate_shared` for which is the solution is to use an \ref aligned_allocator "aligned allocator" as detailed in the \ref TopicStlContainers "solution for STL containers". 
\section c3 Cause 3: Passing Eigen objects by value -If some function in your code is getting an Eigen object passed by value, like this, +If some function in your code is getting an %Eigen object passed by value, like this, \code void func(Eigen::Vector4d v); @@ -90,11 +92,13 @@ then you need to read this separate page: \ref TopicWrongStackAlignment "Compile Note that here, Eigen::Quaternionf is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types". + \section explanation General explanation of this assertion -\ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" must absolutely be created at 16-byte-aligned locations, otherwise SIMD instructions addressing them will crash. +\ref TopicFixedSizeVectorizable "Fixed-size vectorizable Eigen objects" must absolutely be created at properly aligned locations, otherwise SIMD instructions addressing them will crash. +For instance, SSE/NEON/MSA/Altivec/VSX targets will require 16-byte-alignment, whereas AVX and AVX512 targets may require up to 32 and 64 byte alignment respectively. -Eigen normally takes care of these alignment issues for you, by setting an alignment attribute on them and by overloading their "operator new". +%Eigen normally takes care of these alignment issues for you, by setting an alignment attribute on them and by overloading their `operator new`. However there are a few corner cases where these alignment settings get overridden: they are the possible causes for this assertion. @@ -102,22 +106,22 @@ However there are a few corner cases where these alignment settings get overridd Three possibilities:
    -
  • Use the \c DontAlign option to Matrix, Array, Quaternion, etc. objects that gives you trouble. This way Eigen won't try to align them, and thus won"t assume any special alignment. On the down side, you will pay the cost of unaligned loads/stores for them, but on modern CPUs, the overhead is either null or marginal. See \link StructHavingEigenMembers_othersolutions here \endlink for an example.
  • -
  • Define \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN_STATICALLY \endlink. That disables all 16-byte (and above) static alignment code, while keeping 16-byte (or above) heap alignment. This has the effect of +
  • Use the \c DontAlign option to Matrix, Array, Quaternion, etc. objects that give you trouble. This way %Eigen won't try to over-align them, and thus won't assume any special alignment. On the down side, you will pay the cost of unaligned loads/stores for them, but on modern CPUs, the overhead is either null or marginal. See \link StructHavingEigenMembers_othersolutions here \endlink for an example.
  • +
  • Define \link TopicPreprocessorDirectivesPerformance EIGEN_MAX_STATIC_ALIGN_BYTES \endlink to 0. That disables all 16-byte (and above) static alignment code, while keeping 16-byte (or above) heap alignment. This has the effect of vectorizing fixed-size objects (like Matrix4d) through unaligned stores (as controlled by \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink), while keeping unchanged the vectorization of dynamic-size objects - (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of static alignment.
  • -
  • Or define both \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_VECTORIZE \endlink and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the - 16-byte alignment code and thus preserves ABI compatibility, but completely disables vectorization.
  • + (like MatrixXd). On 64 bytes systems, you might also define it to 16 to disable only 32 and 64 bytes of over-alignment. But do note that this breaks ABI compatibility with the default behavior of static alignment. +
  • Or define both \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_VECTORIZE \endlink and `EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT`. This keeps the + 16-byte (or above) alignment code and thus preserves ABI compatibility, but completely disables vectorization.
-If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable 16-byte alignment and the assertion, here's the explanation: +If you want to know why defining `EIGEN_DONT_VECTORIZE` does not by itself disable 16-byte (or above) alignment and the assertion, here's the explanation: It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization. -It doesn't disable 16-byte alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path. +It doesn't disable 16-byte (or above) alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path. \section checkmycode How can I check my code is safe regarding alignment issues? -Unfortunately, there is no possibility in C++ to detect any of the aforementioned shortcoming at compile time (though static analysers are becoming more and more powerful and could detect some of them). +Unfortunately, there is no possibility in c++ to detect any of the aforementioned shortcoming at compile time (though static analyzers are becoming more and more powerful and could detect some of them). Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page. Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. 
For instance, on most 64 bits systems buffer are aligned on 16 bytes boundary and so, if you do not enable AVX instruction set, then your code will run fine. On the other hand, the same code may assert if moving to a more exotic platform, or enabling AVX instructions that required 32 bytes alignment by default. From 4e8047cdcf9a527fa1a0178c370e600eefbcb8b7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Feb 2019 13:59:34 +0100 Subject: [PATCH 242/295] Fix compilation with gcc and remove TR1 stuff. --- test/sparse.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/test/sparse.h b/test/sparse.h index 7d753fdba..df471b4e2 100644 --- a/test/sparse.h +++ b/test/sparse.h @@ -16,11 +16,6 @@ #if EIGEN_HAS_CXX11 -#include -#define EIGEN_UNORDERED_MAP_SUPPORT - -#elif EIGEN_GNUC_AT_LEAST(4,0) && !defined __ICC && !defined(__clang__) - #ifdef min #undef min #endif @@ -29,11 +24,9 @@ #undef max #endif -#include +#include #define EIGEN_UNORDERED_MAP_SUPPORT -namespace std { - using std::tr1::unordered_map; -} + #endif #ifdef EIGEN_GOOGLEHASH_SUPPORT From 44b54fa4a3618922e06abdca9c555b8697698237 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Feb 2019 14:43:05 +0100 Subject: [PATCH 243/295] Protect c++11 type alias with Eigen's macro, and add respective unit test. 
--- Eigen/src/Core/Array.h | 4 ++-- Eigen/src/Core/Matrix.h | 4 ++-- test/CMakeLists.txt | 1 + test/type_alias.cpp | 43 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 test/type_alias.cpp diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 039f41a3d..a85d5084b 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -350,7 +350,7 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES #undef EIGEN_MAKE_ARRAY_TYPEDEFS -#if __cplusplus>=201103L +#if EIGEN_HAS_CXX11 #define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ /** \ingroup matrixtypedefs */ \ @@ -379,7 +379,7 @@ EIGEN_MAKE_FIXED_TYPEDEFS(4) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS -#endif // __cplusplus>=201103L +#endif // EIGEN_HAS_CXX11 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ using Eigen::Matrix##SizeSuffix##TypeSuffix; \ diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index ea5f7da78..83cffc8e6 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -499,7 +499,7 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS -#if __cplusplus>=201103L +#if EIGEN_HAS_CXX11 #define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ /** \ingroup matrixtypedefs */ \ @@ -531,7 +531,7 @@ EIGEN_MAKE_FIXED_TYPEDEFS(4) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS -#endif // __cplusplus>=201103L +#endif // EIGEN_HAS_CXX11 } // end namespace Eigen diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3dbb426eb..f74e22c28 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -164,6 +164,7 @@ ei_add_test(sizeof) ei_add_test(dynalloc) ei_add_test(nomalloc) ei_add_test(first_aligned) +ei_add_test(type_alias) ei_add_test(nullary) ei_add_test(mixingtypes) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") diff --git a/test/type_alias.cpp 
b/test/type_alias.cpp new file mode 100644 index 000000000..f9b0efc5d --- /dev/null +++ b/test/type_alias.cpp @@ -0,0 +1,43 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +EIGEN_DECLARE_TEST(type_alias) +{ + using namespace internal; + + // To warm up, some basic checks: + STATIC_CHECK((is_same >::value)); + STATIC_CHECK((is_same >::value)); + STATIC_CHECK((is_same >::value)); + +#if EIGEN_HAS_CXX11 + + STATIC_CHECK((is_same, MatrixXd>::value)); + STATIC_CHECK((is_same, MatrixXi>::value)); + STATIC_CHECK((is_same, Matrix2i>::value)); + STATIC_CHECK((is_same, Matrix2Xf>::value)); + STATIC_CHECK((is_same, MatrixX4d>::value)); + STATIC_CHECK((is_same, VectorXi>::value)); + STATIC_CHECK((is_same, Vector2f>::value)); + STATIC_CHECK((is_same, RowVectorXi>::value)); + STATIC_CHECK((is_same, RowVector2f>::value)); + + STATIC_CHECK((is_same, ArrayXXf>::value)); + STATIC_CHECK((is_same, Array33i>::value)); + STATIC_CHECK((is_same, Array2Xf>::value)); + STATIC_CHECK((is_same, ArrayX4d>::value)); + STATIC_CHECK((is_same, ArrayXd>::value)); + STATIC_CHECK((is_same, Array4d>::value)); + +#else + std::cerr << "WARNING: c++11 type aliases not tested.\n"; +#endif +} From 475295b5ff0c030fbed3127e2deaee77747d9833 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Feb 2019 15:18:07 +0100 Subject: [PATCH 244/295] Enable documentation of Array's typedefs --- doc/Doxyfile.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 5671986b1..72120f1f1 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1612,6 +1612,9 @@ PREDEFINED = EIGEN_EMPTY_STRUCT \ EXPAND_AS_DEFINED = EIGEN_MAKE_TYPEDEFS \ EIGEN_MAKE_FIXED_TYPEDEFS \ 
EIGEN_MAKE_TYPEDEFS_ALL_SIZES \ + EIGEN_MAKE_ARRAY_TYPEDEFS \ + EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS \ + EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES \ EIGEN_CWISE_UNOP_RETURN_TYPE \ EIGEN_CWISE_BINOP_RETURN_TYPE \ EIGEN_CURRENT_STORAGE_BASE_CLASS \ From 302377110a26d88a9e0bad8322324227eda2e1c4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Feb 2019 15:18:48 +0100 Subject: [PATCH 245/295] Update documentation of Matrix and Array type aliases. --- Eigen/src/Core/Array.h | 43 ++++++++++++++++++++++++++--------------- Eigen/src/Core/Matrix.h | 14 +++++++++++++- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index a85d5084b..ee12d96fc 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -304,7 +304,7 @@ class Array /** \defgroup arraytypedefs Global array typedefs * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common 1D and 2D array types. + * %Eigen defines several typedef shortcuts for most common 1D and 2D array types. * * The general patterns are the following: * @@ -317,6 +317,12 @@ class Array * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is * a fixed-size 1D array of 4 complex floats. * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `ArrayRowsCols` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size. + * - `ArraySize` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays. 
+ * * \sa class Array */ @@ -349,35 +355,40 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES #undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS #if EIGEN_HAS_CXX11 -#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ -/** \ingroup matrixtypedefs */ \ +#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ template \ using Array##SizeSuffix##SizeSuffix = Array; \ -/** \ingroup matrixtypedefs */ \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ template \ using Array##SizeSuffix = Array; -#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ -/** \ingroup matrixtypedefs */ \ +#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ template \ using Array##Size##X = Array; \ -/** \ingroup matrixtypedefs */ \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ template \ using Array##X##Size = Array; -EIGEN_MAKE_TYPEDEFS(2, 2) -EIGEN_MAKE_TYPEDEFS(3, 3) -EIGEN_MAKE_TYPEDEFS(4, 4) -EIGEN_MAKE_TYPEDEFS(Dynamic, X) -EIGEN_MAKE_FIXED_TYPEDEFS(2) -EIGEN_MAKE_FIXED_TYPEDEFS(3) -EIGEN_MAKE_FIXED_TYPEDEFS(4) +EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2) +EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3) +EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4) +EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4) -#undef EIGEN_MAKE_TYPEDEFS -#undef EIGEN_MAKE_FIXED_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS #endif // EIGEN_HAS_CXX11 diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 83cffc8e6..e7df4a901 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -450,7 +450,7 @@ class Matrix * * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common matrix and vector types. + * %Eigen defines several typedef shortcuts for most common matrix and vector types. 
* * The general patterns are the following: * @@ -462,6 +462,13 @@ class Matrix * * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is * a fixed-size vector of 4 complex floats. + * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `MatrixSize` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size. + * - `MatrixXSize`and `MatrixSizeX` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices. + * - `VectorSize` and `RowVectorSize` for column and row vectors. * * \sa class Matrix */ @@ -503,20 +510,25 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ /** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ template \ using Matrix##SizeSuffix = Matrix; \ /** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ template \ using Vector##SizeSuffix = Matrix; \ /** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ template \ using RowVector##SizeSuffix = Matrix; #define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ /** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ template \ using Matrix##Size##X = Matrix; \ /** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ template \ using Matrix##X##Size = Matrix; From 2a39659d793fcde656593bbf01948bc0bd568181 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Feb 2019 15:23:23 +0100 Subject: [PATCH 246/295] Add fully generic Vector and RowVector type aliases. 
--- Eigen/src/Core/Matrix.h | 14 +++++++++++++- test/type_alias.cpp | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index e7df4a901..4b714328c 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -467,8 +467,10 @@ class Matrix * They follow the same pattern as above except that the scalar type suffix is replaced by a * template parameter, i.e.: * - `MatrixSize` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size. - * - `MatrixXSize`and `MatrixSizeX` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices. + * - `MatrixXSize` and `MatrixSizeX` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices. * - `VectorSize` and `RowVectorSize` for column and row vectors. + * + * With \cpp11, you can also use fully generic column and row vector types: `Vector` and `RowVector`. * * \sa class Matrix */ @@ -540,6 +542,16 @@ EIGEN_MAKE_FIXED_TYPEDEFS(2) EIGEN_MAKE_FIXED_TYPEDEFS(3) EIGEN_MAKE_FIXED_TYPEDEFS(4) +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using Vector = Matrix; + +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using RowVector = Matrix; + #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS diff --git a/test/type_alias.cpp b/test/type_alias.cpp index f9b0efc5d..9a6616c72 100644 --- a/test/type_alias.cpp +++ b/test/type_alias.cpp @@ -29,7 +29,7 @@ EIGEN_DECLARE_TEST(type_alias) STATIC_CHECK((is_same, Vector2f>::value)); STATIC_CHECK((is_same, RowVectorXi>::value)); STATIC_CHECK((is_same, RowVector2f>::value)); - + STATIC_CHECK((is_same, ArrayXXf>::value)); STATIC_CHECK((is_same, Array33i>::value)); STATIC_CHECK((is_same, Array2Xf>::value)); @@ -37,6 +37,11 @@ EIGEN_DECLARE_TEST(type_alias) STATIC_CHECK((is_same, ArrayXd>::value)); STATIC_CHECK((is_same, Array4d>::value)); + STATIC_CHECK((is_same, Vector3f>::value)); + STATIC_CHECK((is_same, VectorXi>::value)); + 
STATIC_CHECK((is_same, RowVector3f>::value)); + STATIC_CHECK((is_same, RowVectorXi>::value)); + #else std::cerr << "WARNING: c++11 type aliases not tested.\n"; #endif From 4d7f31710299fd869def962f2070c252ae1aaa67 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 21 Feb 2019 13:32:13 -0800 Subject: [PATCH 247/295] Add a few missing packet ops: cmp_eq for NEON. pfloor for GPU. --- Eigen/src/Core/arch/GPU/PacketMath.h | 9 +++++++++ Eigen/src/Core/arch/NEON/Complex.h | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index c1b097fb9..cd4615a45 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -53,6 +53,7 @@ template<> struct packet_traits : default_packet_traits HasBetaInc = 1, HasBlend = 0, + HasFloor = 1, }; }; @@ -86,6 +87,7 @@ template<> struct packet_traits : default_packet_traits HasBetaInc = 1, HasBlend = 0, + HasFloor = 1, }; }; @@ -408,6 +410,13 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { return make_double2(fabs(a.x), fabs(a.y)); } +template<> EIGEN_DEVICE_FUNC inline float4 pfloor(const float4& a) { + return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pfloor(const double2& a) { + return make_double2(floor(a.x), floor(a.y)); +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { float tmp = kernel.packet[0].y; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index d149275b5..e9da4a3f6 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -101,6 +101,18 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con return Packet2cf(vaddq_f32(v1, v2)); } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + // Compare real and imaginary parts of a and b to get the mask 
vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])] + Packet4f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])] + Packet4f eq_swapped = vrev64q_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet2cf(pand(eq, eq_swapped)); +} + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); @@ -361,6 +373,18 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(vaddq_f64(v1, v2)); } +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) +{ + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a)==re(b), im(a)==im(b)] + Packet2d eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a)==im(b), re(a)==re(b)] + Packet2d eq_swapped = vrev64q_u32(eq); + // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cd(pand(eq, eq_swapped)); +} + template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); From 42c23f14acbfc9fbc00db7e34fcd39de60dfe4e2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Feb 2019 22:44:40 +0100 Subject: [PATCH 248/295] Speed up col/row-wise reverse for fixed size matrices by propagating compile-time sizes. 
--- Eigen/src/Core/Reverse.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 711dbcf9a..853093923 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -171,8 +171,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2; Index half = xpr.rows()/2; - xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse()); + xpr.topRows(fix(half)) + .swap(xpr.bottomRows(fix(half)).colwise().reverse()); } }; @@ -182,8 +184,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2; Index half = xpr.cols()/2; - xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse()); + xpr.leftCols(fix(half)) + .swap(xpr.rightCols(fix(half)).rowwise().reverse()); } }; From 32502f3c45a3d1753b87d3247989dad39cf131dd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Feb 2019 10:29:06 +0100 Subject: [PATCH 249/295] bug #1684: add simplified regression test for respective clang's bug (this also reveal the same bug in Apples's clang) --- test/array_reverse.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp index e23159def..b19a6b356 100644 --- a/test/array_reverse.cpp +++ b/test/array_reverse.cpp @@ -132,6 +132,28 @@ void array_reverse_extra() VERIFY(x.reverse() == y); } +// Simpler version of reverseInPlace leveraging a bug +// in clang 6/7 with -O2 and AVX or AVX512 enabled. +// This simpler version ensure that the clang bug is not hidden +// through mis-inlining of reverseInPlace or other minor changes. 
+template +EIGEN_DONT_INLINE +void bug1684_work(MatrixType& m1, MatrixType& m2) +{ + m2 = m1; + m2.col(0).swap(m2.col(3)); + m2.col(1).swap(m2.col(2)); +} + +template +void bug1684() +{ + Matrix4f m1 = Matrix4f::Random(); + Matrix4f m2 = Matrix4f::Random(); + bug1684_work(m1,m2); + VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval()); +} + EIGEN_DECLARE_TEST(array_reverse) { for(int i = 0; i < g_repeat; i++) { @@ -144,6 +166,7 @@ EIGEN_DECLARE_TEST(array_reverse) CALL_SUBTEST_7( reverse(MatrixXcd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_8( reverse(Matrix()) ); CALL_SUBTEST_9( reverse(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_3( bug1684<0>() ); } CALL_SUBTEST_3( array_reverse_extra<0>() ); } From 7e3084bb6f9a1dd404289d5cac2d5d9bb7ffda47 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Feb 2019 14:56:12 +0100 Subject: [PATCH 250/295] Fix compilation on ARM. 
--- Eigen/src/Core/arch/NEON/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index e9da4a3f6..f6c5c211c 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -380,7 +380,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packe Packet2d eq = pcmp_eq(a.v, b.v); // Swap real/imag elements in the mask in to get: // [im(a)==im(b), re(a)==re(b)] - Packet2d eq_swapped = vrev64q_u32(eq); + Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq))); // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped return Packet1cd(pand(eq, eq_swapped)); } From 1c09ee8541501c37eae05cebae36b417f5f1650a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Feb 2019 15:48:53 +0100 Subject: [PATCH 251/295] bug #1674: workaround clang fast-math aggressive optimizations --- .../Core/arch/Default/GenericPacketMathFunctions.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 693dd55ea..452b4c806 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -324,6 +324,14 @@ __attribute__((optimize("-fno-unsafe-math-optimizations"))) #endif Packet psincos_float(const Packet& _x) { +// Workaround -ffast-math aggressive optimizations +// See bug 1674 +#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE) +#define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X)); +#else +#define EIGEN_SINCOS_DONT_OPT(X) +#endif + typedef typename unpacket_traits::integer_packet PacketI; const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI @@ -338,6 +346,7 @@ Packet psincos_float(const Packet& _x) // Rounding trick: Packet y_round = padd(y, cst_rounding_magic); + 
EIGEN_SINCOS_DONT_OPT(y_round) PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi @@ -359,7 +368,9 @@ Packet psincos_float(const Packet& _x) // and 2 ULP up to: const float huge_th = ComputeSine ? 25966.f : 18838.f; x = pmadd(y, pset1(-1.5703125), x); // = 0xbfc90000 + EIGEN_SINCOS_DONT_OPT(x) x = pmadd(y, pset1(-0.000483989715576171875), x); // = 0xb9fdc000 + EIGEN_SINCOS_DONT_OPT(x) x = pmadd(y, pset1(1.62865035235881805419921875e-07), x); // = 0x342ee000 x = pmadd(y, pset1(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee @@ -436,6 +447,8 @@ Packet psincos_float(const Packet& _x) // Update the sign and filter huge inputs return pxor(y, sign_bit); + +#undef EIGEN_SINCOS_DONT_OPT } template From cca6c207f42e8706ee581bd67b091e55327cbaca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Feb 2019 17:18:28 +0100 Subject: [PATCH 252/295] AVX512: implement faster ploadquad thus speeding up GEMM --- Eigen/src/Core/arch/AVX512/PacketMath.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 3842f576b..c111fd7f0 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -526,13 +526,11 @@ EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} template <> EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { - Packet16f tmp = _mm512_undefined_ps(); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3); - return tmp; + Packet16f tmp = _mm512_castps128_ps512(pload(from)); + const Packet16i scatter_mask = 
_mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0); + return _mm512_permutexvar_ps(scatter_mask, tmp); } + // Loads 2 doubles from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1} template <> From 0cb4ba98e705a81e8d1ff1455ee447670e2ed72f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 21 Feb 2019 17:19:36 +0100 Subject: [PATCH 253/295] update wrt recent changes --- bench/perf_monitoring/changesets.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bench/perf_monitoring/changesets.txt b/bench/perf_monitoring/changesets.txt index c6b364593..647825c0f 100644 --- a/bench/perf_monitoring/changesets.txt +++ b/bench/perf_monitoring/changesets.txt @@ -90,4 +90,5 @@ before-evaluators 11683:2ea2960f1c7f # Make code compile again for older compilers. 11753:556fb4ceb654 # Bug: 1633: refactor gebp kernel and optimize for neon 11761:cefc1ba05596 # Bug 1661: fix regression in GEBP and AVX512 -11763:1e41e70fe97b # GEBP: cleanup logic to choose between
a 4 packets of 1 packet (=209bf81aa3f3+fix) \ No newline at end of file +11763:1e41e70fe97b # GEBP: cleanup logic to choose between
a 4 packets of 1 packet (=209bf81aa3f3+fix) +11803:d95b5d78598b # gebp: Add new ½ and ¼ packet rows per (peeling) round on the lhs \ No newline at end of file From 1dc1677d525b1df3cc32b3df5cd816e150b07185 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 22 Feb 2019 12:33:57 -0800 Subject: [PATCH 254/295] Change licensing of OrderingMethods/Amd.h and SparseCholesky/SimplicialCholesky_impl.h from LGPL to MPL2. Google LLC executed a license agreement with the author of the code from which these files are derived to allow the Eigen project to distribute the code and derived works under MPL2. --- Eigen/src/OrderingMethods/Amd.h | 24 ++++-------- .../SparseCholesky/SimplicialCholesky_impl.h | 39 ++++--------------- 2 files changed, 14 insertions(+), 49 deletions(-) diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h index f91ecb24e..7ca3f33b1 100644 --- a/Eigen/src/OrderingMethods/Amd.h +++ b/Eigen/src/OrderingMethods/Amd.h @@ -2,32 +2,22 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* - NOTE: this routine has been adapted from the CSparse library: Copyright (c) 2006, Timothy A. Davis. http://www.suitesparse.com -CSparse is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -CSparse is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. 
- -You should have received a copy of the GNU Lesser General Public -License along with this Module; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - +The author of CSparse, Timothy A. Davis., has executed a license with Google LLC +to permit distribution of this code and derivative works as part of Eigen under +the Mozilla Public License v. 2.0, as stated at the top of this file. */ -#include "../Core/util/NonMPL2.h" - #ifndef EIGEN_SPARSE_AMD_H #define EIGEN_SPARSE_AMD_H diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 0aa92f8bc..7275db2cc 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -2,46 +2,21 @@ // for linear algebra. // // Copyright (C) 2008-2012 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* - NOTE: these functions have been adapted from the LDL library: LDL Copyright (c) 2005 by Timothy A. Davis. All Rights Reserved. -LDL License: - - Your use or distribution of LDL or any modified version of - LDL implies that you agree to this License. - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 - USA - - Permission is hereby granted to use or copy this program under the - terms of the GNU LGPL, provided that the Copyright, this License, - and the Availability of the original version is retained on all copies. - User documentation of any code that uses this code or any modified - version of this code must cite the Copyright, this License, the - Availability note, and "Used by permission." Permission to modify - the code and to distribute modified code is granted, provided the - Copyright, this License, and the Availability note are retained, - and a notice that the code was modified is included. +The author of LDL, Timothy A. Davis., has executed a license with Google LLC +to permit distribution of this code and derivative works as part of Eigen under +the Mozilla Public License v. 2.0, as stated at the top of this file. 
*/ -#include "../Core/util/NonMPL2.h" - #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H From 0b25a5c431f2764cd46a04f07536d60256ecd256 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Feb 2019 21:39:36 +0100 Subject: [PATCH 255/295] fix alignment in ploadquad --- Eigen/src/Core/arch/AVX512/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index c111fd7f0..60b723b08 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -526,7 +526,7 @@ EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} template <> EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { - Packet16f tmp = _mm512_castps128_ps512(pload(from)); + Packet16f tmp = _mm512_castps128_ps512(ploadu(from)); const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0); return _mm512_permutexvar_ps(scatter_mask, tmp); } From 6560692c670bcf34fc922474bf37f3c18b8768af Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 22 Feb 2019 13:56:26 -0800 Subject: [PATCH 256/295] Improve EventCount used by the non-blocking threadpool. The current algorithm requires threads to commit/cancel waiting in order they called Prewait. Spinning caused by that serialization can consume lots of CPU time on some workloads. Restructure the algorithm to not require that serialization and remove spin waits from Commit/CancelWait. Note: this reduces max number of threads from 2^16 to 2^14 to leave more space for ABA counter (which is now 22 bits). Implementation details are explained in comments. 
--- .../Eigen/CXX11/src/ThreadPool/EventCount.h | 187 ++++++++++-------- .../src/ThreadPool/NonBlockingThreadPool.h | 6 +- unsupported/test/cxx11_eventcount.cpp | 10 +- 3 files changed, 110 insertions(+), 93 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 7a9ebe40a..8b3b210b1 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -20,7 +20,8 @@ namespace Eigen { // if (predicate) // return act(); // EventCount::Waiter& w = waiters[my_index]; -// ec.Prewait(&w); +// if (!ec.Prewait(&w)) +// return act(); // if (predicate) { // ec.CancelWait(&w); // return act(); @@ -50,78 +51,78 @@ class EventCount { public: class Waiter; - EventCount(MaxSizeVector& waiters) : waiters_(waiters) { + EventCount(MaxSizeVector& waiters) + : state_(kStackMask), waiters_(waiters) { eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1); - // Initialize epoch to something close to overflow to test overflow. - state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2); } ~EventCount() { // Ensure there are no waiters. - eigen_plain_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask); + eigen_plain_assert(state_.load() == kStackMask); } // Prewait prepares for waiting. - // After calling this function the thread must re-check the wait predicate - // and call either CancelWait or CommitWait passing the same Waiter object. - void Prewait(Waiter* w) { - w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed); - std::atomic_thread_fence(std::memory_order_seq_cst); + // If Prewait returns true, the thread must re-check the wait predicate + // and then call either CancelWait or CommitWait. + // Otherwise, the thread should assume the predicate may be true + // and don't call CancelWait/CommitWait (there was a concurrent Notify call). 
+ bool Prewait() { + uint64_t state = state_.load(std::memory_order_relaxed); + for (;;) { + CheckState(state); + uint64_t newstate = state + kWaiterInc; + if ((state & kSignalMask) != 0) { + // Consume the signal and cancel waiting. + newstate -= kSignalInc + kWaiterInc; + } + CheckState(newstate); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_seq_cst)) + return (state & kSignalMask) == 0; + } } - // CommitWait commits waiting. + // CommitWait commits waiting after Prewait. void CommitWait(Waiter* w) { + eigen_plain_assert((w->epoch & ~kEpochMask) == 0); w->state = Waiter::kNotSignaled; - // Modification epoch of this waiter. - uint64_t epoch = - (w->epoch & kEpochMask) + - (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + const uint64_t me = (w - &waiters_[0]) | w->epoch; uint64_t state = state_.load(std::memory_order_seq_cst); for (;;) { - if (int64_t((state & kEpochMask) - epoch) < 0) { - // The preceding waiter has not decided on its fate. Wait until it - // calls either CancelWait or CommitWait, or is notified. - EIGEN_THREAD_YIELD(); - state = state_.load(std::memory_order_seq_cst); - continue; + CheckState(state, true); + uint64_t newstate; + if ((state & kSignalMask) != 0) { + // Consume the signal and return immidiately. + newstate = state - kWaiterInc - kSignalInc; + } else { + // Remove this thread from pre-wait counter and add to the waiter stack. + newstate = ((state & kWaiterMask) - kWaiterInc) | me; + w->next.store(state & (kStackMask | kEpochMask), + std::memory_order_relaxed); } - // We've already been notified. - if (int64_t((state & kEpochMask) - epoch) > 0) return; - // Remove this thread from prewait counter and add it to the waiter list. 
- eigen_plain_assert((state & kWaiterMask) != 0); - uint64_t newstate = state - kWaiterInc + kEpochInc; - newstate = (newstate & ~kStackMask) | (w - &waiters_[0]); - if ((state & kStackMask) == kStackMask) - w->next.store(nullptr, std::memory_order_relaxed); - else - w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed); + CheckState(newstate); if (state_.compare_exchange_weak(state, newstate, - std::memory_order_release)) - break; + std::memory_order_acq_rel)) { + if ((state & kSignalMask) == 0) { + w->epoch += kEpochInc; + Park(w); + } + return; + } } - Park(w); } // CancelWait cancels effects of the previous Prewait call. - void CancelWait(Waiter* w) { - uint64_t epoch = - (w->epoch & kEpochMask) + - (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + void CancelWait() { uint64_t state = state_.load(std::memory_order_relaxed); for (;;) { - if (int64_t((state & kEpochMask) - epoch) < 0) { - // The preceding waiter has not decided on its fate. Wait until it - // calls either CancelWait or CommitWait, or is notified. - EIGEN_THREAD_YIELD(); - state = state_.load(std::memory_order_relaxed); - continue; - } - // We've already been notified. - if (int64_t((state & kEpochMask) - epoch) > 0) return; - // Remove this thread from prewait counter. - eigen_plain_assert((state & kWaiterMask) != 0); - if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc, - std::memory_order_relaxed)) + CheckState(state, true); + uint64_t newstate = state - kWaiterInc; + // Also take away a signal if any. 
+ if ((state & kSignalMask) != 0) newstate -= kSignalInc; + CheckState(newstate); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_acq_rel)) return; } } @@ -132,35 +133,33 @@ class EventCount { std::atomic_thread_fence(std::memory_order_seq_cst); uint64_t state = state_.load(std::memory_order_acquire); for (;;) { + CheckState(state); + const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + const uint64_t signals = (state & kSignalMask) >> kSignalShift; // Easy case: no waiters. - if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0) - return; - uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + if ((state & kStackMask) == kStackMask && waiters == signals) return; uint64_t newstate; if (notifyAll) { - // Reset prewait counter and empty wait list. - newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask; - } else if (waiters) { + // Empty wait stack and set signal to number of pre-wait threads. + newstate = + (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask; + } else if (signals < waiters) { // There is a thread in pre-wait state, unblock it. - newstate = state + kEpochInc - kWaiterInc; + newstate = state + kSignalInc; } else { // Pop a waiter from list and unpark it. Waiter* w = &waiters_[state & kStackMask]; - Waiter* wnext = w->next.load(std::memory_order_relaxed); - uint64_t next = kStackMask; - if (wnext != nullptr) next = wnext - &waiters_[0]; - // Note: we don't add kEpochInc here. ABA problem on the lock-free stack - // can't happen because a waiter is re-pushed onto the stack only after - // it was in the pre-wait state which inevitably leads to epoch - // increment. 
- newstate = (state & kEpochMask) + next; + uint64_t next = w->next.load(std::memory_order_relaxed); + newstate = (state & (kWaiterMask | kSignalMask)) | next; } + CheckState(newstate); if (state_.compare_exchange_weak(state, newstate, - std::memory_order_acquire)) { - if (!notifyAll && waiters) return; // unblocked pre-wait thread + std::memory_order_acq_rel)) { + if (!notifyAll && (signals < waiters)) + return; // unblocked pre-wait thread if ((state & kStackMask) == kStackMask) return; Waiter* w = &waiters_[state & kStackMask]; - if (!notifyAll) w->next.store(nullptr, std::memory_order_relaxed); + if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed); Unpark(w); return; } @@ -171,11 +170,11 @@ class EventCount { friend class EventCount; // Align to 128 byte boundary to prevent false sharing with other Waiter // objects in the same vector. - EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic next; + EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic next; std::mutex mu; std::condition_variable cv; - uint64_t epoch; - unsigned state; + uint64_t epoch = 0; + unsigned state = kNotSignaled; enum { kNotSignaled, kWaiting, @@ -185,23 +184,41 @@ class EventCount { private: // State_ layout: - // - low kStackBits is a stack of waiters committed wait. + // - low kWaiterBits is a stack of waiters committed wait + // (indexes in waiters_ array are used as stack elements, + // kStackMask means empty stack). // - next kWaiterBits is count of waiters in prewait state. - // - next kEpochBits is modification counter. - static const uint64_t kStackBits = 16; - static const uint64_t kStackMask = (1ull << kStackBits) - 1; - static const uint64_t kWaiterBits = 16; - static const uint64_t kWaiterShift = 16; + // - next kWaiterBits is count of pending signals. + // - remaining bits are ABA counter for the stack. + // (stored in Waiter node and incremented on push). 
+ static const uint64_t kWaiterBits = 14; + static const uint64_t kStackMask = (1ull << kWaiterBits) - 1; + static const uint64_t kWaiterShift = kWaiterBits; static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) << kWaiterShift; - static const uint64_t kWaiterInc = 1ull << kWaiterBits; - static const uint64_t kEpochBits = 32; - static const uint64_t kEpochShift = 32; + static const uint64_t kWaiterInc = 1ull << kWaiterShift; + static const uint64_t kSignalShift = 2 * kWaiterBits; + static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1) + << kSignalShift; + static const uint64_t kSignalInc = 1ull << kSignalShift; + static const uint64_t kEpochShift = 3 * kWaiterBits; + static const uint64_t kEpochBits = 64 - kEpochShift; static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; static const uint64_t kEpochInc = 1ull << kEpochShift; std::atomic state_; MaxSizeVector& waiters_; + static void CheckState(uint64_t state, bool waiter = false) { + static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem"); + const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + const uint64_t signals = (state & kSignalMask) >> kSignalShift; + eigen_plain_assert(waiters >= signals); + eigen_plain_assert(waiters < (1 << kWaiterBits) - 1); + eigen_plain_assert(!waiter || waiters > 0); + (void)waiters; + (void)signals; + } + void Park(Waiter* w) { std::unique_lock lock(w->mu); while (w->state != Waiter::kSignaled) { @@ -210,10 +227,10 @@ class EventCount { } } - void Unpark(Waiter* waiters) { - Waiter* next = nullptr; - for (Waiter* w = waiters; w; w = next) { - next = w->next.load(std::memory_order_relaxed); + void Unpark(Waiter* w) { + for (Waiter* next; w; w = next) { + uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask; + next = wnext == kStackMask ? 
nullptr : &waiters_[wnext]; unsigned state; { std::unique_lock lock(w->mu); diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 8fafcdab5..49603d6c1 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -374,11 +374,11 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { eigen_plain_assert(!t->f); // We already did best-effort emptiness check in Steal, so prepare for // blocking. - ec_.Prewait(waiter); + if (!ec_.Prewait()) return true; // Now do a reliable emptiness check. int victim = NonEmptyQueueIndex(); if (victim != -1) { - ec_.CancelWait(waiter); + ec_.CancelWait(); if (cancelled_) { return false; } else { @@ -392,7 +392,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { blocked_++; // TODO is blocked_ required to be unsigned? if (done_ && blocked_ == static_cast(num_threads_)) { - ec_.CancelWait(waiter); + ec_.CancelWait(); // Almost done, but need to re-check queues. // Consider that all queues are empty and all worker threads are preempted // right after incrementing blocked_ above. Now a free-standing thread diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 2f1418684..3ca8598c7 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -30,11 +30,11 @@ static void test_basic_eventcount() EventCount ec(waiters); EventCount::Waiter& w = waiters[0]; ec.Notify(false); - ec.Prewait(&w); + VERIFY(ec.Prewait()); ec.Notify(true); ec.CommitWait(&w); - ec.Prewait(&w); - ec.CancelWait(&w); + VERIFY(ec.Prewait()); + ec.CancelWait(); } // Fake bounded counter-based queue. 
@@ -112,7 +112,7 @@ static void test_stress_eventcount() unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Pop()) continue; j--; - ec.Prewait(&w); + if (!ec.Prewait()) continue; bool empty = true; for (int q = 0; q < kQueues; q++) { if (!queues[q].Empty()) { @@ -121,7 +121,7 @@ static void test_stress_eventcount() } } if (!empty) { - ec.CancelWait(&w); + ec.CancelWait(); continue; } ec.CommitWait(&w); From e409dbba141024e9c443969ee61dd7d1f17ee688 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 23 Feb 2019 10:45:40 +0100 Subject: [PATCH 257/295] Enable SSE vectorization of Quaternion and cross3() with AVX --- Eigen/Geometry | 5 ++-- Eigen/src/Geometry/arch/Geometry_SSE.h | 39 ++++++++++++++++---------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/Eigen/Geometry b/Eigen/Geometry index 04aa316cb..16b4bd6e1 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -49,9 +49,8 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX +// Use the SSE optimized version whenever possible. 
+#if defined EIGEN_VECTORIZE_SSE #include "src/Geometry/arch/Geometry_SSE.h" #endif diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h index d4346aa1c..108cc9f8e 100644 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -25,10 +25,12 @@ struct quat_product }; static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); Quaternion res; const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - Packet4f a = _a.coeffs().template packet(0); - Packet4f b = _b.coeffs().template packet(0); + Packet4f a = ae.template packet(0); + Packet4f b = be.template packet(0); Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); pstoret( @@ -50,9 +52,10 @@ struct quat_conj }; static inline Quaternion run(const QuaternionBase& q) { + evaluator qe(q.coeffs()); Quaternion res; - const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstoret(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet::Alignment>(0))); + const Packet4f mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); + pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); return res; } }; @@ -67,12 +70,14 @@ struct cross3_impl static inline typename plain_matrix_type::type run(const VectorLhs& lhs, const VectorRhs& rhs) { - __m128 a = lhs.template packet::Alignment>(0); - __m128 b = rhs.template packet::Alignment>(0); - __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); - __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); + evaluator lhs_eval(lhs); + evaluator rhs_eval(rhs); + Packet4f a = lhs_eval.template packet::Alignment,Packet4f>(0); + Packet4f b = rhs_eval.template packet::Alignment,Packet4f>(0); + Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); + Packet4f mul2 = 
pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); typename plain_matrix_type::type res; - pstoret(&res.x(),_mm_sub_ps(mul1,mul2)); + pstoret(&res.x(),psub(mul1,mul2)); return res; } }; @@ -94,9 +99,12 @@ struct quat_product Quaternion res; + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); + const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); + Packet2d b_xy = be.template packet(0); + Packet2d b_zw = be.template packet(2); Packet2d a_xx = pset1(a[0]); Packet2d a_yy = pset1(a[1]); Packet2d a_zz = pset1(a[2]); @@ -145,11 +153,12 @@ struct quat_conj }; static inline Quaternion run(const QuaternionBase& q) { + evaluator qe(q.coeffs()); Quaternion res; - const __m128d mask0 = _mm_setr_pd(-0.,-0.); - const __m128d mask2 = _mm_setr_pd(-0.,0.); - pstoret(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet::Alignment>(0))); - pstoret(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet::Alignment>(2))); + const Packet2d mask0 = _mm_setr_pd(-0.,-0.); + const Packet2d mask2 = _mm_setr_pd(-0.,0.); + pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); + pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); return res; } }; From c181dfb8ab0a009795858670b4236d7e64a0c79e Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Wed, 27 Feb 2019 11:30:58 -0800 Subject: [PATCH 258/295] Consistently use EIGEN_BLAS_FUNC in BLAS. Previously, for a few functions, either BLASFUNC or EIGEN_CAT was being used. This change uses EIGEN_BLAS_FUNC consistently everywhere. Also introduce EIGEN_BLAS_FUNC_SUFFIX, which by default is equal to "_", this allows the user to inject a new suffix as needed.
--- blas/common.h | 6 +++++- blas/double.cpp | 2 +- blas/level1_cplx_impl.h | 8 ++++---- blas/level1_impl.h | 4 ++-- blas/single.cpp | 2 +- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/blas/common.h b/blas/common.h index 960c09cc6..a9b697842 100644 --- a/blas/common.h +++ b/blas/common.h @@ -166,6 +166,10 @@ T* copy_back(T* x_cpy, T* x, int n, int incx) return x_cpy; } -#define EIGEN_BLAS_FUNC(X) EIGEN_CAT(SCALAR_SUFFIX,X##_) +#ifndef EIGEN_BLAS_FUNC_SUFFIX +#define EIGEN_BLAS_FUNC_SUFFIX _ +#endif + +#define EIGEN_BLAS_FUNC(X) EIGEN_CAT(SCALAR_SUFFIX, EIGEN_CAT(X, EIGEN_BLAS_FUNC_SUFFIX)) #endif // EIGEN_BLAS_COMMON_H diff --git a/blas/double.cpp b/blas/double.cpp index 295b1d1f2..eb2e57307 100644 --- a/blas/double.cpp +++ b/blas/double.cpp @@ -19,7 +19,7 @@ #include "level2_real_impl.h" #include "level3_impl.h" -double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy) +double EIGEN_BLAS_FUNC(sdot)(int* n, float* x, int* incx, float* y, int* incy) { if(*n<=0) return 0; diff --git a/blas/level1_cplx_impl.h b/blas/level1_cplx_impl.h index 719f5bac9..4ac457175 100644 --- a/blas/level1_cplx_impl.h +++ b/blas/level1_cplx_impl.h @@ -25,7 +25,7 @@ namespace Eigen { // computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum // res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... 
+ |Rexn| + |Imxn|, where x is a vector of order n -RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n, RealScalar *px, int *incx) +RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC(asum))(int *n, RealScalar *px, int *incx) { // std::cerr << "__asum " << *n << " " << *incx << "\n"; Complex* x = reinterpret_cast(px); @@ -81,7 +81,7 @@ int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, in return 0; } -RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n, RealScalar *px, int *incx) +RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC(nrm2))(int *n, RealScalar *px, int *incx) { // std::cerr << "__nrm2 " << *n << " " << *incx << "\n"; if(*n<=0) return 0; @@ -94,7 +94,7 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n, return make_vector(x,*n,*incx).stableNorm(); } -int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps) +int EIGEN_BLAS_FUNC(EIGEN_CAT(REAL_SCALAR_SUFFIX, rot))(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps) { if(*n<=0) return 0; @@ -117,7 +117,7 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal return 0; } -int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealScalar *palpha, RealScalar *px, int *incx) +int EIGEN_BLAS_FUNC(EIGEN_CAT(REAL_SCALAR_SUFFIX, scal))(int *n, RealScalar *palpha, RealScalar *px, int *incx) { if(*n<=0) return 0; diff --git a/blas/level1_impl.h b/blas/level1_impl.h index 6e7f8c976..d3ee03477 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -51,7 +51,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int return 0; } -int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *incx) +int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amax))(int *n, RealScalar *px, int 
*incx) { if(*n<=0) return 0; Scalar* x = reinterpret_cast(px); @@ -62,7 +62,7 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *inc return int(ret)+1; } -int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *incx) +int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amin))(int *n, RealScalar *px, int *incx) { if(*n<=0) return 0; Scalar* x = reinterpret_cast(px); diff --git a/blas/single.cpp b/blas/single.cpp index 20ea57d5c..e66879aea 100644 --- a/blas/single.cpp +++ b/blas/single.cpp @@ -18,5 +18,5 @@ #include "level2_real_impl.h" #include "level3_impl.h" -float BLASFUNC(sdsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy) +float EIGEN_BLAS_FUNC(dsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy) { return double(*alpha) + BLASFUNC(dsdot)(n, x, incx, y, incy); } From cf4a1c81fa27ae94634ba7e86da06d6849ebc474 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 1 Mar 2019 14:21:09 -0800 Subject: [PATCH 259/295] Fix specialization for conjugate on non-complex types in TensorBase.h. 
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index dd008fe05..dbacf494e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -244,11 +244,11 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const typename internal::conditional::IsComplex, + EIGEN_STRONG_INLINE const typename internal::conditional::IsComplex, TensorCwiseUnaryOp, const Derived>, Derived>::type conjugate() const { - return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); } EIGEN_DEVICE_FUNC From b071672e78e6b7170e10bd658696505ae1215568 Mon Sep 17 00:00:00 2001 From: "Bernhard M. Wiedemann" Date: Wed, 27 Feb 2019 11:09:00 +0100 Subject: [PATCH 260/295] Do not keep latex logs to make package builds more reproducible. See https://reproducible-builds.org/ for why this is good. 
--- doc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index aa36d7891..f344ae572 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -106,7 +106,7 @@ add_custom_target(doc ALL COMMAND doxygen Doxyfile-unsupported COMMAND ${CMAKE_COMMAND} -E copy ${Eigen_BINARY_DIR}/doc/html/group__TopicUnalignedArrayAssert.html ${Eigen_BINARY_DIR}/doc/html/TopicUnalignedArrayAssert.html COMMAND ${CMAKE_COMMAND} -E rename html eigen-doc - COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz + COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz eigen-doc/unsupported/_formulas.log eigen-doc/_formulas.log COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc.tgz eigen-doc COMMAND ${CMAKE_COMMAND} -E rename eigen-doc.tgz eigen-doc/eigen-doc.tgz COMMAND ${CMAKE_COMMAND} -E rename eigen-doc html From 22144e949dbc218ddf499c72da5135f487aecd11 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 2 Mar 2019 22:44:47 +0100 Subject: [PATCH 261/295] bug #1629: fix compilation of PardisoSupport (regression introduced in changeset a7842daef2c82a9be200dff54d455f6d4a0b199c ) --- Eigen/src/PardisoSupport/PardisoSupport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 70afcb3fe..07006b5c4 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -123,7 +123,7 @@ class PardisoImpl : public SparseSolverBase }; PardisoImpl() - : m_analysisIsOk(false), m_factorizationIsOk(false), m_pt(0) + : m_analysisIsOk(false), m_factorizationIsOk(false) { eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type"); m_iparm.setZero(); From 9ba81cf0ff5461bb7876787ac9f80aadc2a4844d Mon Sep 17 00:00:00 2001 From: Sam Hasinoff Date: Sat, 2 Mar 2019 17:42:16 +0000 Subject: [PATCH 262/295] Fully qualify 
Eigen::internal::aligned_free This helps avoid a conflict on certain Windows toolchains (potentially due to some ADL name resolution bug) in the case where aligned_free is defined in the global namespace. In any case, tightening this up is harmless. --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index efd747217..d25f65e65 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -378,7 +378,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size) { destruct_elements_of_array(ptr, size); - aligned_free(ptr); + Eigen::internal::aligned_free(ptr); } /** \internal Deletes objects constructed with conditional_aligned_new From b0d406d91c62ff32153df43d5f698ceb02341ac7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 3 Mar 2019 15:25:25 +0100 Subject: [PATCH 263/295] Enable construction of Ref from a runtime vector.
--- Eigen/src/Core/Ref.h | 5 +++-- test/ref.cpp | 12 ++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index ac9502bc4..172c8ffb6 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -28,12 +28,13 @@ struct traits > template struct match { enum { + IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime, HasDirectAccess = internal::has_direct_access::ret, - StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), - OuterStrideMatch = Derived::IsVectorAtCompileTime + OuterStrideMatch = IsVectorAtCompileTime || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), // NOTE, this indirection of evaluator::Alignment is needed // to workaround a very strange bug in MSVC related to the instantiation diff --git a/test/ref.cpp b/test/ref.cpp index 250135bdb..c0b6ffdcf 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -102,10 +102,14 @@ template void ref_vector(const VectorType& m) Index i = internal::random(0,size-1); Index bsize = internal::random(1,size-i); - RefMat rm0 = v1; - VERIFY_IS_EQUAL(rm0, v1); - RefDynMat rv1 = v1; - VERIFY_IS_EQUAL(rv1, v1); + { RefMat rm0 = v1; VERIFY_IS_EQUAL(rm0, v1); } + { RefMat rm0 = v1.block(0,0,size,1); VERIFY_IS_EQUAL(rm0, v1); } + { RefDynMat rv1 = v1; VERIFY_IS_EQUAL(rv1, v1); } + { RefDynMat rv1 = v1.block(0,0,size,1); VERIFY_IS_EQUAL(rv1, v1); } + { 
VERIFY_RAISES_ASSERT( RefMat rm0 = v1.block(0, 0, size, 0); EIGEN_UNUSED_VARIABLE(rm0); ); } + if(VectorType::SizeAtCompileTime!=1) + { VERIFY_RAISES_ASSERT( RefDynMat rv1 = v1.block(0, 0, size, 0); EIGEN_UNUSED_VARIABLE(rv1); ); } + RefDynMat rv2 = v1.segment(i,bsize); VERIFY_IS_EQUAL(rv2, v1.segment(i,bsize)); rv2.setOnes(); From 694084ecbd12c5183a8ff0604d04971d043abfff Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Mar 2019 11:10:21 -0800 Subject: [PATCH 264/295] Use fast divisors in TensorGeneratorOp --- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index ac66f9cf1..0fee18fb6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -98,6 +98,8 @@ struct TensorEvaluator, Device> RawAccess = false }; + typedef internal::TensorIntDivisor IndexDivisor; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_generator(op.generator()) #ifdef EIGEN_USE_SYCL @@ -118,6 +120,9 @@ struct TensorEvaluator, Device> m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; } } + for (int i = 0; i < NumDims; ++i) { + m_fast_strides[i] = IndexDivisor(m_strides[i]); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -150,6 +155,8 @@ struct TensorEvaluator, Device> return rslt; } + // TODO(ezhulenev): Add tiled evaluation support. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. 
Define interface to make @@ -170,14 +177,14 @@ struct TensorEvaluator, Device> void extract_coordinates(Index index, array& coords) const { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } coords[0] = index; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_strides[i]; + const Index idx = index / m_fast_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } @@ -187,6 +194,7 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; array m_strides; + array m_fast_strides; Generator m_generator; #ifdef EIGEN_USE_SYCL TensorEvaluator m_argImpl; From b95941e5c2cf8886a54e510be662cf4ecadc4f6f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Mar 2019 16:02:22 -0800 Subject: [PATCH 265/295] Add tiled evaluation for TensorForcedEvalOp --- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 25 +++++++++--- unsupported/test/cxx11_tensor_executor.cpp | 39 ++++++++++++++++++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 78068be35..74b905329 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -90,14 +90,21 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = true, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = true + Layout = TensorEvaluator::Layout, + RawAccess = true }; + typedef typename internal::TensorBlock< + CoeffReturnType, Index, internal::traits::NumDimensions, Layout> + TensorBlock; + typedef 
typename internal::TensorBlockReader< + CoeffReturnType, Index, internal::traits::NumDimensions, Layout> + TensorBlockReader; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) /// op_ is used for sycl : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) @@ -139,6 +146,14 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector*) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_buffer != NULL); + TensorBlockReader::Run(block, m_buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 18c87b35e..608306613 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -452,6 +452,38 @@ static void test_execute_slice_lvalue(Device d) } } +template +static void test_execute_broadcasting_of_forced_eval(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(1, 10); + Tensor src(dims); + src.setRandom(); + + const auto broadcasts = RandomDims(1, 7); + const auto expr = src.square().eval().broadcast(broadcasts); + + // We assume that broadcasting on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. 
+ Tensor dst(golden.dimensions()); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + #define CALL_SUBTEST_PART(PART) \ CALL_SUBTEST_##PART @@ -528,8 +560,13 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4); CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); + // Force CMake to split this test. - // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11 + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12 } #undef CALL_SUBTEST_COMBINATIONS From efb5080d313a38cd204942237689de0bda412732 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Mar 2019 16:58:49 -0800 Subject: [PATCH 266/295] Do not initialize invalid fast_strides in TensorGeneratorOp --- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 0fee18fb6..cf1e821a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -113,16 +113,15 @@ struct TensorEvaluator, Device> m_strides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } else { m_strides[NumDims - 1] = 1; for (int i = NumDims - 2; i >= 0; --i) { m_strides[i] = m_strides[i + 1] * 
m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } - for (int i = 0; i < NumDims; ++i) { - m_fast_strides[i] = IndexDivisor(m_strides[i]); - } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } From 0318fc7f44a3440995494c835b859a1dabe0e4c8 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 5 Mar 2019 10:24:54 -0800 Subject: [PATCH 267/295] Remove EIGEN_MPL2_ONLY guards around code re-licensed from LGPL to MPL2 in https://bitbucket.org/eigen/eigen/commits/2ca1e732398ea2c506427e9031212d63e9253b96 --- Eigen/OrderingMethods | 3 --- Eigen/Sparse | 2 -- Eigen/SparseCholesky | 8 -------- Eigen/src/IterativeLinearSolvers/IncompleteLUT.h | 9 --------- Eigen/src/OrderingMethods/Ordering.h | 4 ---- 5 files changed, 26 deletions(-) diff --git a/Eigen/OrderingMethods b/Eigen/OrderingMethods index d8ea36193..29691a62b 100644 --- a/Eigen/OrderingMethods +++ b/Eigen/OrderingMethods @@ -63,10 +63,7 @@ * \endcode */ -#ifndef EIGEN_MPL2_ONLY #include "src/OrderingMethods/Amd.h" -#endif - #include "src/OrderingMethods/Ordering.h" #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/Eigen/Sparse b/Eigen/Sparse index 136e681a1..a2ef7a665 100644 --- a/Eigen/Sparse +++ b/Eigen/Sparse @@ -25,9 +25,7 @@ #include "SparseCore" #include "OrderingMethods" -#ifndef EIGEN_MPL2_ONLY #include "SparseCholesky" -#endif #include "SparseLU" #include "SparseQR" #include "IterativeLinearSolvers" diff --git a/Eigen/SparseCholesky b/Eigen/SparseCholesky index b6a320c40..d2b1f1276 100644 --- a/Eigen/SparseCholesky +++ b/Eigen/SparseCholesky @@ -30,16 +30,8 @@ * \endcode */ -#ifdef EIGEN_MPL2_ONLY -#error The SparseCholesky module has nothing to offer in MPL2 only mode -#endif - #include "src/SparseCholesky/SimplicialCholesky.h" - -#ifndef EIGEN_MPL2_ONLY #include "src/SparseCholesky/SimplicialCholesky_impl.h" -#endif - #include "src/Core/util/ReenableStupidWarnings.h" #endif // 
EIGEN_SPARSECHOLESKY_MODULE_H diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 43bd8e8f6..09436cb67 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -225,7 +225,6 @@ void IncompleteLUT::analyzePattern(const _MatrixType& amat) // Compute the Fill-reducing permutation // Since ILUT does not perform any numerical pivoting, // it is highly preferable to keep the diagonal through symmetric permutations. -#ifndef EIGEN_MPL2_ONLY // To this end, let's symmetrize the pattern and perform AMD on it. SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); @@ -235,14 +234,6 @@ void IncompleteLUT::analyzePattern(const _MatrixType& amat) AMDOrdering ordering; ordering(AtA,m_P); m_Pinv = m_P.inverse(); // cache the inverse permutation -#else - // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine. - SparseMatrix mat1 = amat; - COLAMDOrdering ordering; - ordering(mat1,m_Pinv); - m_P = m_Pinv.inverse(); -#endif - m_analysisIsOk = true; m_factorizationIsOk = false; m_isInitialized = true; diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 34dbef487..8791158be 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -38,8 +38,6 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) } -#ifndef EIGEN_MPL2_ONLY - /** \ingroup OrderingMethods_Module * \class AMDOrdering * @@ -81,8 +79,6 @@ class AMDOrdering } }; -#endif // EIGEN_MPL2_ONLY - /** \ingroup OrderingMethods_Module * \class NaturalOrdering * From b1a862749362572a0c1075e2381e6dd58a71c3fb Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 5 Mar 2019 11:19:25 -0800 Subject: [PATCH 268/295] Do not create Tensor in cxx11_tensor_forced_eval test --- unsupported/test/cxx11_tensor_forced_eval.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp index f76e2ea97..a21a02bec 100644 --- a/unsupported/test/cxx11_tensor_forced_eval.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -61,7 +61,7 @@ static void test_const() Eigen::array bcast; bcast[0] = 3; bcast[1] = 1; - const TensorMap > input_tensor(input.data(), 3, 3); + const TensorMap > input_tensor(input.data(), 3, 3); Tensor output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); for (int i = 0; i < 3; ++i) { From 56c6373f82d4cf42a489951fb08566d3e5b612ef Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 5 Mar 2019 11:51:26 -0800 Subject: [PATCH 269/295] Add an extra check for the RunQueue size estimate --- unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index ecdc35f81..a9ae05fc6 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -209,7 +209,9 @@ class RunQueue { } else { // This value will be 0 if the queue is empty, and undefined otherwise. unsigned maybe_zero = ((front ^ back) & kMask2); - eigen_assert(maybe_zero == 0 ? CalculateSize(front, back) == 0 : true); + // Queue size estimate must agree with maybe zero check on the queue + // empty/non-empty state. 
+ eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0)); return maybe_zero; } } From a407e022e6046917b1ebeacd54b03fcb079a9706 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 5 Mar 2019 14:19:59 -0800 Subject: [PATCH 270/295] Tune tensor contraction threadpool heuristics --- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index d7cd995fb..adf57c892 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -216,11 +216,14 @@ struct TensorEvaluatorm_device.numThreadsInPool(); // With small number of threads we want to make sure that we do not reduce - // parallelism too much. - const int oversharding_factor = - num_worker_threads <= 4 ? 8 : - num_worker_threads <= 8 ? 4 : - num_worker_threads <= 16 ? 2 : 1; + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = + num_worker_threads <= 4 ? 8.0 : + num_worker_threads <= 8 ? 4.0 : + num_worker_threads <= 16 ? 2.0 : + num_worker_threads <= 32 ? 1.0 : + num_worker_threads <= 64 ? 
0.8 : /* num_worker_threads > 64 */ 0.6; const bool parallelize_by_sharding_dim_only = sharding_dim_tasks >= oversharding_factor * num_worker_threads; From bfbf7da0478afe75e19a953f0925bbd492bcd427 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 5 Mar 2019 23:46:24 +0100 Subject: [PATCH 271/295] bug #1689 fix used-but-marked-unused warning --- Eigen/src/Core/products/Parallelizer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 92e9b0d9f..e01e798f1 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -21,7 +21,8 @@ namespace internal { /** \internal */ inline void manage_multi_threading(Action action, int* v) { - static EIGEN_UNUSED int m_maxThreads = -1; + static int m_maxThreads = -1; + EIGEN_UNUSED_VARIABLE(m_maxThreads); if(action==SetAction) { From 5d9a6686ed0e526092973ce2823c15825ffd39b6 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 5 Mar 2019 16:35:21 -0800 Subject: [PATCH 272/295] Block evaluation for TensorGeneratorOp --- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 85 +++++++++++++++++-- unsupported/test/cxx11_tensor_executor.cpp | 50 ++++++++++- unsupported/test/cxx11_tensor_generator.cpp | 6 +- 3 files changed, 128 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index cf1e821a9..4662d5aea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -89,19 +89,22 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - 
RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef internal::TensorIntDivisor IndexDivisor; + typedef internal::TensorBlock + TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) + : m_device(device), m_generator(op.generator()) #ifdef EIGEN_USE_SYCL , m_argImpl(op.expression(), device) #endif @@ -154,7 +157,70 @@ struct TensorEvaluator, Device> return rslt; } - // TODO(ezhulenev): Add tiled evaluation support. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + Eigen::Index block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, block_total_size_max)); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) return; + + static const bool is_col_major = + static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(output_block->first_coeff_index(), coords); + array initial_coords = coords; + + CoeffReturnType* data = output_block->data(); + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (Index i = 0; i < NumDims; ++i) { + const Index dim = is_col_major ? 
i : NumDims - 1 - i; + it[i].size = output_block->block_sizes()[dim]; + it[i].stride = output_block->block_strides()[dim]; + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Generate data for the inner-most dimension. + for (Index i = 0; i < it[0].size; ++i) { + *(data + offset + i) = m_generator(coords); + coords[is_col_major ? 0 : NumDims - 1]++; + } + coords[is_col_major ? 0 : NumDims - 1] = + initial_coords[is_col_major ? 0 : NumDims - 1]; + + // Update offset. + for (Index i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = + initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { @@ -191,6 +257,7 @@ struct TensorEvaluator, Device> } } + const Device& m_device; Dimensions m_dimensions; array m_strides; array m_fast_strides; diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 608306613..162dab7b8 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -484,6 +484,49 @@ static void test_execute_broadcasting_of_forced_eval(Device d) } } +template +struct DummyGenerator { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + T operator()(const array & dims) const { + T result = static_cast(0); + for (int i = 0; i < NumDims; ++i) { + result += static_cast((i + 1) * dims[i]); + } + return result; + } +}; + +template +static void test_execute_generator_op(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(20, 30); + Tensor src(dims); + src.setRandom(); + + const auto expr = src.generate(DummyGenerator()); + + // We assume that generator on a default device is tested 
and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden; + golden = expr; + + // Now evaluate the generator expression using configured tensor executor. + Tensor dst(golden.dimensions()); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + #define CALL_SUBTEST_PART(PART) \ CALL_SUBTEST_##PART @@ -565,8 +608,13 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5); + // Force CMake to split this test.
- // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12 + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13 } #undef CALL_SUBTEST_COMBINATIONS diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp index ee5e29b77..6dcf676bb 100644 --- a/unsupported/test/cxx11_tensor_generator.cpp +++ b/unsupported/test/cxx11_tensor_generator.cpp @@ -42,11 +42,11 @@ struct Generator2D { template static void test_2D() { - Tensor matrix(5, 7); + Tensor matrix(512, 512); Tensor result = matrix.generate(Generator2D()); - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 512; ++i) { + for (int j = 0; j < 512; ++j) { VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j); } } From 25abaa2e4182b916a688794d312f56a8bd750d91 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 5 Mar 2019 17:34:35 -0800 Subject: [PATCH 273/295] Check that inner block dimension is continuous --- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 4662d5aea..24a7dd0ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -197,6 +197,7 @@ struct TensorEvaluator, Device> it[i].span = it[i].stride * (it[i].size - 1); it[i].count = 0; } + eigen_assert(it[0].stride == 1); while (it[NumDims - 1].count < it[NumDims - 1].size) { // Generate data for the inner-most dimension. 
From 4e4dcd9026ed36c074170c13d4092eddaec5b285 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 6 Mar 2019 10:39:07 -0800 Subject: [PATCH 274/295] Remove redundant steal loop --- .../Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 49603d6c1..115e39d07 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -56,6 +56,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { thread_data_[i].thread.reset( env_.CreateThread([this, i]() { WorkerLoop(i); })); } + global_steal_partition_ = EncodePartition(0, num_threads_); #ifndef EIGEN_THREAD_LOCAL // Wait for workers to initialize per_thread_map_. Otherwise we might race // with them in Schedule or CurrentThreadId. @@ -237,6 +238,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { MaxSizeVector thread_data_; MaxSizeVector> all_coprimes_; MaxSizeVector waiters_; + unsigned global_steal_partition_; std::atomic blocked_; std::atomic spinning_; std::atomic done_; @@ -354,6 +356,9 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { Task LocalSteal() { PerThread* pt = GetPerThread(); unsigned partition = GetStealPartition(pt->thread_id); + // If thread steal partition is the same as global partition, there is no + // need to go through the steal loop twice. 
+ if (global_steal_partition_ == partition) Task(); unsigned start, limit; DecodePartition(partition, &start, &limit); AssertBounds(start, limit); From 1bc2a0a57c5054754749dcb3235597098a208eaf Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 6 Mar 2019 10:49:49 -0800 Subject: [PATCH 275/295] Add missing return to NonBlockingThreadPool::LocalSteal --- unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 115e39d07..bd1910dcc 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -358,7 +358,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { unsigned partition = GetStealPartition(pt->thread_id); // If thread steal partition is the same as global partition, there is no // need to go through the steal loop twice. - if (global_steal_partition_ == partition) Task(); + if (global_steal_partition_ == partition) return Task(); unsigned start, limit; DecodePartition(partition, &start, &limit); AssertBounds(start, limit); From cc407c9d4d0fdc60348642b74c89f08a041cd2a2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 6 Mar 2019 11:40:06 -0800 Subject: [PATCH 276/295] Fix placement of "#if defined(EIGEN_GPUCC)" guard region. Found with -Wundefined-func-template. 
Author: tkoeppe@google.com --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 1c44541bd..057e90e50 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -317,6 +317,7 @@ class TensorExecutor class TensorExecutor { @@ -326,7 +327,6 @@ class TensorExecutor { }; -#if defined(EIGEN_GPUCC) template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE From f4ec8edea8a8396e1b744db9ea61de2c451bd15d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 6 Mar 2019 11:52:04 -0800 Subject: [PATCH 277/295] Add macro EIGEN_AVOID_THREAD_LOCAL to make it possible to manually disable the use of thread_local. --- unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h index 7229839ac..696c2d03b 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h @@ -10,6 +10,14 @@ #ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H #define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H +#ifdef EIGEN_AVOID_THREAD_LOCAL + +#ifdef EIGEN_THREAD_LOCAL +#undef EIGEN_THREAD_LOCAL +#endif + +#else + #if EIGEN_MAX_CPP_VER >= 11 && \ ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \ __has_feature(cxx_thread_local) || \ @@ -52,4 +60,6 @@ #endif #endif // defined(__ANDROID__) && defined(__clang__) +#endif // EIGEN_AVOID_THREAD_LOCAL + #endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H From 2df4f0024666a9085fe47f14e2290bd61676dbbd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Mar 2019 18:17:10 +0100 Subject: [PATCH 278/295] Change license from LGPL to MPL2 with agreement from David Harmon. 
--- .../ArpackSelfAdjointEigenSolver.h | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h index 3c6cfb1e3..0fbd84772 100644 --- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h @@ -3,24 +3,9 @@ // // Copyright (C) 2012 David Harmon // -// Eigen is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 3 of the License, or (at your option) any later version. -// -// Alternatively, you can redistribute it and/or -// modify it under the terms of the GNU General Public License as -// published by the Free Software Foundation; either version 2 of -// the License, or (at your option) any later version. -// -// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License and a copy of the GNU General Public License along with -// Eigen. If not, see . +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H #define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H From 656d9bc66baf2accf22256df1d2c0c7c89676040 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 10 Mar 2019 21:19:18 +0100 Subject: [PATCH 279/295] Apply SSE's pmin/pmax fix for GCC <= 5 to AVX's pmin/pmax --- Eigen/src/Core/arch/AVX/PacketMath.h | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index ee00f1f7d..f88e36024 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -209,20 +209,51 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& #endif template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may flip + // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet8f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else // Arguments are swapped to match NaN propagation behavior of std::min. return _mm256_min_ps(b,a); +#endif } template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else // Arguments are swapped to match NaN propagation behavior of std::min. 
return _mm256_min_pd(b,a); +#endif } template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet8f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else // Arguments are swapped to match NaN propagation behavior of std::max. return _mm256_max_ps(b,a); +#endif } template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else // Arguments are swapped to match NaN propagation behavior of std::max. return _mm256_max_pd(b,a); +#endif } template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } From 0f8bfff23df9375080fb08de947c623f652fe27b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 11 Mar 2019 09:38:44 -0700 Subject: [PATCH 280/295] Fix a data race in NonBlockingThreadPool --- unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index bd1910dcc..9e54254c1 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -29,6 +29,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { thread_data_(num_threads), all_coprimes_(num_threads), waiters_(num_threads), + global_steal_partition_(EncodePartition(0, num_threads_)), blocked_(0), spinning_(0), done_(false), @@ -56,7 +57,6 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { thread_data_[i].thread.reset( env_.CreateThread([this, i]() { WorkerLoop(i); })); } - global_steal_partition_ = 
EncodePartition(0, num_threads_); #ifndef EIGEN_THREAD_LOCAL // Wait for workers to initialize per_thread_map_. Otherwise we might race // with them in Schedule or CurrentThreadId. From 899c16fa2cec396a7d75dff020f6f755834f5961 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 11 Mar 2019 09:42:01 -0700 Subject: [PATCH 281/295] Fix a bug in TensorGenerator for 1d tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 24a7dd0ca..204a6fd33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -208,6 +208,9 @@ struct TensorEvaluator, Device> coords[is_col_major ? 0 : NumDims - 1] = initial_coords[is_col_major ? 0 : NumDims - 1]; + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + // Update offset. 
for (Index i = 1; i < NumDims; ++i) { if (++it[i].count < it[i].size) { From 001f10e3c94608d74898e8911a8a42d3f0640a1a Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 11 Mar 2019 09:43:33 -0700 Subject: [PATCH 282/295] Fix segfaults with cuda compilation --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 057e90e50..e2ff11129 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -317,7 +317,6 @@ class TensorExecutor class TensorExecutor { @@ -326,7 +325,7 @@ class TensorExecutor { static void run(const Expression& expr, const GpuDevice& device); }; - +#if defined(EIGEN_GPUCC) template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE From 77f7d4a894a633c2b8b72221a7b5f81e8d140182 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 11 Mar 2019 17:51:16 -0700 Subject: [PATCH 283/295] Clean up PacketMathHalf.h and add a few missing logical packet ops. 
--- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 80 ++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 316ac0283..5917ec1b8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -137,15 +137,21 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half2 result; - unsigned temp = *(reinterpret_cast(&(a))); - *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; - return result; + half a1 = __low2half(a); + half a2 = __high2half(a); + half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); + half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); + return __halves2half2(result1, result2); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { - half2 result; - *(reinterpret_cast(&(result))) = 0xffffffffu; + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -175,6 +181,68 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen: #endif } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) == __half2float(b2) ? 
true_half : false_half; + return __halves2half2(eq1, eq2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); + return __halves2half2(result1, result2); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) From 90302aa8c9510085ff443fff67812c03fd091c4e Mon Sep 17 00:00:00 2001 From: Thomas Capricelli Date: Tue, 12 Mar 2019 13:47:01 +0100 Subject: [PATCH 284/295] update tracking code --- doc/eigendoxy_footer.html.in | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 
deletions(-) diff --git a/doc/eigendoxy_footer.html.in b/doc/eigendoxy_footer.html.in index 9ac0596cb..76e215267 100644 --- a/doc/eigendoxy_footer.html.in +++ b/doc/eigendoxy_footer.html.in @@ -17,18 +17,22 @@ $generatedby   - + - + var _paq = _paq || []; + /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ + _paq.push(['trackPageView']); + _paq.push(['enableLinkTracking']); + (function() { + var u="//stats.sylphide-consulting.com/matomo/"; + _paq.push(['setTrackerUrl', u+'piwik.php']); + _paq.push(['setSiteId', '1']); + var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; + g.type='text/javascript'; g.async=true; g.defer=true; g.src=u+'piwik.js'; s.parentNode.insertBefore(g,s); + })(); + + + From 190143367472dc6209329a38af6143d271878488 Mon Sep 17 00:00:00 2001 From: Thomas Capricelli Date: Tue, 12 Mar 2019 13:53:38 +0100 Subject: [PATCH 285/295] erm.. use proper id --- doc/eigendoxy_footer.html.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/eigendoxy_footer.html.in b/doc/eigendoxy_footer.html.in index 76e215267..94f2bab71 100644 --- a/doc/eigendoxy_footer.html.in +++ b/doc/eigendoxy_footer.html.in @@ -26,12 +26,12 @@ $generatedby   (function() { var u="//stats.sylphide-consulting.com/matomo/"; _paq.push(['setTrackerUrl', u+'piwik.php']); - _paq.push(['setSiteId', '1']); + _paq.push(['setSiteId', '20']); var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; g.type='text/javascript'; g.async=true; g.defer=true; g.src=u+'piwik.js'; s.parentNode.insertBefore(g,s); })(); - + From d7d2f0680e97e0279c04d736df2c63d064b2b2c0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Mar 2019 10:40:01 +0100 Subject: [PATCH 286/295] bug #1684: partially workaround clang's 6/7 bug #40815 --- Eigen/src/Core/functors/AssignmentFunctors.h | 11 +++++- test/array_reverse.cpp | 40 ++++++++++++++++++-- 2 files changed, 46 insertions(+), 5 deletions(-) 
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 9765cc763..bf64ef4ed 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -157,7 +157,16 @@ template struct functor_traits > { enum { Cost = 3 * NumTraits::ReadCost, - PacketAccess = packet_traits::Vectorizable + PacketAccess = + #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__)) + // This is a partial workaround for a bug in clang generating bad code + // when mixing 256/512 bits loads and 128 bits moves. + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684 + // https://bugs.llvm.org/show_bug.cgi?id=40815 + 0 + #else + packet_traits::Vectorizable + #endif }; }; diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp index b19a6b356..adc9bdbfb 100644 --- a/test/array_reverse.cpp +++ b/test/array_reverse.cpp @@ -134,24 +134,56 @@ void array_reverse_extra() // Simpler version of reverseInPlace leveraging a bug // in clang 6/7 with -O2 and AVX or AVX512 enabled. -// This simpler version ensure that the clang bug is not hidden +// This simpler version ensures that the clang bug is not simply hidden // through mis-inlining of reverseInPlace or other minor changes.
template EIGEN_DONT_INLINE -void bug1684_work(MatrixType& m1, MatrixType& m2) +void bug1684_job1(MatrixType& m1, MatrixType& m2) { m2 = m1; m2.col(0).swap(m2.col(3)); m2.col(1).swap(m2.col(2)); } +template +EIGEN_DONT_INLINE +void bug1684_job2(MatrixType& m1, MatrixType& m2) +{ + m2 = m1; // load m1/m2 in AVX registers + m1.col(0) = m2.col(3); // perform 128 bits moves + m1.col(1) = m2.col(2); + m1.col(2) = m2.col(1); + m1.col(3) = m2.col(0); +} + +template +EIGEN_DONT_INLINE +void bug1684_job3(MatrixType& m1, MatrixType& m2) +{ + m2 = m1; + Vector4f tmp; + tmp = m2.col(0); + m2.col(0) = m2.col(3); + m2.col(3) = tmp; + tmp = m2.col(1); + m2.col(1) = m2.col(2); + m2.col(2) = tmp; + +} + template void bug1684() { Matrix4f m1 = Matrix4f::Random(); Matrix4f m2 = Matrix4f::Random(); - bug1684_work(m1,m2); + bug1684_job1(m1,m2); VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval()); + bug1684_job2(m1,m2); + VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval()); + // This one still fails after our swap's workaround, + // but I expect users not to implement their own swap.
+ // bug1684_job3(m1,m2); + // VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval()); } EIGEN_DECLARE_TEST(array_reverse) @@ -159,7 +191,7 @@ EIGEN_DECLARE_TEST(array_reverse) for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( reverse(Matrix()) ); CALL_SUBTEST_2( reverse(Matrix2f()) ); - CALL_SUBTEST_3( reverse(Matrix4f()) ); + // CALL_SUBTEST_3( reverse(Matrix4f()) ); CALL_SUBTEST_4( reverse(Matrix4d()) ); CALL_SUBTEST_5( reverse(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( reverse(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); From 6a340031416e91e6004f5c0dcb94b373571e79d7 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 13 Mar 2019 11:52:41 -0700 Subject: [PATCH 287/295] Remove EIGEN_MPL2_ONLY guard in IncompleteCholesky that is no longer needed after the AMD reordering code was relicensed to MPL2. --- Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 5a827c52c..e5d0308ec 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -41,13 +41,7 @@ namespace Eigen { * the info() method, then you can either increase the initial shift, or better use another preconditioning technique. 
* */ -template -#else -NaturalOrdering -#endif -> +template > class IncompleteCholesky : public SparseSolverBase > { protected: From 45ab514fe2a2d0b63d6a8552814ce3de7687d50d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Mar 2019 10:08:12 +0100 Subject: [PATCH 288/295] revert debug stuff --- test/array_reverse.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp index adc9bdbfb..c77528a5b 100644 --- a/test/array_reverse.cpp +++ b/test/array_reverse.cpp @@ -191,7 +191,7 @@ EIGEN_DECLARE_TEST(array_reverse) for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( reverse(Matrix()) ); CALL_SUBTEST_2( reverse(Matrix2f()) ); - // CALL_SUBTEST_3( reverse(Matrix4f()) ); + CALL_SUBTEST_3( reverse(Matrix4f()) ); CALL_SUBTEST_4( reverse(Matrix4d()) ); CALL_SUBTEST_5( reverse(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( reverse(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); From 97f9a46cb90bc66e402c3873b5bf7d6ed58252d0 Mon Sep 17 00:00:00 2001 From: David Tellenbach Date: Thu, 14 Mar 2019 10:18:24 +0100 Subject: [PATCH 289/295] PR 593: Add variadic ctor for DiagonalMatrix with unit tests --- Eigen/src/Core/DiagonalMatrix.h | 24 ++++ Eigen/src/Core/PlainObjectBase.h | 2 +- test/CMakeLists.txt | 1 + test/diagonal_matrix_variadic_ctor.cpp | 185 +++++++++++++++++++++++++ 4 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 test/diagonal_matrix_variadic_ctor.cpp diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index afab2f1b6..542685c65 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -178,6 +178,30 @@ class DiagonalMatrix EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {} + #if EIGEN_HAS_CXX11 + /** \brief Construct a diagonal matrix with fixed size
from an arbitrary number of coefficients. \cpp11 + * + * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients. + * + * \warning To construct a diagonal matrix of fixed size, the number of values passed to this + * constructor must match the fixed dimension of \c *this. + * + * \sa DiagonalMatrix(const Scalar&, const Scalar&) + * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args) + : m_diagonal(a0, a1, a2, args...) {} + + /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list>& list) + : m_diagonal(list) {} + #endif // EIGEN_HAS_CXX11 + /** Copy constructor. */ template EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index bae186ecb..4dbc40e5b 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -532,7 +532,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \only_for_vectors * * This constructor is for 1D array or vectors with more than 4 coefficients. - * There exists c++98 anologue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. + * There exists C++98 anologue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. * * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this * constructor must match the the fixed number of rows (resp. columns) of \c *this. 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f74e22c28..8c58f2a33 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -290,6 +290,7 @@ ei_add_test(num_dimensions) ei_add_test(stl_iterators) if(EIGEN_TEST_CXX11) ei_add_test(initializer_list_construction) + ei_add_test(diagonal_matrix_variadic_ctor) endif() add_executable(bug1213 bug1213.cpp bug1213_main.cpp) diff --git a/test/diagonal_matrix_variadic_ctor.cpp b/test/diagonal_matrix_variadic_ctor.cpp new file mode 100644 index 000000000..fbc8f8470 --- /dev/null +++ b/test/diagonal_matrix_variadic_ctor.cpp @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 David Tellenbach +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_NO_STATIC_ASSERT + +#include "main.h" + +template +void assertionTest() +{ + typedef DiagonalMatrix DiagMatrix5; + typedef DiagonalMatrix DiagMatrix7; + typedef DiagonalMatrix DiagMatrixX; + + Scalar raw[6]; + for (int i = 0; i < 6; ++i) { + raw[i] = internal::random(); + } + + VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[2], raw[3]})); + VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[3]})); + VERIFY_RAISES_ASSERT((DiagMatrix7{raw[0], raw[1], raw[2], raw[3]})); + + VERIFY_RAISES_ASSERT((DiagMatrixX { + {raw[0], raw[1], raw[2]}, + {raw[3], raw[4], raw[5]} + })); +} + +#define VERIFY_IMPLICIT_CONVERSION_3(DIAGTYPE, V0, V1, V2) \ + DIAGTYPE d(V0, V1, V2); \ + DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix(); \ + VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0); \ + VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1); \ + VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2); + +#define VERIFY_IMPLICIT_CONVERSION_4(DIAGTYPE, V0, V1, V2, V3) \ + DIAGTYPE d(V0, V1, V2, V3); \ + DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix(); \ + 
VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0); \ + VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1); \ + VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2); \ + VERIFY_IS_APPROX(Dense(3, 3), (Scalar)V3); + +#define VERIFY_IMPLICIT_CONVERSION_5(DIAGTYPE, V0, V1, V2, V3, V4) \ + DIAGTYPE d(V0, V1, V2, V3, V4); \ + DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix(); \ + VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0); \ + VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1); \ + VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2); \ + VERIFY_IS_APPROX(Dense(3, 3), (Scalar)V3); \ + VERIFY_IS_APPROX(Dense(4, 4), (Scalar)V4); + +template +void constructorTest() +{ + typedef DiagonalMatrix DiagonalMatrix0; + typedef DiagonalMatrix DiagonalMatrix3; + typedef DiagonalMatrix DiagonalMatrix4; + typedef DiagonalMatrix DiagonalMatrixX; + + Scalar raw[7]; + for (int k = 0; k < 7; ++k) raw[k] = internal::random(); + + // Fixed-sized matrices + { + DiagonalMatrix0 a {{}}; + VERIFY(a.rows() == 0); + VERIFY(a.cols() == 0); + typename DiagonalMatrix0::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + { + DiagonalMatrix3 a {{raw[0], raw[1], raw[2]}}; + VERIFY(a.rows() == 3); + VERIFY(a.cols() == 3); + typename DiagonalMatrix3::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + { + DiagonalMatrix4 a {{raw[0], raw[1], raw[2], raw[3]}}; + VERIFY(a.rows() == 4); + VERIFY(a.cols() == 4); + typename DiagonalMatrix4::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + + // dynamically sized matrices + { + DiagonalMatrixX a{{}}; + VERIFY(a.rows() == 0); + VERIFY(a.rows() == 0); + typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + { + DiagonalMatrixX a{{raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6]}}; + VERIFY(a.rows() == 7); + VERIFY(a.rows() == 7); + typename 
DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } +} + +template<> +void constructorTest() +{ + typedef float Scalar; + + typedef DiagonalMatrix DiagonalMatrix0; + typedef DiagonalMatrix DiagonalMatrix3; + typedef DiagonalMatrix DiagonalMatrix4; + typedef DiagonalMatrix DiagonalMatrix5; + typedef DiagonalMatrix DiagonalMatrixX; + + Scalar raw[7]; + for (int k = 0; k < 7; ++k) raw[k] = internal::random(); + + // Fixed-sized matrices + { + DiagonalMatrix0 a {{}}; + VERIFY(a.rows() == 0); + VERIFY(a.cols() == 0); + typename DiagonalMatrix0::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + { + DiagonalMatrix3 a {{raw[0], raw[1], raw[2]}}; + VERIFY(a.rows() == 3); + VERIFY(a.cols() == 3); + typename DiagonalMatrix3::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + { + DiagonalMatrix4 a {{raw[0], raw[1], raw[2], raw[3]}}; + VERIFY(a.rows() == 4); + VERIFY(a.cols() == 4); + typename DiagonalMatrix4::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + + // dynamically sized matrices + { + DiagonalMatrixX a{{}}; + VERIFY(a.rows() == 0); + VERIFY(a.rows() == 0); + typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + { + DiagonalMatrixX a{{raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6]}}; + VERIFY(a.rows() == 7); + VERIFY(a.rows() == 7); + typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix(); + for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]); + } + { VERIFY_IMPLICIT_CONVERSION_3(DiagonalMatrix3, 1.2647, 2.56f, -3); } + { VERIFY_IMPLICIT_CONVERSION_4(DiagonalMatrix4, 1.2647, 2.56f, -3, 3.23f); } + { VERIFY_IMPLICIT_CONVERSION_5(DiagonalMatrix5, 1.2647, 2.56f, -3, 3.23f, 2); } +} + 
+EIGEN_DECLARE_TEST(diagonal_matrix_variadic_ctor) +{ + CALL_SUBTEST_1(assertionTest()); + CALL_SUBTEST_1(assertionTest()); + CALL_SUBTEST_1(assertionTest()); + CALL_SUBTEST_1(assertionTest()); + CALL_SUBTEST_1(assertionTest()); + CALL_SUBTEST_1(assertionTest()); + CALL_SUBTEST_1(assertionTest>()); + + CALL_SUBTEST_2(constructorTest()); + CALL_SUBTEST_2(constructorTest()); + CALL_SUBTEST_2(constructorTest()); + CALL_SUBTEST_2(constructorTest()); + CALL_SUBTEST_2(constructorTest()); + CALL_SUBTEST_2(constructorTest()); + CALL_SUBTEST_2(constructorTest>()); +} From b013176e520ffacad723ef4a3eb772d373d7313e Mon Sep 17 00:00:00 2001 From: David Tellenbach Date: Thu, 14 Mar 2019 11:40:28 +0100 Subject: [PATCH 290/295] Remove undefined std::complex --- test/initializer_list_construction.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/initializer_list_construction.cpp b/test/initializer_list_construction.cpp index 0d1c6f2f3..b84e9ba72 100644 --- a/test/initializer_list_construction.cpp +++ b/test/initializer_list_construction.cpp @@ -340,7 +340,6 @@ EIGEN_DECLARE_TEST(initializer_list_construction) CALL_SUBTEST_1(initializerListVectorConstruction()); CALL_SUBTEST_1(initializerListVectorConstruction()); CALL_SUBTEST_1(initializerListVectorConstruction()); - CALL_SUBTEST_1(initializerListVectorConstruction>()); CALL_SUBTEST_1(initializerListVectorConstruction>()); CALL_SUBTEST_1(initializerListVectorConstruction>()); @@ -350,7 +349,6 @@ EIGEN_DECLARE_TEST(initializer_list_construction) CALL_SUBTEST_2(initializerListMatrixConstruction()); CALL_SUBTEST_2(initializerListMatrixConstruction()); CALL_SUBTEST_2(initializerListMatrixConstruction()); - CALL_SUBTEST_2(initializerListMatrixConstruction>()); CALL_SUBTEST_2(initializerListMatrixConstruction>()); CALL_SUBTEST_2(initializerListMatrixConstruction>()); @@ -360,7 +358,6 @@ EIGEN_DECLARE_TEST(initializer_list_construction) CALL_SUBTEST_3(initializerListArrayConstruction()); 
CALL_SUBTEST_3(initializerListArrayConstruction()); CALL_SUBTEST_3(initializerListArrayConstruction()); - CALL_SUBTEST_3(initializerListArrayConstruction>()); CALL_SUBTEST_3(initializerListArrayConstruction>()); CALL_SUBTEST_3(initializerListArrayConstruction>()); @@ -370,7 +367,6 @@ EIGEN_DECLARE_TEST(initializer_list_construction) CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction()); - CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction>()); CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction>()); CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction>()); CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction3<0>()); @@ -384,7 +380,6 @@ EIGEN_DECLARE_TEST(initializer_list_construction) CALL_SUBTEST_6(dynamicVectorConstruction()); CALL_SUBTEST_6(dynamicVectorConstruction()); CALL_SUBTEST_6(dynamicVectorConstruction()); - CALL_SUBTEST_6(dynamicVectorConstruction>()); CALL_SUBTEST_6(dynamicVectorConstruction>()); CALL_SUBTEST_6(dynamicVectorConstruction>()); } \ No newline at end of file From 8450a6d519454f318f490c797e089c2f0fc540f2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Mar 2019 15:18:06 -0700 Subject: [PATCH 291/295] Clean up half packet traits and add a few more missing packet ops. 
--- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 73 ++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 5917ec1b8..869fa7ec6 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -30,6 +30,7 @@ template<> struct packet_traits : default_packet_traits size=2, HasHalfPacket = 0, HasAdd = 1, + HasSub = 1, HasMul = 1, HasDiv = 1, HasSqrt = 1, @@ -572,6 +573,7 @@ struct packet_traits : default_packet_traits { HasAdd = 1, HasSub = 1, HasMul = 1, + HasDiv = 1, HasNegate = 1, HasAbs = 0, HasAbs2 = 0, @@ -579,7 +581,6 @@ struct packet_traits : default_packet_traits { HasMax = 0, HasConj = 0, HasSetLinear = 0, - HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -770,6 +771,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, con return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pdiv(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { Packet16f from_float = half2float(from); return half(predux(from_float)); @@ -1054,6 +1062,7 @@ struct packet_traits : default_packet_traits { HasAdd = 1, HasSub = 1, HasMul = 1, + HasDiv = 1, HasNegate = 1, HasAbs = 0, HasAbs2 = 0, @@ -1061,7 +1070,6 @@ struct packet_traits : default_packet_traits { HasMax = 0, HasConj = 0, HasSetLinear = 0, - HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -1221,6 +1229,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pdiv(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h 
pgather(const Eigen::half* from, Index stride) { Packet8h result; @@ -1407,9 +1422,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 4, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, HasNegate = 0, HasAbs = 0, HasAbs2 = 0, @@ -1417,7 +1433,6 @@ struct packet_traits : default_packet_traits { HasMax = 0, HasConj = 0, HasSetLinear = 0, - HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -1464,6 +1479,29 @@ template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const return result; } +template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha - hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { __int64_t a64 = _mm_cvtm64_si64(a.x); __int64_t b64 = _mm_cvtm64_si64(b.x); @@ -1487,6 +1525,29 @@ template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const return result; } +template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + 
Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha / hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { Packet4h result; result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); From bd9c2ae3fde7457308ef20abdf92b060241490bd Mon Sep 17 00:00:00 2001 From: David Tellenbach Date: Fri, 15 Mar 2019 15:29:17 +0100 Subject: [PATCH 292/295] Fix include guard comments --- unsupported/Eigen/src/Skyline/SkylineInplaceLU.h | 2 +- unsupported/Eigen/src/Skyline/SkylineMatrix.h | 2 +- unsupported/Eigen/src/Skyline/SkylineMatrixBase.h | 2 +- unsupported/Eigen/src/Skyline/SkylineStorage.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h index bda057a85..6d0370d5b 100644 --- a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +++ b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h @@ -349,4 +349,4 @@ bool SkylineInplaceLU::solve(const MatrixBase &b, MatrixBa } // end namespace Eigen -#endif // EIGEN_SKYLINELU_H +#endif // EIGEN_SKYLINEINPLACELU_H diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/unsupported/Eigen/src/Skyline/SkylineMatrix.h index f77d79a04..7c7eace7f 100644 --- a/unsupported/Eigen/src/Skyline/SkylineMatrix.h +++ b/unsupported/Eigen/src/Skyline/SkylineMatrix.h @@ -859,4 +859,4 @@ protected: } // end namespace Eigen -#endif // EIGEN_SkylineMatrix_H +#endif // 
EIGEN_SKYLINEMATRIX_H diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h index b3a237230..753c1b33d 100644 --- a/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +++ b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h @@ -209,4 +209,4 @@ protected: } // end namespace Eigen -#endif // EIGEN_SkylineMatrixBase_H +#endif // EIGEN_SKYLINEMATRIXBASE_H diff --git a/unsupported/Eigen/src/Skyline/SkylineStorage.h b/unsupported/Eigen/src/Skyline/SkylineStorage.h index 378a8deb4..cc7514f12 100644 --- a/unsupported/Eigen/src/Skyline/SkylineStorage.h +++ b/unsupported/Eigen/src/Skyline/SkylineStorage.h @@ -256,4 +256,4 @@ public: } // end namespace Eigen -#endif // EIGEN_COMPRESSED_STORAGE_H +#endif // EIGEN_SKYLINE_STORAGE_H From e42f9aa68a53a0a85f7c6ee257c25428c955eea2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 15 Mar 2019 17:15:14 -0700 Subject: [PATCH 293/295] Make clipping outside [-18:18] consistent for vectorized and non-vectorized paths of scalar_logistic_. 
--- Eigen/src/Core/functors/UnaryFunctors.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 1d5eb3678..03f167ac9 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -927,8 +927,9 @@ template <> struct scalar_logistic_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { - const float one = 1.0f; - return one / (one + numext::exp(-x)); + if (x < -18.0f) return 0.0f; + else if (x > 18.0f) return 1.0f; + else return 1.0f / (1.0f + numext::exp(-x)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE From cf7e2e277f76d965fe37dac5a3fecf588f58e3b7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 17 Mar 2019 21:59:30 +0100 Subject: [PATCH 294/295] bug #1692: enable enum as sizes of Matrix and Array --- Eigen/src/Core/PlainObjectBase.h | 12 +++++++----- test/constructor.cpp | 13 +++++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 4dbc40e5b..6de78fd2f 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -802,8 +802,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if::type* = 0) { - EIGEN_STATIC_ASSERT(bool(NumTraits::IsInteger) && - bool(NumTraits::IsInteger), + const bool t0_is_integer_alike = internal::is_valid_index_type::value; + const bool t1_is_integer_alike = internal::is_valid_index_type::value; + EIGEN_STATIC_ASSERT(t0_is_integer_alike && + t1_is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(rows,cols); } @@ -838,9 +840,9 @@ class PlainObjectBase : public internal::dense_xpr_base::type && ((!internal::is_same::XprKind,ArrayXpr>::value || 
Base::SizeAtCompileTime==Dynamic)),T>::type* = 0) { // NOTE MSVC 2008 complains if we directly put bool(NumTraits::IsInteger) as the EIGEN_STATIC_ASSERT argument. - const bool is_integer = NumTraits::IsInteger; - EIGEN_UNUSED_VARIABLE(is_integer); - EIGEN_STATIC_ASSERT(is_integer, + const bool is_integer_alike = internal::is_valid_index_type::value; + EIGEN_UNUSED_VARIABLE(is_integer_alike); + EIGEN_STATIC_ASSERT(is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(size); } diff --git a/test/constructor.cpp b/test/constructor.cpp index 1dd3bc3c0..e793dfca1 100644 --- a/test/constructor.cpp +++ b/test/constructor.cpp @@ -81,4 +81,17 @@ EIGEN_DECLARE_TEST(constructor) Array a(123); VERIFY_IS_EQUAL(a(4), 123.f); } + { + enum { M = 12, N = 7}; + MatrixXi m1(M,N); + VERIFY_IS_EQUAL(m1.rows(),M); + VERIFY_IS_EQUAL(m1.cols(),N); + ArrayXXi a1(M,N); + VERIFY_IS_EQUAL(a1.rows(),M); + VERIFY_IS_EQUAL(a1.cols(),N); + VectorXi v1(M); + VERIFY_IS_EQUAL(v1.size(),M); + ArrayXi a2(M); + VERIFY_IS_EQUAL(a2.size(),M); + } } From 48898a988a5159d2f3c0ff00bd737d17b202e844 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 18 Mar 2019 11:38:36 +0100 Subject: [PATCH 295/295] fix unit test in c++03: c++03 does not allow passing local or anonymous enum as template param --- test/constructor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/constructor.cpp b/test/constructor.cpp index e793dfca1..ffd5e802a 100644 --- a/test/constructor.cpp +++ b/test/constructor.cpp @@ -20,6 +20,8 @@ template struct Wrapper inline operator MatrixType& () { return m_mat; } }; +enum my_sizes { M = 12, N = 7}; + template void ctor_init1(const MatrixType& m) { // Check logic in PlainObjectBase::_init1 @@ -82,7 +84,6 @@ EIGEN_DECLARE_TEST(constructor) VERIFY_IS_EQUAL(a(4), 123.f); } { - enum { M = 12, N = 7}; MatrixXi m1(M,N); VERIFY_IS_EQUAL(m1.rows(),M); VERIFY_IS_EQUAL(m1.cols(),N);