From d6d39c7ddb127d91ebfa4ea62e93ea51036f1760 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 7 Jun 2016 14:35:08 -0700 Subject: [PATCH 01/31] Added missing EIGEN_DEVICE_FUNC --- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 52cfc2824..d34f1e328 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -148,7 +148,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } - const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } protected: TensorEvaluator m_impl; From 8fd57a97f203edac3f7e8681eafe752294386a24 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 7 Jun 2016 18:22:18 -0700 Subject: [PATCH 02/31] Enable the vectorization of adds and mults of fp16 --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 51386506f..959dff886 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -28,6 +28,8 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=2, HasHalfPacket = 0, + HasAdd = 1, + HasMul = 1, HasDiv = 1, HasSqrt = 1, HasRsqrt = 1, From 9dd9d58273362d643eaa0b8f4f16f8aa3d5ef6cd Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 8 Jun 2016 15:36:42 +0200 Subject: [PATCH 03/31] Copied a regression test from 3.2 branch. --- test/geo_homogeneous.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp index bf63c69ec..305794cdf 100644 --- a/test/geo_homogeneous.cpp +++ b/test/geo_homogeneous.cpp @@ -58,6 +58,8 @@ template void homogeneous(void) T2MatrixType t2 = T2MatrixType::Random(); VERIFY_IS_APPROX(t2 * (v0.homogeneous().eval()), t2 * v0.homogeneous()); VERIFY_IS_APPROX(t2 * (m0.colwise().homogeneous().eval()), t2 * m0.colwise().homogeneous()); + VERIFY_IS_APPROX(t2 * (v0.homogeneous().asDiagonal()), t2 * hv0.asDiagonal()); + VERIFY_IS_APPROX((v0.homogeneous().asDiagonal()) * t2, hv0.asDiagonal() * t2); VERIFY_IS_APPROX((v0.transpose().rowwise().homogeneous().eval()) * t2, v0.transpose().rowwise().homogeneous() * t2); From 9fc8379328dad5fb74249003f56fb608c304ae4d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 8 Jun 2016 16:39:11 +0200 Subject: [PATCH 04/31] Fix extraction of complex eigenvalue pairs in real generalized eigenvalue problems. --- .../src/Eigenvalues/GeneralizedEigenSolver.h | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index a9d6790d5..07a9ccf46 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -327,13 +327,33 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp } else { - Scalar p = Scalar(0.5) * (m_matS.coeff(i, i) - m_matS.coeff(i+1, i+1)); - Scalar z = sqrt(abs(p * p + m_matS.coeff(i+1, i) * m_matS.coeff(i, i+1))); - m_alphas.coeffRef(i) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, z); - m_alphas.coeffRef(i+1) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, -z); + // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a triangular 2x2 block T + // From the eigen decomposition of T = U * E * U^-1, + // we can extract the eigenvalues of (U^-1 * S * U) / E + // Here, we can take advantage that E = diag(T), and U = [ 1 T_01 ; 0 T_11-T_00], and U^-1 = [1 -T_11/(T_11-T_00) ; 0 1/(T_11-T_00)]. + // Then taking beta=T_00*T_11*(T_11-T_00), we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00) * (T_11-T_00): + + // T = [a b ; 0 c] + // S = [e f ; g h] + RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i, i+1), c = m_realQZ.matrixT().coeff(i+1, i+1); + RealScalar e = m_matS.coeff(i, i), f = m_matS.coeff(i, i+1), g = m_matS.coeff(i+1, i), h = m_matS.coeff(i+1, i+1); + RealScalar d = c-a; + RealScalar gb = g*b; + Matrix A; + A << (e*d-gb)*c, ((e*b+f*d-h*b)*d-gb*b)*a, + g*c , (gb+h*d)*a; + + // NOTE, we could also compute the SVD of T's block during the QZ factorization so that the respective T block is guaranteed to be diagonal, + // and then we could directly apply the formula below (while taking care of scaling S columns by T11,T00): + + Scalar p = Scalar(0.5) * (A.coeff(i, i) - A.coeff(i+1, i+1)); + Scalar z = sqrt(abs(p * p + A.coeff(i+1, i) * A.coeff(i, i+1))); + m_alphas.coeffRef(i) = ComplexScalar(A.coeff(i+1, i+1) + p, z); + m_alphas.coeffRef(i+1) = ComplexScalar(A.coeff(i+1, i+1) + p, -z); + + m_betas.coeffRef(i) = + m_betas.coeffRef(i+1) = a*c*d; - m_betas.coeffRef(i) = m_realQZ.matrixT().coeff(i,i); - m_betas.coeffRef(i+1) = m_realQZ.matrixT().coeff(i,i); i += 2; } } From df095cab104550a8179c28e93d477406dfab6849 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 8 Jun 2016 18:31:19 +0200 Subject: [PATCH 05/31] Fixes for PARDISO: warnings, and defaults to metis+ in-core mode. --- Eigen/src/PardisoSupport/PardisoSupport.h | 35 ++++++++++++++--------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 80d914f25..091c3970e 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -183,7 +183,7 @@ class PardisoImpl : public SparseSolverBase { if(m_isInitialized) // Factorization ran at least once { - internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, -1, m_size,0, 0, 0, m_perm.data(), 0, + internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, -1, internal::convert_index(m_size),0, 0, 0, m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL); m_isInitialized = false; } @@ -194,11 +194,11 @@ class PardisoImpl : public SparseSolverBase m_type = type; bool symmetric = std::abs(m_type) < 10; m_iparm[0] = 1; // No solver default - m_iparm[1] = 3; // use Metis for the ordering - m_iparm[2] = 1; // Numbers of processors, value of OMP_NUM_THREADS + m_iparm[1] = 2; // use Metis for the ordering + m_iparm[2] = 0; // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??) m_iparm[3] = 0; // No iterative-direct algorithm m_iparm[4] = 0; // No user fill-in reducing permutation - m_iparm[5] = 0; // Write solution into x + m_iparm[5] = 0; // Write solution into x, b is left unchanged m_iparm[6] = 0; // Not in use m_iparm[7] = 2; // Max numbers of iterative refinement steps m_iparm[8] = 0; // Not in use @@ -219,7 +219,8 @@ class PardisoImpl : public SparseSolverBase m_iparm[26] = 0; // No matrix checker m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0; m_iparm[34] = 1; // C indexing - m_iparm[59] = 1; // Automatic switch between In-Core and Out-of-Core modes + m_iparm[36] = 0; // CSR + m_iparm[59] = 0; // 0 - In-Core ; 1 - Automatic switch between In-Core and Out-of-Core modes ; 2 - Out-of-Core memset(m_pt, 0, sizeof(m_pt)); } @@ -246,7 +247,7 @@ class PardisoImpl : public SparseSolverBase mutable SparseMatrixType m_matrix; mutable ComputationInfo m_info; bool m_analysisIsOk, m_factorizationIsOk; - Index m_type, m_msglvl; + StorageIndex m_type, m_msglvl; mutable void *m_pt[64]; mutable ParameterType m_iparm; mutable IntColVectorType m_perm; @@ -265,10 +266,9 @@ Derived& PardisoImpl::compute(const MatrixType& a) derived().getMatrix(a); Index error; - error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 12, m_size, + error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 12, internal::convert_index(m_size), m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL); - manageErrorCode(error); m_analysisIsOk = true; m_factorizationIsOk = true; @@ -287,7 +287,7 @@ Derived& PardisoImpl::analyzePattern(const MatrixType& a) derived().getMatrix(a); Index error; - error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 11, m_size, + error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 11, internal::convert_index(m_size), m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL); @@ -306,8 +306,8 @@ Derived& PardisoImpl::factorize(const MatrixType& a) derived().getMatrix(a); - Index error; - error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 22, m_size, + Index error; + error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 22, internal::convert_index(m_size), m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL); @@ -354,9 +354,9 @@ void PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase } Index error; - error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 33, m_size, + error = internal::pardiso_run_selector::run(m_pt, 1, 1, m_type, 33, internal::convert_index(m_size), m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), - m_perm.data(), nrhs, m_iparm.data(), m_msglvl, + m_perm.data(), internal::convert_index(nrhs), m_iparm.data(), m_msglvl, rhs_ptr, x.derived().data()); manageErrorCode(error); @@ -371,6 +371,9 @@ void PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible. * The vectors or matrices X and B can be either dense or sparse. * + * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set: + * \code solver.pardisoParameterArray()[59] = 1; \endcode + * * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * * \implsparsesolverconcept @@ -421,6 +424,9 @@ class PardisoLU : public PardisoImpl< PardisoLU > * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite. * The vectors or matrices X and B can be either dense or sparse. * + * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set: + * \code solver.pardisoParameterArray()[59] = 1; \endcode + * * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used. * Upper|Lower can be used to tell both triangular parts can be used as input. @@ -480,6 +486,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > * For complex matrices, A can also be symmetric only, see the \a Options template parameter. * The vectors or matrices X and B can be either dense or sparse. * + * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set: + * \code solver.pardisoParameterArray()[59] = 1; \endcode + * * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used. * Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix. From 0beabb4776b887c25977186b5af6811eb49aaa23 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Wed, 8 Jun 2016 16:12:04 -0400 Subject: [PATCH 06/31] Fixed type conversion from int --- Eigen/src/Geometry/Transform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 1a06c1e35..073f4dcd1 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -1367,7 +1367,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); Matrix rhs; - rhs.template head() = other; rhs[Dim] = 1; + rhs.template head() = other; rhs[Dim] = typename ResultType::Scalar(1); Matrix res(T.matrix() * rhs); return res.template head(); } From a20d2ec1c02d2629cbfa8871898b64cd22445595 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Jun 2016 16:16:22 +0200 Subject: [PATCH 07/31] Fix shadow variable, and indexing. --- Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 07a9ccf46..08caed281 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -339,17 +339,17 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp RealScalar e = m_matS.coeff(i, i), f = m_matS.coeff(i, i+1), g = m_matS.coeff(i+1, i), h = m_matS.coeff(i+1, i+1); RealScalar d = c-a; RealScalar gb = g*b; - Matrix A; - A << (e*d-gb)*c, ((e*b+f*d-h*b)*d-gb*b)*a, - g*c , (gb+h*d)*a; + Matrix S2; + S2 << (e*d-gb)*c, ((e*b+f*d-h*b)*d-gb*b)*a, + g*c , (gb+h*d)*a; // NOTE, we could also compute the SVD of T's block during the QZ factorization so that the respective T block is guaranteed to be diagonal, // and then we could directly apply the formula below (while taking care of scaling S columns by T11,T00): - Scalar p = Scalar(0.5) * (A.coeff(i, i) - A.coeff(i+1, i+1)); - Scalar z = sqrt(abs(p * p + A.coeff(i+1, i) * A.coeff(i, i+1))); - m_alphas.coeffRef(i) = ComplexScalar(A.coeff(i+1, i+1) + p, z); - m_alphas.coeffRef(i+1) = ComplexScalar(A.coeff(i+1, i+1) + p, -z); + Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1)); + Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1))); + m_alphas.coeffRef(i) = ComplexScalar(S2.coeff(1,1) + p, z); + m_alphas.coeffRef(i+1) = ComplexScalar(S2.coeff(1,1) + p, -z); m_betas.coeffRef(i) = m_betas.coeffRef(i+1) = a*c*d; From 15890c304edbccedc8a989468ed3fc475f428059 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Jun 2016 16:17:27 +0200 Subject: [PATCH 08/31] Add unit test for non symmetric generalized eigenvalues --- test/eigensolver_generalized_real.cpp | 32 ++++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp index a46a2e50e..da14482de 100644 --- a/test/eigensolver_generalized_real.cpp +++ b/test/eigensolver_generalized_real.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2012 Gael Guennebaud +// Copyright (C) 2012-2016 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,6 +10,7 @@ #include "main.h" #include #include +#include template void generalized_eigensolver_real(const MatrixType& m) { @@ -21,6 +22,7 @@ template void generalized_eigensolver_real(const MatrixType Index cols = m.cols(); typedef typename MatrixType::Scalar Scalar; + typedef std::complex ComplexScalar; typedef Matrix VectorType; MatrixType a = MatrixType::Random(rows,cols); @@ -31,14 +33,28 @@ template void generalized_eigensolver_real(const MatrixType MatrixType spdB = b.adjoint() * b + b1.adjoint() * b1; // lets compare to GeneralizedSelfAdjointEigenSolver - GeneralizedSelfAdjointEigenSolver symmEig(spdA, spdB); - GeneralizedEigenSolver eig(spdA, spdB); + { + GeneralizedSelfAdjointEigenSolver symmEig(spdA, spdB); + GeneralizedEigenSolver eig(spdA, spdB); - VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0); + VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0); - VectorType realEigenvalues = eig.eigenvalues().real(); - std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size()); - VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues()); + VectorType realEigenvalues = eig.eigenvalues().real(); + std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size()); + VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues()); + } + + // non symmetric case: + { + GeneralizedEigenSolver eig(a,b); + for(Index k=0; k tmp = (eig.betas()(k)*a).template cast() - eig.alphas()(k)*b; + if(tmp.norm()>(std::numeric_limits::min)()) + tmp /= tmp.norm(); + VERIFY_IS_MUCH_SMALLER_THAN( std::abs(tmp.determinant()), Scalar(1) ); + } + } // regression test for bug 1098 { @@ -57,7 +73,7 @@ void test_eigensolver_generalized_real() s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(s,s)) ); - // some trivial but implementation-wise tricky cases + // some trivial but implementation-wise special cases CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(1,1)) ); CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) ); CALL_SUBTEST_3( generalized_eigensolver_real(Matrix()) ); From c1f9ca925405c8fad126f327b4bdca7f983fc96e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Jun 2016 17:11:03 +0200 Subject: [PATCH 09/31] Update RealQZ to reduce 2x2 diagonal block of T corresponding to non reduced diagonal block of S to positive diagonal form. This step involve a real 2x2 SVD problem. The respective routine is thus in src/misc/ to be shared by both EVD and AVD modules. --- Eigen/Eigenvalues | 1 + Eigen/SVD | 1 + Eigen/src/Eigenvalues/RealQZ.h | 32 +++++++++++++++++++- Eigen/src/SVD/JacobiSVD.h | 32 -------------------- Eigen/src/misc/RealSvd2x2.h | 54 ++++++++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 33 deletions(-) create mode 100644 Eigen/src/misc/RealSvd2x2.h diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues index ea93eb303..216a6d51d 100644 --- a/Eigen/Eigenvalues +++ b/Eigen/Eigenvalues @@ -32,6 +32,7 @@ * \endcode */ +#include "src/misc/RealSvd2x2.h" #include "src/Eigenvalues/Tridiagonalization.h" #include "src/Eigenvalues/RealSchur.h" #include "src/Eigenvalues/EigenSolver.h" diff --git a/Eigen/SVD b/Eigen/SVD index b353f3f54..565d9c90d 100644 --- a/Eigen/SVD +++ b/Eigen/SVD @@ -31,6 +31,7 @@ * \endcode */ +#include "src/misc/RealSvd2x2.h" #include "src/SVD/UpperBidiagonalization.h" #include "src/SVD/SVDBase.h" #include "src/SVD/JacobiSVD.h" diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index a62071d42..c4715b954 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -552,7 +552,6 @@ namespace Eigen { m_T.coeffRef(l,l-1) = Scalar(0.0); } - template RealQZ& RealQZ::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ) { @@ -616,6 +615,37 @@ namespace Eigen { } // check if we converged before reaching iterations limit m_info = (local_iter j_left, j_right; + internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right); + + // Apply resulting Jacobi rotations + m_T.applyOnTheLeft(i,i+1,j_left); + m_T.applyOnTheRight(i,i+1,j_right); + m_S.applyOnTheLeft(i,i+1,j_left); + m_S.applyOnTheRight(i,i+1,j_right); + m_T(i,i+1) = Scalar(0); + + if(m_computeQZ) { + m_Q.applyOnTheRight(i,i+1,j_left.transpose()); + m_Z.applyOnTheLeft(i,i+1,j_right.transpose()); + } + + i++; + } + } + } + return *this; } // end compute diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 1940c8294..b83fd7a4d 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -419,38 +419,6 @@ struct svd_precondition_2x2_block_to_be_real } }; -template -void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, - JacobiRotation *j_left, - JacobiRotation *j_right) -{ - using std::sqrt; - using std::abs; - Matrix m; - m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)), - numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q)); - JacobiRotation rot1; - RealScalar t = m.coeff(0,0) + m.coeff(1,1); - RealScalar d = m.coeff(1,0) - m.coeff(0,1); - if(d == RealScalar(0)) - { - rot1.s() = RealScalar(0); - rot1.c() = RealScalar(1); - } - else - { - // If d!=0, then t/d cannot overflow because the magnitude of the - // entries forming d are not too small compared to the ones forming t. - RealScalar u = t / d; - RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u)); - rot1.s() = RealScalar(1) / tmp; - rot1.c() = u / tmp; - } - m.applyOnTheLeft(0,1,rot1); - j_right->makeJacobi(m,0,1); - *j_left = rot1 * j_right->transpose(); -} - template struct traits > { diff --git a/Eigen/src/misc/RealSvd2x2.h b/Eigen/src/misc/RealSvd2x2.h new file mode 100644 index 000000000..cdd7777d2 --- /dev/null +++ b/Eigen/src/misc/RealSvd2x2.h @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009-2010 Benoit Jacob +// Copyright (C) 2013-2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_REALSVD2X2_H +#define EIGEN_REALSVD2X2_H + +namespace Eigen { + +namespace internal { + +template +void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, + JacobiRotation *j_left, + JacobiRotation *j_right) +{ + using std::sqrt; + using std::abs; + Matrix m; + m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)), + numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q)); + JacobiRotation rot1; + RealScalar t = m.coeff(0,0) + m.coeff(1,1); + RealScalar d = m.coeff(1,0) - m.coeff(0,1); + if(d == RealScalar(0)) + { + rot1.s() = RealScalar(0); + rot1.c() = RealScalar(1); + } + else + { + // If d!=0, then t/d cannot overflow because the magnitude of the + // entries forming d are not too small compared to the ones forming t. + RealScalar u = t / d; + RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u)); + rot1.s() = RealScalar(1) / tmp; + rot1.c() = u / tmp; + } + m.applyOnTheLeft(0,1,rot1); + j_right->makeJacobi(m,0,1); + *j_left = rot1 * j_right->transpose(); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_REALSVD2X2_H \ No newline at end of file From 2bd59b0e0d667dcdcb6b070596a1bf023e3e88f1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Jun 2016 17:12:03 +0200 Subject: [PATCH 10/31] Take advantage that T is already diagonal in the extraction of generalized complex eigenvalues. --- .../src/Eigenvalues/GeneralizedEigenSolver.h | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 08caed281..9f43fd544 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -327,24 +327,13 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp } else { - // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a triangular 2x2 block T - // From the eigen decomposition of T = U * E * U^-1, - // we can extract the eigenvalues of (U^-1 * S * U) / E - // Here, we can take advantage that E = diag(T), and U = [ 1 T_01 ; 0 T_11-T_00], and U^-1 = [1 -T_11/(T_11-T_00) ; 0 1/(T_11-T_00)]. - // Then taking beta=T_00*T_11*(T_11-T_00), we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00) * (T_11-T_00): + // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a positive diagonal 2x2 block T + // Then taking beta=T_00*T_11, we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00): - // T = [a b ; 0 c] - // S = [e f ; g h] - RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i, i+1), c = m_realQZ.matrixT().coeff(i+1, i+1); - RealScalar e = m_matS.coeff(i, i), f = m_matS.coeff(i, i+1), g = m_matS.coeff(i+1, i), h = m_matS.coeff(i+1, i+1); - RealScalar d = c-a; - RealScalar gb = g*b; - Matrix S2; - S2 << (e*d-gb)*c, ((e*b+f*d-h*b)*d-gb*b)*a, - g*c , (gb+h*d)*a; - - // NOTE, we could also compute the SVD of T's block during the QZ factorization so that the respective T block is guaranteed to be diagonal, - // and then we could directly apply the formula below (while taking care of scaling S columns by T11,T00): + // T = [a 0] + // [0 b] + RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i+1, i+1); + Matrix S2 = m_matS.template block<2,2>(i,i) * Matrix(b,a).asDiagonal(); Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1)); Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1))); @@ -352,7 +341,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp m_alphas.coeffRef(i+1) = ComplexScalar(S2.coeff(1,1) + p, -z); m_betas.coeffRef(i) = - m_betas.coeffRef(i+1) = a*c*d; + m_betas.coeffRef(i+1) = a*b; i += 2; } From e2b383632699684d06ae180b3ad85cfb0189973a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Jun 2016 17:13:33 +0200 Subject: [PATCH 11/31] Include recent changesets that played with product's kernel --- bench/perf_monitoring/gemm/changesets.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index fb3e48e99..d00b4603a 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -44,4 +44,8 @@ before-evaluators 7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) 7591:09a8e2186610 # 3.3-alpha1 7650:b0f3c8f43025 # help clang inlining +8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs) +8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes +8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path +8985:d935df21a082 # Remove the rotating kernel. From aa33446dace833fbf06632e586c80119b3d8ac11 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Jun 2016 08:22:27 -0700 Subject: [PATCH 12/31] Improved support for vectorization of 16-bit floats --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 86 +++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorMeta.h | 5 ++ .../CXX11/src/Tensor/TensorReductionCuda.h | 8 +- 3 files changed, 95 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3dd32e9d1..bf52e490f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -84,6 +84,14 @@ struct functor_traits > { }; +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + // Standard reduction functors template struct SumReducer { @@ -119,6 +127,15 @@ template struct SumReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasAdd + }; +}; + + template struct MeanReducer { static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; @@ -162,6 +179,15 @@ template struct MeanReducer DenseIndex packetCount_; }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasAdd + }; +}; + + template struct MaxReducer { static const bool PacketAccess = packet_traits::HasMax; @@ -195,6 +221,15 @@ template struct MaxReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasMax + }; +}; + + template struct MinReducer { static const bool PacketAccess = packet_traits::HasMin; @@ -228,6 +263,14 @@ template struct MinReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::type::HasMin + }; +}; + template struct ProdReducer { @@ -263,6 +306,14 @@ template struct ProdReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::type::HasMul + }; +}; + struct AndReducer { @@ -280,6 +331,15 @@ struct AndReducer } }; +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + struct OrReducer { static const bool PacketAccess = false; static const bool IsStateful = false; @@ -295,6 +355,15 @@ struct OrReducer { } }; +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + // Argmin/Argmax reducers template struct ArgMaxTupleReducer { @@ -312,6 +381,15 @@ template struct ArgMaxTupleReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + + template struct ArgMinTupleReducer { static const bool PacketAccess = false; @@ -328,6 +406,14 @@ template struct ArgMinTupleReducer } }; +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + // Random number generation namespace { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b1645d56f..82a905a65 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -54,6 +54,11 @@ struct PacketType { // For CUDA packet types when using a GpuDevice #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <> + struct PacketType { + typedef half2 type; + static const int size = 2; + }; template <> struct PacketType { typedef float4 type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index e82530955..1b4fdd03f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -331,7 +331,7 @@ struct FullReducer { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same::value || - (internal::is_same::value && Op::PacketAccess)); + (internal::is_same::value && reducer_traits::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same::value; @@ -346,7 +346,7 @@ struct FullReducer { return; } - FullReductionLauncher::run(self, reducer, device, output, num_coeffs); + FullReductionLauncher::PacketAccess>::run(self, reducer, device, output, num_coeffs); } }; @@ -608,7 +608,7 @@ struct InnerReducer { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same::value || - (internal::is_same::value && Op::PacketAccess)); + (internal::is_same::value && reducer_traits::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same::value; @@ -627,7 +627,7 @@ struct InnerReducer { return true; } - return InnerReductionLauncher::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + return InnerReductionLauncher::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); } }; From 8f92c26319a6e06cce6d0ba2a252521c8096a2c0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Jun 2016 08:23:42 -0700 Subject: [PATCH 13/31] Improved code formatting --- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 61df8032d..0d084141d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -122,7 +122,7 @@ struct TensorEvaluator, Device> { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; + return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { From 14a112ee153f7d8554c3a8848e8a0461c7b82f13 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Jun 2016 08:25:22 -0700 Subject: [PATCH 14/31] Use signed integers more consistently to encode the number of threads to use to evaluate a tensor expression. --- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 12 ++++++------ .../Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index a60a17049..ee16cde9b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -202,7 +202,7 @@ struct TensorEvaluator::numThreads( + int num_threads = TensorCostModel::numThreads( static_cast(n) * m, cost, this->m_device.numThreads()); // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost @@ -301,7 +301,7 @@ struct TensorEvaluator f) const { typedef TensorCostModel CostModel; if (n <= 1 || numThreads() == 1 || - CostModel::numThreads(n, cost, numThreads()) == 1) { + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { f(0, n); return; } @@ -242,7 +242,7 @@ struct ThreadPoolDevice { // Recursively divide size into halves until we reach block_size. // Division code rounds mid to block_size, so we are guaranteed to get // block_count leaves that do actual computations. - Barrier barrier(block_count); + Barrier barrier(static_cast(block_count)); std::function handleRange; handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { if (last - first <= block_size) { @@ -268,7 +268,7 @@ struct ThreadPoolDevice { private: ThreadPoolInterface* pool_; - size_t num_threads_; + int num_threads_; }; From 66796e843df723eeac04d6dc725f6a8b27a574ba Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Jun 2016 08:50:01 -0700 Subject: [PATCH 15/31] Fixed definition of some of the reducer_traits --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index bf52e490f..e6ff70460 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -131,7 +131,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::type::HasAdd + PacketAccess = packet_traits::type>::HasAdd }; }; @@ -183,7 +183,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::type::HasAdd + PacketAccess = packet_traits::type>::HasAdd }; }; @@ -225,7 +225,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::type::HasMax + PacketAccess = packet_traits::type>::HasMax }; }; @@ -267,7 +267,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::type::HasMin + PacketAccess = packet_traits::type>::HasMin }; }; @@ -310,7 +310,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::MulCost, - PacketAccess = PacketType::type::HasMul + PacketAccess = packet_traits::type>::HasMul }; }; From 37638dafd71e39407d22d4269b32d1c73b84feb8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Jun 2016 10:29:52 -0700 Subject: [PATCH 16/31] Simplified the code that dispatches vectorized reductions on GPU --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 10 ++--- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 38 ++++++++++++------- .../CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index e6ff70460..a8e48fced 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -131,7 +131,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = packet_traits::type>::HasAdd + PacketAccess = PacketType::HasAdd }; }; @@ -183,7 +183,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = packet_traits::type>::HasAdd + PacketAccess = PacketType::HasAdd }; }; @@ -225,7 +225,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = packet_traits::type>::HasMax + PacketAccess = PacketType::HasMax }; }; @@ -267,7 +267,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = packet_traits::type>::HasMin + PacketAccess = PacketType::HasMin }; }; @@ -310,7 +310,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::MulCost, - PacketAccess = packet_traits::type>::HasMul + PacketAccess = PacketType::HasMul }; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 82a905a65..0f3778e6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -47,27 +47,39 @@ template <> struct max_n_1<0> { // Default packet types template -struct PacketType { +struct PacketType : internal::packet_traits { typedef typename internal::packet_traits::type type; - enum { size = internal::unpacket_traits::size }; }; // For CUDA packet types when using a GpuDevice #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template <> - struct PacketType { +struct PacketType { typedef half2 type; static const int size = 2; - }; -template <> -struct PacketType { - typedef float4 type; - static const int size = 4; -}; -template <> -struct PacketType { - typedef double2 type; - static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 1b4fdd03f..d9bbcd858 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -328,7 +328,7 @@ struct FullReducer { // Unfortunately nvidia doesn't support well exotic types such as complex, // so reduce the scope of the optimized version of the code to the simple case // of floats and half floats. - #ifdef EIGEN_HAS_CUDA_FP16 +#ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same::value || (internal::is_same::value && reducer_traits::PacketAccess)); From 1fc2746417c8b4bf1645c703b1f99b1871c8d16e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Jun 2016 22:52:37 +0200 Subject: [PATCH 17/31] Make Arrays's ctor/assignment noexcept --- Eigen/src/Core/Array.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index c0af4aa9d..7c2e0de16 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -149,7 +149,7 @@ class Array #if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC - Array(Array&& other) + Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) : Base(std::move(other)) { Base::_check_template_params(); @@ -157,7 +157,7 @@ class Array Base::_set_noalias(other); } EIGEN_DEVICE_FUNC - Array& operator=(Array&& other) + Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { other.swap(*this); return *this; From bd212438217dc3e169a35052f78e2e41a7ce3a3d Mon Sep 17 00:00:00 2001 From: Sean Templeton Date: Fri, 3 Jun 2016 10:51:35 -0500 Subject: [PATCH 18/31] Fix compile errors initializing packets on ARM DS-5 5.20 The ARM DS-5 5.20 compiler fails compiling with the following errors: "src/Core/arch/NEON/PacketMath.h", line 113: Error: #146: too many initializer values Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); ^ "src/Core/arch/NEON/PacketMath.h", line 118: Error: #146: too many initializer values Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); ^ "src/Core/arch/NEON/Complex.h", line 30: Error: #146: too many initializer values static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000); ^ "src/Core/arch/NEON/Complex.h", line 31: Error: #146: too many initializer values static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000); ^ The vectors are implemented as two doubles, hence the too many initializer values error. Changed the code to use intrinsic load functions which all compilers implementing NEON should have. --- Eigen/src/Core/arch/NEON/Complex.h | 5 +++-- Eigen/src/Core/arch/NEON/PacketMath.h | 15 +++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index d2d467936..234f29b80 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -14,8 +14,9 @@ namespace Eigen { namespace internal { -static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000); -static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000); +const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA ); +static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA ); //---------- float ---------- struct Packet2cf diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index deb2d7e42..fa16bc9c8 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -51,15 +51,12 @@ typedef uint32x4_t Packet4ui; #if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG //Special treatment for Apple's llvm-gcc, its NEON packet types are unions - #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}} - #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}} + #define EIGEN_INIT_NEON_PACKET2D(X, Y) {{X, Y}} #else //Default initializer for packets - #define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y} - #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W} + #define EIGEN_INIT_NEON_PACKET2D(X, Y) {X, Y} #endif - // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function // which available on LLVM and GCC (at least) #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -122,12 +119,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { - Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); + const float32_t f[] = {0, 1, 2, 3}; + Packet4f countdown = vld1q_f32(f); return vaddq_f32(pset1(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { - Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); + const int32_t i[] = {0, 1, 2, 3}; + Packet4i countdown = vld1q_s32(i); return vaddq_s32(pset1(a), countdown); } @@ -585,7 +584,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { r template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { - Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1); + Packet2d countdown = EIGEN_INIT_NEON_PACKET2D(0, 1); return vaddq_f64(pset1(a), countdown); } template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } From 9137f560f0c84470c7859a6db704bf5f18ae999d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 6 Jun 2016 07:26:48 -0700 Subject: [PATCH 19/31] Moved assertions to the constructor to make the code more portable --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 14 ++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 16 ---------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 4e873011e..a48cb1daa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -426,6 +426,20 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 9509f8002..5f2e329f2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -227,22 +227,6 @@ struct traits::type Scalar; - EIGEN_STATIC_ASSERT( - (internal::is_same::StorageKind, - typename traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT( - (internal::is_same::StorageKind, - typename traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT( - (internal::is_same::Index, - typename traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) - EIGEN_STATIC_ASSERT( - (internal::is_same::Index, - typename traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) typedef traits XprTraits; typedef typename traits::StorageKind StorageKind; typedef typename traits::Index Index; From df24f4a01d762b0ae7dee8a1a0c769c96e4da835 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 6 Jun 2016 16:46:46 +0200 Subject: [PATCH 20/31] bug #1201: improve code generation of affine*vec with MSVC --- Eigen/src/Geometry/Transform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 4fc876bcf..1a06c1e35 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -1367,7 +1367,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); Matrix rhs; - rhs << other,1; + rhs.template head() = other; rhs[Dim] = 1; Matrix res(T.matrix() * rhs); return res.template head(); } From 33f0340188eb309dfa3b16c0758a878c3ea114d9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 6 Jun 2016 12:06:42 -0700 Subject: [PATCH 21/31] Implement result_of for the new ternary functors --- Eigen/src/Core/util/Meta.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 7ecd59add..bd3a0aa5d 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -328,6 +328,30 @@ struct result_of { enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; typedef typename binary_result_of_select::type type; }; + +template +struct ternary_result_of_select {typedef typename internal::remove_all::type type;}; + +template +struct ternary_result_of_select +{typedef typename Func::result_type type;}; + +template +struct ternary_result_of_select +{typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename ternary_result_of_select::type type; +}; #endif /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer. From ea75dba2014ffa58acfcd160b5e59975c453f8da Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 6 Jun 2016 13:32:28 -0700 Subject: [PATCH 22/31] Added missing EIGEN_DEVICE_FUNC qualifiers to the unary array ops --- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 775fa6ee0..9e35dc571 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -248,6 +248,7 @@ tan() const * * \sa tan(), asin(), acos() */ +EIGEN_DEVICE_FUNC inline const AtanReturnType atan() const { @@ -289,6 +290,7 @@ asin() const * * \sa tan(), sinh(), cosh() */ +EIGEN_DEVICE_FUNC inline const TanhReturnType tanh() const { @@ -302,6 +304,7 @@ tanh() const * * \sa sin(), tanh(), cosh() */ +EIGEN_DEVICE_FUNC inline const SinhReturnType sinh() const { @@ -315,6 +318,7 @@ sinh() const * * \sa tan(), sinh(), cosh() */ +EIGEN_DEVICE_FUNC inline const CoshReturnType cosh() const { @@ -332,6 +336,7 @@ cosh() const * * \sa digamma() */ +EIGEN_DEVICE_FUNC inline const LgammaReturnType lgamma() const { @@ -346,6 +351,7 @@ lgamma() const * * \sa Eigen::digamma(), Eigen::polygamma(), lgamma() */ +EIGEN_DEVICE_FUNC inline const DigammaReturnType digamma() const { @@ -364,6 +370,7 @@ digamma() const * * \sa erfc() */ +EIGEN_DEVICE_FUNC inline const ErfReturnType erf() const { @@ -382,6 +389,7 @@ erf() const * * \sa erf() */ +EIGEN_DEVICE_FUNC inline const ErfcReturnType erfc() const { @@ -455,6 +463,7 @@ cube() const * * \sa ceil(), floor() */ +EIGEN_DEVICE_FUNC inline const RoundReturnType round() const { @@ -468,6 +477,7 @@ round() const * * \sa ceil(), round() */ +EIGEN_DEVICE_FUNC inline const FloorReturnType floor() const { @@ -481,6 +491,7 @@ floor() const * * \sa floor(), round() */ +EIGEN_DEVICE_FUNC inline const CeilReturnType ceil() const { @@ -494,6 +505,7 @@ ceil() const * * \sa isfinite(), isinf() */ +EIGEN_DEVICE_FUNC inline const IsNaNReturnType isNaN() const { @@ -507,6 +519,7 @@ isNaN() const * * \sa isnan(), isfinite() */ +EIGEN_DEVICE_FUNC inline const IsInfReturnType isInf() const { @@ -520,6 +533,7 @@ isInf() const * * \sa isnan(), isinf() */ +EIGEN_DEVICE_FUNC inline const IsFiniteReturnType isFinite() const { From 7ef9f47b5874c33d15649a3312d463ecbd290365 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 6 Jun 2016 14:09:46 -0700 Subject: [PATCH 23/31] Misc small improvements to the reduction code. --- .../CXX11/src/Tensor/TensorReductionCuda.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 0d1a098b7..e82530955 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -130,15 +130,17 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num if (block == 0) { // We're the first block to run, initialize the output value atomicExch(output, reducer.initialize()); - unsigned int old = atomicExch(semaphore, 2u); - assert(old == 1u); + __threadfence(); + atomicExch(semaphore, 2u); } else { + // Wait for the first block to initialize the output value. // Use atomicCAS here to ensure that the reads aren't cached - unsigned int val = atomicCAS(semaphore, 2u, 2u); - while (val < 2u) { + unsigned int val; + do { val = atomicCAS(semaphore, 2u, 2u); } + while (val < 2u); } } } @@ -166,12 +168,8 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num } if (gridDim.x > 1 && threadIdx.x == 0) { - unsigned int ticket = atomicInc(semaphore, UINT_MAX); - assert(ticket >= 2u); - if (ticket == gridDim.x + 1) { - // We're the last block, reset the semaphore - *semaphore = 0; - } + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); } } From 84b2060a9e3e57e49bad6208873e52c327ad135b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 6 Jun 2016 17:16:19 -0700 Subject: [PATCH 24/31] Fixed compilation error with gcc 4.4 --- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 031dbf6f2..61df8032d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device> { m_output(NULL) { // Accumulating a scalar isn't supported. - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(m_axis >= 0 && m_axis < NumDims); // Compute stride of scan axis @@ -136,7 +136,7 @@ struct TensorEvaluator, Device> { return true; } } - + template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { return internal::ploadt(m_output + index); From db0118342c2480ba0da7e8e4fbdd6aebaac687a4 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 7 Jun 2016 19:17:18 +0200 Subject: [PATCH 25/31] Fixed compilation of BVH_Example (required for make doc) --- unsupported/doc/examples/BVH_Example.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp index 6b6fac075..4315fb440 100644 --- a/unsupported/doc/examples/BVH_Example.cpp +++ b/unsupported/doc/examples/BVH_Example.cpp @@ -6,9 +6,7 @@ using namespace Eigen; typedef AlignedBox Box2d; namespace Eigen { - namespace internal { Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point - } } struct PointPointMinimizer //how to compute squared distances between points and rectangles From 86aedc9282f85966461544c40403979b861d3e19 Mon Sep 17 00:00:00 2001 From: Igor Babuschkin Date: Tue, 7 Jun 2016 20:06:38 +0100 Subject: [PATCH 26/31] Add small fixes to TensorScanOp --- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 61df8032d..66fcacd8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -81,7 +81,7 @@ struct TensorEvaluator, Device> { typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; @@ -152,6 +152,10 @@ struct TensorEvaluator, Device> { return m_output[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_output != NULL) { m_device.deallocate(m_output); From 002804938087acc829941c7500a5230fa5cf28b0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Jun 2016 23:08:11 +0200 Subject: [PATCH 27/31] bug #1240: Remove any assumption on NEON vector types. --- Eigen/src/Core/arch/NEON/Complex.h | 3 ++- Eigen/src/Core/arch/NEON/PacketMath.h | 11 ++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 234f29b80..ccc00e5a6 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -275,7 +275,8 @@ ptranspose(PacketBlock& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG -static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000); +const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; +static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA ); struct Packet1cd { diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index fa16bc9c8..e1247696d 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -49,14 +49,6 @@ typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG - //Special treatment for Apple's llvm-gcc, its NEON packet types are unions - #define EIGEN_INIT_NEON_PACKET2D(X, Y) {{X, Y}} -#else - //Default initializer for packets - #define EIGEN_INIT_NEON_PACKET2D(X, Y) {X, Y} -#endif - // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function // which available on LLVM and GCC (at least) #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -584,7 +576,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { r template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { - Packet2d countdown = EIGEN_INIT_NEON_PACKET2D(0, 1); + const double countdown_raw[] = {0.0,1.0}; + const Packet2d countdown = vld1q_f64(countdown_raw); return vaddq_f64(pset1(a), countdown); } template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } From a05607875a57129821f81e6aae1727357f4db5e6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Jun 2016 11:53:56 -0700 Subject: [PATCH 28/31] Don't refer to the half2 type unless it's been defined --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 0f3778e6e..fdb5ee6b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) template <> struct PacketType { typedef half2 type; From 83904a21c11ffdb88f3ad8a65ded7bf46c1a068a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 11 Jun 2016 14:41:36 +0200 Subject: [PATCH 29/31] Make sure T(i+1,i)==0 when diagonalizing T(i:i+1,i:i+1) --- Eigen/src/Eigenvalues/RealQZ.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index c4715b954..b3a910dd9 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -630,11 +630,11 @@ namespace Eigen { internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right); // Apply resulting Jacobi rotations - m_T.applyOnTheLeft(i,i+1,j_left); - m_T.applyOnTheRight(i,i+1,j_right); m_S.applyOnTheLeft(i,i+1,j_left); m_S.applyOnTheRight(i,i+1,j_right); - m_T(i,i+1) = Scalar(0); + m_T.applyOnTheLeft(i,i+1,j_left); + m_T.applyOnTheRight(i,i+1,j_right); + m_T(i+1,i) = m_T(i,i+1) = Scalar(0); if(m_computeQZ) { m_Q.applyOnTheRight(i,i+1,j_left.transpose()); From a3a4714abac02ba48a683c3c3967cebee2833188 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 11 Jun 2016 14:41:53 +0200 Subject: [PATCH 30/31] Add debug output. --- test/real_qz.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/real_qz.cpp b/test/real_qz.cpp index a1766c6d9..45ae8d763 100644 --- a/test/real_qz.cpp +++ b/test/real_qz.cpp @@ -49,11 +49,20 @@ template void real_qz(const MatrixType& m) for (Index i=0; i