From d6d39c7ddb127d91ebfa4ea62e93ea51036f1760 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 7 Jun 2016 14:35:08 -0700
Subject: [PATCH 01/31] Added missing EIGEN_DEVICE_FUNC

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 52cfc2824..d34f1e328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
 
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
  protected:
   TensorEvaluator<ArgType, Device> m_impl;

From 8fd57a97f203edac3f7e8681eafe752294386a24 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 7 Jun 2016 18:22:18 -0700
Subject: [PATCH 02/31] Enable the vectorization of adds and mults of fp16

---
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 51386506f..959dff886 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -28,6 +28,8 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
     AlignedOnScalar = 1,
     size=2,
     HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasMul    = 1,
     HasDiv    = 1,
     HasSqrt   = 1,
     HasRsqrt  = 1,

From 9dd9d58273362d643eaa0b8f4f16f8aa3d5ef6cd Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg <chtz@informatik.uni-bremen.de>
Date: Wed, 8 Jun 2016 15:36:42 +0200
Subject: [PATCH 03/31] Copied a regression test from 3.2 branch.

---
 test/geo_homogeneous.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp
index bf63c69ec..305794cdf 100644
--- a/test/geo_homogeneous.cpp
+++ b/test/geo_homogeneous.cpp
@@ -58,6 +58,8 @@ template<typename Scalar,int Size> void homogeneous(void)
   T2MatrixType t2 = T2MatrixType::Random();
   VERIFY_IS_APPROX(t2 * (v0.homogeneous().eval()), t2 * v0.homogeneous());
   VERIFY_IS_APPROX(t2 * (m0.colwise().homogeneous().eval()), t2 * m0.colwise().homogeneous());
+  VERIFY_IS_APPROX(t2 * (v0.homogeneous().asDiagonal()), t2 * hv0.asDiagonal());
+  VERIFY_IS_APPROX((v0.homogeneous().asDiagonal()) * t2, hv0.asDiagonal() * t2);
 
   VERIFY_IS_APPROX((v0.transpose().rowwise().homogeneous().eval()) * t2,
                     v0.transpose().rowwise().homogeneous() * t2);

From 9fc8379328dad5fb74249003f56fb608c304ae4d Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 8 Jun 2016 16:39:11 +0200
Subject: [PATCH 04/31] Fix extraction of complex eigenvalue pairs in real
 generalized eigenvalue problems.

---
 .../src/Eigenvalues/GeneralizedEigenSolver.h  | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index a9d6790d5..07a9ccf46 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -327,13 +327,33 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
       }
       else
       {
-        Scalar p = Scalar(0.5) * (m_matS.coeff(i, i) - m_matS.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matS.coeff(i+1, i) * m_matS.coeff(i, i+1)));
-        m_alphas.coeffRef(i)   = ComplexScalar(m_matS.coeff(i+1, i+1) + p, z);
-        m_alphas.coeffRef(i+1) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, -z);
+        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a triangular 2x2 block T
+        // From the eigen decomposition of T = U * E * U^-1,
+        // we can extract the eigenvalues of (U^-1 * S * U) / E
+        // Here, we can take advantage that E = diag(T), and U = [ 1 T_01 ; 0 T_11-T_00], and U^-1 = [1 -T_11/(T_11-T_00) ; 0 1/(T_11-T_00)].
+        // Then taking beta=T_00*T_11*(T_11-T_00), we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00) * (T_11-T_00):
+
+        // T =  [a b ; 0 c]
+        // S =  [e f ; g h]
+        RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i, i+1), c = m_realQZ.matrixT().coeff(i+1, i+1);
+        RealScalar e = m_matS.coeff(i, i), f = m_matS.coeff(i, i+1), g = m_matS.coeff(i+1, i), h = m_matS.coeff(i+1, i+1);
+        RealScalar d = c-a;
+        RealScalar gb = g*b;
+        Matrix<RealScalar,2,2> A;
+        A <<  (e*d-gb)*c,  ((e*b+f*d-h*b)*d-gb*b)*a,
+              g*c       ,  (gb+h*d)*a;
+
+        // NOTE, we could also compute the SVD of T's block during the QZ factorization so that the respective T block is guaranteed to be diagonal,
+        // and then we could directly apply the formula below (while taking care of scaling S columns by T11,T00):
+
+        Scalar p = Scalar(0.5) * (A.coeff(i, i) - A.coeff(i+1, i+1));
+        Scalar z = sqrt(abs(p * p + A.coeff(i+1, i) * A.coeff(i, i+1)));
+        m_alphas.coeffRef(i)   = ComplexScalar(A.coeff(i+1, i+1) + p, z);
+        m_alphas.coeffRef(i+1) = ComplexScalar(A.coeff(i+1, i+1) + p, -z);
+
+        m_betas.coeffRef(i)   =
+        m_betas.coeffRef(i+1) = a*c*d;
 
-        m_betas.coeffRef(i)   = m_realQZ.matrixT().coeff(i,i);
-        m_betas.coeffRef(i+1) = m_realQZ.matrixT().coeff(i,i);
         i += 2;
       }
     }

From df095cab104550a8179c28e93d477406dfab6849 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 8 Jun 2016 18:31:19 +0200
Subject: [PATCH 05/31] Fixes for PARDISO: warnings, and defaults to metis+
 in-core mode.

---
 Eigen/src/PardisoSupport/PardisoSupport.h | 35 ++++++++++++++---------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 80d914f25..091c3970e 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -183,7 +183,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     {
       if(m_isInitialized) // Factorization ran at least once
       {
-        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, m_size,0, 0, 0, m_perm.data(), 0,
+        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, internal::convert_index<StorageIndex>(m_size),0, 0, 0, m_perm.data(), 0,
                                                           m_iparm.data(), m_msglvl, NULL, NULL);
         m_isInitialized = false;
       }
@@ -194,11 +194,11 @@ class PardisoImpl : public SparseSolverBase<Derived>
       m_type = type;
       bool symmetric = std::abs(m_type) < 10;
       m_iparm[0] = 1;   // No solver default
-      m_iparm[1] = 3;   // use Metis for the ordering
-      m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
+      m_iparm[1] = 2;   // use Metis for the ordering
+      m_iparm[2] = 0;   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
       m_iparm[3] = 0;   // No iterative-direct algorithm
       m_iparm[4] = 0;   // No user fill-in reducing permutation
-      m_iparm[5] = 0;   // Write solution into x
+      m_iparm[5] = 0;   // Write solution into x, b is left unchanged
       m_iparm[6] = 0;   // Not in use
       m_iparm[7] = 2;   // Max numbers of iterative refinement steps
       m_iparm[8] = 0;   // Not in use
@@ -219,7 +219,8 @@ class PardisoImpl : public SparseSolverBase<Derived>
       m_iparm[26] = 0;  // No matrix checker
       m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
       m_iparm[34] = 1;  // C indexing
-      m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
+      m_iparm[36] = 0;  // CSR
+      m_iparm[59] = 0;  // 0 - In-Core ; 1 - Automatic switch between In-Core and Out-of-Core modes ; 2 - Out-of-Core
       
       memset(m_pt, 0, sizeof(m_pt));
     }
@@ -246,7 +247,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     mutable SparseMatrixType m_matrix;
     mutable ComputationInfo m_info;
     bool m_analysisIsOk, m_factorizationIsOk;
-    Index m_type, m_msglvl;
+    StorageIndex m_type, m_msglvl;
     mutable void *m_pt[64];
     mutable ParameterType m_iparm;
     mutable IntColVectorType m_perm;
@@ -265,10 +266,9 @@ Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
-
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = true;
@@ -287,7 +287,7 @@ Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
@@ -306,8 +306,8 @@ Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
   
   derived().getMatrix(a);
 
-  Index error;  
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, m_size,
+  Index error;
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
@@ -354,9 +354,9 @@ void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase
   }
   
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                            m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
+                                                            m_perm.data(), internal::convert_index<StorageIndex>(nrhs), m_iparm.data(), m_msglvl,
                                                             rhs_ptr, x.derived().data());
 
   manageErrorCode(error);
@@ -371,6 +371,9 @@ void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase
   * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
   * \implsparsesolverconcept
@@ -421,6 +424,9 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
   * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
@@ -480,6 +486,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
   * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used.
   *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.

From 0beabb4776b887c25977186b5af6811eb49aaa23 Mon Sep 17 00:00:00 2001
From: Abhijit Kundu <abhijit.dgp@gmail.com>
Date: Wed, 8 Jun 2016 16:12:04 -0400
Subject: [PATCH 06/31] Fixed type conversion from int

---
 Eigen/src/Geometry/Transform.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 1a06c1e35..073f4dcd1 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -1367,7 +1367,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
     Matrix<typename ResultType::Scalar, Dim+1, 1> rhs;
-    rhs.template head<Dim>() = other; rhs[Dim] = 1;
+    rhs.template head<Dim>() = other; rhs[Dim] = typename ResultType::Scalar(1);
     Matrix<typename ResultType::Scalar, WorkingRows, 1> res(T.matrix() * rhs);
     return res.template head<Dim>();
   }

From a20d2ec1c02d2629cbfa8871898b64cd22445595 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 16:16:22 +0200
Subject: [PATCH 07/31] Fix shadow variable, and indexing.

---
 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index 07a9ccf46..08caed281 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -339,17 +339,17 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
         RealScalar e = m_matS.coeff(i, i), f = m_matS.coeff(i, i+1), g = m_matS.coeff(i+1, i), h = m_matS.coeff(i+1, i+1);
         RealScalar d = c-a;
         RealScalar gb = g*b;
-        Matrix<RealScalar,2,2> A;
-        A <<  (e*d-gb)*c,  ((e*b+f*d-h*b)*d-gb*b)*a,
-              g*c       ,  (gb+h*d)*a;
+        Matrix<RealScalar,2,2> S2;
+        S2 <<  (e*d-gb)*c,  ((e*b+f*d-h*b)*d-gb*b)*a,
+               g*c       ,  (gb+h*d)*a;
 
         // NOTE, we could also compute the SVD of T's block during the QZ factorization so that the respective T block is guaranteed to be diagonal,
         // and then we could directly apply the formula below (while taking care of scaling S columns by T11,T00):
 
-        Scalar p = Scalar(0.5) * (A.coeff(i, i) - A.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + A.coeff(i+1, i) * A.coeff(i, i+1)));
-        m_alphas.coeffRef(i)   = ComplexScalar(A.coeff(i+1, i+1) + p, z);
-        m_alphas.coeffRef(i+1) = ComplexScalar(A.coeff(i+1, i+1) + p, -z);
+        Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1));
+        Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1)));
+        m_alphas.coeffRef(i)   = ComplexScalar(S2.coeff(1,1) + p,  z);
+        m_alphas.coeffRef(i+1) = ComplexScalar(S2.coeff(1,1) + p, -z);
 
         m_betas.coeffRef(i)   =
         m_betas.coeffRef(i+1) = a*c*d;

From 15890c304edbccedc8a989468ed3fc475f428059 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 16:17:27 +0200
Subject: [PATCH 08/31] Add unit test for non symmetric generalized eigenvalues

---
 test/eigensolver_generalized_real.cpp | 32 ++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp
index a46a2e50e..da14482de 100644
--- a/test/eigensolver_generalized_real.cpp
+++ b/test/eigensolver_generalized_real.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,6 +10,7 @@
 #include "main.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
+#include <Eigen/LU>
 
 template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
 {
@@ -21,6 +22,7 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   Index cols = m.cols();
 
   typedef typename MatrixType::Scalar Scalar;
+  typedef std::complex<Scalar> ComplexScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
 
   MatrixType a = MatrixType::Random(rows,cols);
@@ -31,14 +33,28 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   MatrixType spdB =  b.adjoint() * b + b1.adjoint() * b1;
 
   // lets compare to GeneralizedSelfAdjointEigenSolver
-  GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
-  GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
+  {
+    GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
+    GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
 
-  VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
+    VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
 
-  VectorType realEigenvalues = eig.eigenvalues().real();
-  std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
-  VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+    VectorType realEigenvalues = eig.eigenvalues().real();
+    std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
+    VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+  }
+
+  // non symmetric case:
+  {
+    GeneralizedEigenSolver<MatrixType> eig(a,b);
+    for(Index k=0; k<cols; ++k)
+    {
+      Matrix<ComplexScalar,Dynamic,Dynamic> tmp = (eig.betas()(k)*a).template cast<ComplexScalar>() - eig.alphas()(k)*b;
+      if(tmp.norm()>(std::numeric_limits<Scalar>::min)())
+        tmp /= tmp.norm();
+      VERIFY_IS_MUCH_SMALLER_THAN( std::abs(tmp.determinant()), Scalar(1) );
+    }
+  }
 
   // regression test for bug 1098
   {
@@ -57,7 +73,7 @@ void test_eigensolver_generalized_real()
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(s,s)) );
 
-    // some trivial but implementation-wise tricky cases
+    // some trivial but implementation-wise special cases
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(1,1)) );
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) );
     CALL_SUBTEST_3( generalized_eigensolver_real(Matrix<double,1,1>()) );

From c1f9ca925405c8fad126f327b4bdca7f983fc96e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 17:11:03 +0200
Subject: [PATCH 09/31] Update RealQZ to reduce 2x2 diagonal block of T
 corresponding to non reduced diagonal block of S to positive diagonal form.
 This step involve a real 2x2 SVD problem. The respective routine is thus in
 src/misc/ to be shared by both EVD and AVD modules.

---
 Eigen/Eigenvalues              |  1 +
 Eigen/SVD                      |  1 +
 Eigen/src/Eigenvalues/RealQZ.h | 32 +++++++++++++++++++-
 Eigen/src/SVD/JacobiSVD.h      | 32 --------------------
 Eigen/src/misc/RealSvd2x2.h    | 54 ++++++++++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 33 deletions(-)
 create mode 100644 Eigen/src/misc/RealSvd2x2.h

diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index ea93eb303..216a6d51d 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -32,6 +32,7 @@
   * \endcode
   */
 
+#include "src/misc/RealSvd2x2.h"
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
diff --git a/Eigen/SVD b/Eigen/SVD
index b353f3f54..565d9c90d 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -31,6 +31,7 @@
   * \endcode
   */
 
+#include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index a62071d42..c4715b954 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -552,7 +552,6 @@ namespace Eigen {
       m_T.coeffRef(l,l-1) = Scalar(0.0);
     }
 
-
   template<typename MatrixType>
     RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ)
     {
@@ -616,6 +615,37 @@ namespace Eigen {
       }
       // check if we converged before reaching iterations limit
       m_info = (local_iter<m_maxIters) ? Success : NoConvergence;
+
+      // For each non triangular 2x2 diagonal block of S,
+      //    reduce the respective 2x2 diagonal block of T to positive diagonal form using 2x2 SVD.
+      // This step is not mandatory for QZ, but it does help further extraction of eigenvalues/eigenvectors,
+      // and is in par with Lapack/Matlab QZ.
+      if(m_info==Success)
+      {
+        for(Index i=0; i<dim-1; ++i)
+        {
+          if(m_S.coeff(i+1, i) != Scalar(0))
+          {
+            JacobiRotation<Scalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right);
+
+            // Apply resulting Jacobi rotations
+            m_T.applyOnTheLeft(i,i+1,j_left);
+            m_T.applyOnTheRight(i,i+1,j_right);
+            m_S.applyOnTheLeft(i,i+1,j_left);
+            m_S.applyOnTheRight(i,i+1,j_right);
+            m_T(i,i+1) = Scalar(0);
+
+            if(m_computeQZ) {
+              m_Q.applyOnTheRight(i,i+1,j_left.transpose());
+              m_Z.applyOnTheLeft(i,i+1,j_right.transpose());
+            }
+
+            i++;
+          }
+        }
+      }
+
       return *this;
     } // end compute
 
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1940c8294..b83fd7a4d 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -419,38 +419,6 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
   }
 };
 
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                         JacobiRotation<RealScalar> *j_left,
-                         JacobiRotation<RealScalar> *j_right)
-{
-  using std::sqrt;
-  using std::abs;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(d == RealScalar(0))
-  {
-    rot1.s() = RealScalar(0);
-    rot1.c() = RealScalar(1);
-  }
-  else
-  {
-    // If d!=0, then t/d cannot overflow because the magnitude of the
-    // entries forming d are not too small compared to the ones forming t.
-    RealScalar u = t / d;
-    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = RealScalar(1) / tmp;
-    rot1.c() = u / tmp;
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left = rot1 * j_right->transpose();
-}
-
 template<typename _MatrixType, int QRPreconditioner> 
 struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
 {
diff --git a/Eigen/src/misc/RealSvd2x2.h b/Eigen/src/misc/RealSvd2x2.h
new file mode 100644
index 000000000..cdd7777d2
--- /dev/null
+++ b/Eigen/src/misc/RealSvd2x2.h
@@ -0,0 +1,54 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALSVD2X2_H
+#define EIGEN_REALSVD2X2_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatrixType, typename RealScalar, typename Index>
+void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
+                         JacobiRotation<RealScalar> *j_left,
+                         JacobiRotation<RealScalar> *j_right)
+{
+  using std::sqrt;
+  using std::abs;
+  Matrix<RealScalar,2,2> m;
+  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
+       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
+  JacobiRotation<RealScalar> rot1;
+  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
+  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
+  if(d == RealScalar(0))
+  {
+    rot1.s() = RealScalar(0);
+    rot1.c() = RealScalar(1);
+  }
+  else
+  {
+    // If d!=0, then t/d cannot overflow because the magnitude of the
+    // entries forming d are not too small compared to the ones forming t.
+    RealScalar u = t / d;
+    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
+    rot1.s() = RealScalar(1) / tmp;
+    rot1.c() = u / tmp;
+  }
+  m.applyOnTheLeft(0,1,rot1);
+  j_right->makeJacobi(m,0,1);
+  *j_left = rot1 * j_right->transpose();
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_REALSVD2X2_H
\ No newline at end of file

From 2bd59b0e0d667dcdcb6b070596a1bf023e3e88f1 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 17:12:03 +0200
Subject: [PATCH 10/31] Take advantage that T is already diagonal in the
 extraction of generalized complex eigenvalues.

---
 .../src/Eigenvalues/GeneralizedEigenSolver.h  | 25 ++++++-------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index 08caed281..9f43fd544 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -327,24 +327,13 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
       }
       else
       {
-        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a triangular 2x2 block T
-        // From the eigen decomposition of T = U * E * U^-1,
-        // we can extract the eigenvalues of (U^-1 * S * U) / E
-        // Here, we can take advantage that E = diag(T), and U = [ 1 T_01 ; 0 T_11-T_00], and U^-1 = [1 -T_11/(T_11-T_00) ; 0 1/(T_11-T_00)].
-        // Then taking beta=T_00*T_11*(T_11-T_00), we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00) * (T_11-T_00):
+        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a positive diagonal 2x2 block T
+        // Then taking beta=T_00*T_11, we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00):
 
-        // T =  [a b ; 0 c]
-        // S =  [e f ; g h]
-        RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i, i+1), c = m_realQZ.matrixT().coeff(i+1, i+1);
-        RealScalar e = m_matS.coeff(i, i), f = m_matS.coeff(i, i+1), g = m_matS.coeff(i+1, i), h = m_matS.coeff(i+1, i+1);
-        RealScalar d = c-a;
-        RealScalar gb = g*b;
-        Matrix<RealScalar,2,2> S2;
-        S2 <<  (e*d-gb)*c,  ((e*b+f*d-h*b)*d-gb*b)*a,
-               g*c       ,  (gb+h*d)*a;
-
-        // NOTE, we could also compute the SVD of T's block during the QZ factorization so that the respective T block is guaranteed to be diagonal,
-        // and then we could directly apply the formula below (while taking care of scaling S columns by T11,T00):
+        // T =  [a 0]
+        //      [0 b]
+        RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i+1, i+1);
+        Matrix<RealScalar,2,2> S2 = m_matS.template block<2,2>(i,i) * Matrix<Scalar,2,1>(b,a).asDiagonal();
 
         Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1));
         Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1)));
@@ -352,7 +341,7 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
         m_alphas.coeffRef(i+1) = ComplexScalar(S2.coeff(1,1) + p, -z);
 
         m_betas.coeffRef(i)   =
-        m_betas.coeffRef(i+1) = a*c*d;
+        m_betas.coeffRef(i+1) = a*b;
 
         i += 2;
       }

From e2b383632699684d06ae180b3ad85cfb0189973a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 17:13:33 +0200
Subject: [PATCH 11/31] Include recent changesets that played with product's
 kernel

---
 bench/perf_monitoring/gemm/changesets.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
index fb3e48e99..d00b4603a 100644
--- a/bench/perf_monitoring/gemm/changesets.txt
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -44,4 +44,8 @@ before-evaluators
 7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
 7591:09a8e2186610   # 3.3-alpha1
 7650:b0f3c8f43025   # help clang inlining
+8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
+8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
+8985:d935df21a082   # Remove the rotating kernel.
 

From aa33446dace833fbf06632e586c80119b3d8ac11 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 9 Jun 2016 08:22:27 -0700
Subject: [PATCH 12/31] Improved support for vectorization of 16-bit floats

---
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   | 86 +++++++++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorMeta.h       |  5 ++
 .../CXX11/src/Tensor/TensorReductionCuda.h    |  8 +-
 3 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3dd32e9d1..bf52e490f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
 };
 
 
+template<typename Reducer, typename Device>
+struct reducer_traits {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasAdd
+  };
+};
+
+
 template <typename T> struct MeanReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
     DenseIndex packetCount_;
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasAdd
+  };
+};
+
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasMax
+  };
+};
+
+
 template <typename T> struct MinReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasMin
+  };
+};
+
 
 template <typename T> struct ProdReducer
 {
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::MulCost,
+    PacketAccess = PacketType<T, Device>::type::HasMul
+  };
+};
+
 
 struct AndReducer
 {
@@ -280,6 +331,15 @@ struct AndReducer
   }
 };
 
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 struct OrReducer {
   static const bool PacketAccess = false;
   static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
   }
 };
 
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
 template <typename T> struct ArgMinTupleReducer
 {
   static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
 
 // Random number generation
 namespace {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b1645d56f..82a905a65 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -54,6 +54,11 @@ struct PacketType {
 
 // For CUDA packet types when using a GpuDevice
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <>
+  struct PacketType<half, GpuDevice> {
+  typedef half2 type;
+  static const int size = 2;
+ };
 template <>
 struct PacketType<float, GpuDevice> {
   typedef float4 type;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index e82530955..1b4fdd03f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -331,7 +331,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -346,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
       return;
     }
 
-    FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
   }
 };
 
@@ -608,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -627,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
       return true;
     }
 
-    return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
   }
 };
 

From 8f92c26319a6e06cce6d0ba2a252521c8096a2c0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 9 Jun 2016 08:23:42 -0700
Subject: [PATCH 13/31] Improved code formatting

---
 unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 61df8032d..0d084141d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-      return m_dimensions;
+    return m_dimensions;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {

From 14a112ee153f7d8554c3a8848e8a0461c7b82f13 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 9 Jun 2016 08:25:22 -0700
Subject: [PATCH 14/31] Use signed integers more consistently to encode the
 number of threads to use to evaluate a tensor expression.

---
 .../CXX11/src/Tensor/TensorContractionThreadPool.h   | 12 ++++++------
 .../Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h  | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index a60a17049..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // across k dimension.
     const TensorOpCost cost =
         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
-    Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   class Context {
    public:
     Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
             Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
             Index gn, Index nm0, Index nn0, bool shard_by_col,
             bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           lhs_(lhs),
           rhs_(rhs),
           buffer_(buffer),
-          output_(buffer, m),
+          output_(buffer, tm),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
-          m_(m),
-          n_(n),
-          k_(k),
+          m_(tm),
+          n_(tn),
+          k_(tk),
           bm_(bm),
           bn_(bn),
           bk_(bk),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index d31b0ad38..c770d024f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
 
@@ -182,7 +182,7 @@ struct ThreadPoolDevice {
                    std::function<void(Index, Index)> f) const {
     typedef TensorCostModel<ThreadPoolDevice> CostModel;
     if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, numThreads()) == 1) {
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
       return;
     }
@@ -242,7 +242,7 @@ struct ThreadPoolDevice {
     // Recursively divide size into halves until we reach block_size.
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
-    Barrier barrier(block_count);
+    Barrier barrier(static_cast<unsigned int>(block_count));
     std::function<void(Index, Index)> handleRange;
     handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
       if (last - first <= block_size) {
@@ -268,7 +268,7 @@ struct ThreadPoolDevice {
 
  private:
   ThreadPoolInterface* pool_;
-  size_t num_threads_;
+  int num_threads_;
 };
 
 

From 66796e843df723eeac04d6dc725f6a8b27a574ba Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 9 Jun 2016 08:50:01 -0700
Subject: [PATCH 15/31] Fixed definition of some of the reducer_traits

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index bf52e490f..e6ff70460 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -131,7 +131,7 @@ template <typename T, typename Device>
 struct reducer_traits<SumReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::type::HasAdd
+    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasAdd
   };
 };
 
@@ -183,7 +183,7 @@ template <typename T, typename Device>
 struct reducer_traits<MeanReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::type::HasAdd
+    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasAdd
   };
 };
 
@@ -225,7 +225,7 @@ template <typename T, typename Device>
 struct reducer_traits<MaxReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::type::HasMax
+    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasMax
   };
 };
 
@@ -267,7 +267,7 @@ template <typename T, typename Device>
 struct reducer_traits<MinReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::type::HasMin
+    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasMin
   };
 };
 
@@ -310,7 +310,7 @@ template <typename T, typename Device>
 struct reducer_traits<ProdReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::MulCost,
-    PacketAccess = PacketType<T, Device>::type::HasMul
+    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasMul
   };
 };
 

From 37638dafd71e39407d22d4269b32d1c73b84feb8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 9 Jun 2016 10:29:52 -0700
Subject: [PATCH 16/31] Simplified the code that dispatches vectorized
 reductions on GPU

---
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   | 10 ++---
 .../Eigen/CXX11/src/Tensor/TensorMeta.h       | 38 ++++++++++++-------
 .../CXX11/src/Tensor/TensorReductionCuda.h    |  2 +-
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index e6ff70460..a8e48fced 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -131,7 +131,7 @@ template <typename T, typename Device>
 struct reducer_traits<SumReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd
   };
 };
 
@@ -183,7 +183,7 @@ template <typename T, typename Device>
 struct reducer_traits<MeanReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd
   };
 };
 
@@ -225,7 +225,7 @@ template <typename T, typename Device>
 struct reducer_traits<MaxReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasMax
+    PacketAccess = PacketType<T, Device>::HasMax
   };
 };
 
@@ -267,7 +267,7 @@ template <typename T, typename Device>
 struct reducer_traits<MinReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasMin
+    PacketAccess = PacketType<T, Device>::HasMin
   };
 };
 
@@ -310,7 +310,7 @@ template <typename T, typename Device>
 struct reducer_traits<ProdReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::MulCost,
-    PacketAccess = packet_traits<typename PacketType<T, Device>::type>::HasMul
+    PacketAccess = PacketType<T, Device>::HasMul
   };
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 82a905a65..0f3778e6e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -47,27 +47,39 @@ template <> struct max_n_1<0> {
 
 // Default packet types
 template <typename Scalar, typename Device>
-struct PacketType {
+struct PacketType : internal::packet_traits<Scalar> {
   typedef typename internal::packet_traits<Scalar>::type type;
-  enum { size = internal::unpacket_traits<type>::size };
 };
 
 // For CUDA packet types when using a GpuDevice
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <>
-  struct PacketType<half, GpuDevice> {
+struct PacketType<half, GpuDevice> {
   typedef half2 type;
   static const int size = 2;
- };
-template <>
-struct PacketType<float, GpuDevice> {
-  typedef float4 type;
-  static const int size = 4;
-};
-template <>
-struct PacketType<double, GpuDevice> {
-  typedef double2 type;
-  static const int size = 2;
+  enum {
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasArg    = 0,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasBlend  = 0,
+
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasLog    = 1,
+    HasLog1p  = 0,
+    HasLog10  = 0,
+    HasPow    = 1,
+  };
 };
 #endif
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 1b4fdd03f..d9bbcd858 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -328,7 +328,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats and half floats.
-  #ifdef EIGEN_HAS_CUDA_FP16
+#ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));

From 1fc2746417c8b4bf1645c703b1f99b1871c8d16e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 22:52:37 +0200
Subject: [PATCH 17/31] Make Arrays's ctor/assignment noexcept

---
 Eigen/src/Core/Array.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index c0af4aa9d..7c2e0de16 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -149,7 +149,7 @@ class Array
 
 #if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    Array(Array&& other)
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
       : Base(std::move(other))
     {
       Base::_check_template_params();
@@ -157,7 +157,7 @@ class Array
         Base::_set_noalias(other);
     }
     EIGEN_DEVICE_FUNC
-    Array& operator=(Array&& other)
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
       other.swap(*this);
       return *this;

From bd212438217dc3e169a35052f78e2e41a7ce3a3d Mon Sep 17 00:00:00 2001
From: Sean Templeton <seantempleton@outlook.com>
Date: Fri, 3 Jun 2016 10:51:35 -0500
Subject: [PATCH 18/31] Fix compile errors initializing packets on ARM DS-5
 5.20

The ARM DS-5 5.20 compiler fails compiling with the following errors:

"src/Core/arch/NEON/PacketMath.h", line 113: Error:  #146: too many initializer values
    Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
                         ^
"src/Core/arch/NEON/PacketMath.h", line 118: Error:  #146: too many initializer values
    Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
                         ^
"src/Core/arch/NEON/Complex.h", line 30: Error:  #146: too many initializer values
  static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
                                    ^
"src/Core/arch/NEON/Complex.h", line 31: Error:  #146: too many initializer values
  static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
                                    ^

The vectors are implemented as two doubles, hence the too many initializer values error.
Changed the code to use intrinsic load functions which all compilers
implementing NEON should have.
---
 Eigen/src/Core/arch/NEON/Complex.h    |  5 +++--
 Eigen/src/Core/arch/NEON/PacketMath.h | 15 +++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index d2d467936..234f29b80 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -14,8 +14,9 @@ namespace Eigen {
 
 namespace internal {
 
-static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
-static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
+const uint32_t  conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );
+static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );
 
 //---------- float ----------
 struct Packet2cf
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index deb2d7e42..fa16bc9c8 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -51,15 +51,12 @@ typedef uint32x4_t  Packet4ui;
 
 #if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
   //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
+  #define EIGEN_INIT_NEON_PACKET2D(X, Y)       {{X, Y}}
 #else
   //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
+  #define EIGEN_INIT_NEON_PACKET2D(X, Y)       {X, Y}
 #endif
 
-
 // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
 // which available on LLVM and GCC (at least)
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -122,12 +119,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
 
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const float32_t f[] = {0, 1, 2, 3};
+  Packet4f countdown = vld1q_f32(f);
   return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
 {
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const int32_t i[] = {0, 1, 2, 3};
+  Packet4i countdown = vld1q_s32(i);
   return vaddq_s32(pset1<Packet4i>(a), countdown);
 }
 
@@ -585,7 +584,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { r
 
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
-  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
+  Packet2d countdown = EIGEN_INIT_NEON_PACKET2D(0, 1);
   return vaddq_f64(pset1<Packet2d>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }

From 9137f560f0c84470c7859a6db704bf5f18ae999d Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 6 Jun 2016 07:26:48 -0700
Subject: [PATCH 19/31] Moved assertions to the constructor to make the code
 more portable

---
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h     | 14 ++++++++++++++
 unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h  | 16 ----------------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 4e873011e..a48cb1daa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -426,6 +426,20 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
       m_arg3Impl(op.arg3Expression(), device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg2Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg3Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+
     eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 9509f8002..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -227,22 +227,6 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
       TernaryOp(typename Arg1XprType::Scalar,
                 typename Arg2XprType::Scalar,
                 typename Arg3XprType::Scalar)>::type Scalar;
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::StorageKind,
-                         typename traits<Arg2XprType>::StorageKind>::value),
-      STORAGE_KIND_MUST_MATCH)
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::StorageKind,
-                         typename traits<Arg3XprType>::StorageKind>::value),
-      STORAGE_KIND_MUST_MATCH)
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::Index,
-                         typename traits<Arg2XprType>::Index>::value),
-      STORAGE_INDEX_MUST_MATCH)
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::Index,
-                         typename traits<Arg3XprType>::Index>::value),
-      STORAGE_INDEX_MUST_MATCH)
   typedef traits<Arg1XprType> XprTraits;
   typedef typename traits<Arg1XprType>::StorageKind StorageKind;
   typedef typename traits<Arg1XprType>::Index Index;

From df24f4a01d762b0ae7dee8a1a0c769c96e4da835 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 6 Jun 2016 16:46:46 +0200
Subject: [PATCH 20/31] bug #1201: improve code generation of affine*vec with
 MSVC

---
 Eigen/src/Geometry/Transform.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 4fc876bcf..1a06c1e35 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -1367,7 +1367,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
     Matrix<typename ResultType::Scalar, Dim+1, 1> rhs;
-    rhs << other,1;
+    rhs.template head<Dim>() = other; rhs[Dim] = 1;
     Matrix<typename ResultType::Scalar, WorkingRows, 1> res(T.matrix() * rhs);
     return res.template head<Dim>();
   }

From 33f0340188eb309dfa3b16c0758a878c3ea114d9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 6 Jun 2016 12:06:42 -0700
Subject: [PATCH 21/31] Implement result_of for the new ternary functors

---
 Eigen/src/Core/util/Meta.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 7ecd59add..bd3a0aa5d 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -328,6 +328,30 @@ struct result_of<Func(ArgType0,ArgType1)> {
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
     typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
 };
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2, int SizeOf=sizeof(has_none)>
+struct ternary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_std_result_type)>
+{typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_tr1_result)>
+{typedef typename Func::template result<Func(ArgType0,ArgType1,ArgType2)>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1,ArgType2)>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
+};
 #endif
 
 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.

From ea75dba2014ffa58acfcd160b5e59975c453f8da Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 6 Jun 2016 13:32:28 -0700
Subject: [PATCH 22/31] Added missing EIGEN_DEVICE_FUNC qualifiers to the unary
 array ops

---
 Eigen/src/plugins/ArrayCwiseUnaryOps.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 775fa6ee0..9e35dc571 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -248,6 +248,7 @@ tan() const
   *
   * \sa tan(), asin(), acos()
   */
+EIGEN_DEVICE_FUNC
 inline const AtanReturnType
 atan() const
 {
@@ -289,6 +290,7 @@ asin() const
   *
   * \sa tan(), sinh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const TanhReturnType
 tanh() const
 {
@@ -302,6 +304,7 @@ tanh() const
   *
   * \sa sin(), tanh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const SinhReturnType
 sinh() const
 {
@@ -315,6 +318,7 @@ sinh() const
   *
   * \sa tan(), sinh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const CoshReturnType
 cosh() const
 {
@@ -332,6 +336,7 @@ cosh() const
   *
   * \sa digamma()
   */
+EIGEN_DEVICE_FUNC
 inline const LgammaReturnType
 lgamma() const
 {
@@ -346,6 +351,7 @@ lgamma() const
   *
   * \sa Eigen::digamma(), Eigen::polygamma(), lgamma()
   */
+EIGEN_DEVICE_FUNC
 inline const DigammaReturnType
 digamma() const
 {
@@ -364,6 +370,7 @@ digamma() const
   *
   * \sa erfc()
   */
+EIGEN_DEVICE_FUNC
 inline const ErfReturnType
 erf() const
 {
@@ -382,6 +389,7 @@ erf() const
   *
   * \sa erf()
   */
+EIGEN_DEVICE_FUNC
 inline const ErfcReturnType
 erfc() const
 {
@@ -455,6 +463,7 @@ cube() const
   *
   * \sa ceil(), floor()
   */
+EIGEN_DEVICE_FUNC
 inline const RoundReturnType
 round() const
 {
@@ -468,6 +477,7 @@ round() const
   *
   * \sa ceil(), round()
   */
+EIGEN_DEVICE_FUNC
 inline const FloorReturnType
 floor() const
 {
@@ -481,6 +491,7 @@ floor() const
   *
   * \sa floor(), round()
   */
+EIGEN_DEVICE_FUNC
 inline const CeilReturnType
 ceil() const
 {
@@ -494,6 +505,7 @@ ceil() const
   *
   * \sa isfinite(), isinf()
   */
+EIGEN_DEVICE_FUNC
 inline const IsNaNReturnType
 isNaN() const
 {
@@ -507,6 +519,7 @@ isNaN() const
   *
   * \sa isnan(), isfinite()
   */
+EIGEN_DEVICE_FUNC
 inline const IsInfReturnType
 isInf() const
 {
@@ -520,6 +533,7 @@ isInf() const
   *
   * \sa isnan(), isinf()
   */
+EIGEN_DEVICE_FUNC
 inline const IsFiniteReturnType
 isFinite() const
 {

From 7ef9f47b5874c33d15649a3312d463ecbd290365 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 6 Jun 2016 14:09:46 -0700
Subject: [PATCH 23/31] Misc small improvements to the reduction code.

---
 .../CXX11/src/Tensor/TensorReductionCuda.h     | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 0d1a098b7..e82530955 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -130,15 +130,17 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
       if (block == 0) {
         // We're the first block to run, initialize the output value
         atomicExch(output, reducer.initialize());
-        unsigned int old = atomicExch(semaphore, 2u);
-        assert(old == 1u);
+        __threadfence();
+        atomicExch(semaphore, 2u);
       }
       else {
+        // Wait for the first block to initialize the output value.
         // Use atomicCAS here to ensure that the reads aren't cached
-        unsigned int val = atomicCAS(semaphore, 2u, 2u);
-        while (val < 2u) {
+        unsigned int val;
+        do {
           val = atomicCAS(semaphore, 2u, 2u);
         }
+        while (val < 2u);
       }
     }
   }
@@ -166,12 +168,8 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   }
 
   if (gridDim.x > 1 && threadIdx.x == 0) {
-    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
-    assert(ticket >= 2u);
-    if (ticket == gridDim.x + 1) {
-      // We're the last block, reset the semaphore
-      *semaphore = 0;
-    }
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
   }
 }
 

From 84b2060a9e3e57e49bad6208873e52c327ad135b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 6 Jun 2016 17:16:19 -0700
Subject: [PATCH 24/31] Fixed compilation error with gcc 4.4

---
 unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 031dbf6f2..61df8032d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
         m_output(NULL) {
 
     // Accumulating a scalar isn't supported.
-    EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     eigen_assert(m_axis >= 0 && m_axis < NumDims);
 
     // Compute stride of scan axis
@@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
       return true;
     }
   }
-  
+
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
     return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);

From db0118342c2480ba0da7e8e4fbdd6aebaac687a4 Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg <chtz@informatik.uni-bremen.de>
Date: Tue, 7 Jun 2016 19:17:18 +0200
Subject: [PATCH 25/31] Fixed compilation of BVH_Example (required for make
 doc)

---
 unsupported/doc/examples/BVH_Example.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp
index 6b6fac075..4315fb440 100644
--- a/unsupported/doc/examples/BVH_Example.cpp
+++ b/unsupported/doc/examples/BVH_Example.cpp
@@ -6,9 +6,7 @@ using namespace Eigen;
 typedef AlignedBox<double, 2> Box2d;
 
 namespace Eigen {
-    namespace internal {
         Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
-    }
 }
 
 struct PointPointMinimizer //how to compute squared distances between points and rectangles

From 86aedc9282f85966461544c40403979b861d3e19 Mon Sep 17 00:00:00 2001
From: Igor Babuschkin <igor@babuschk.in>
Date: Tue, 7 Jun 2016 20:06:38 +0100
Subject: [PATCH 26/31] Add small fixes to TensorScanOp

---
 unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 61df8032d..66fcacd8e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -81,7 +81,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
-  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
@@ -152,6 +152,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return m_output[index];
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     if (m_output != NULL) {
       m_device.deallocate(m_output);

From 002804938087acc829941c7500a5230fa5cf28b0 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 9 Jun 2016 23:08:11 +0200
Subject: [PATCH 27/31] bug #1240: Remove any assumption on NEON vector types.

---
 Eigen/src/Core/arch/NEON/Complex.h    |  3 ++-
 Eigen/src/Core/arch/NEON/PacketMath.h | 11 ++---------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 234f29b80..ccc00e5a6 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -275,7 +275,8 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
-static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
+const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
+static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
 
 struct Packet1cd
 {
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index fa16bc9c8..e1247696d 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -49,14 +49,6 @@ typedef uint32x4_t  Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
-  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2D(X, Y)       {{X, Y}}
-#else
-  //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2D(X, Y)       {X, Y}
-#endif
-
 // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
 // which available on LLVM and GCC (at least)
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -584,7 +576,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { r
 
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
-  Packet2d countdown = EIGEN_INIT_NEON_PACKET2D(0, 1);
+  const double countdown_raw[] = {0.0,1.0};
+  const Packet2d countdown = vld1q_f64(countdown_raw);
   return vaddq_f64(pset1<Packet2d>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }

From a05607875a57129821f81e6aae1727357f4db5e6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Fri, 10 Jun 2016 11:53:56 -0700
Subject: [PATCH 28/31] Don't refer to the half2 type unless it's been defined

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 0f3778e6e..fdb5ee6b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
 template <>
 struct PacketType<half, GpuDevice> {
   typedef half2 type;

From 83904a21c11ffdb88f3ad8a65ded7bf46c1a068a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 11 Jun 2016 14:41:36 +0200
Subject: [PATCH 29/31] Make sure T(i+1,i)==0 when diagonalizing T(i:i+1,i:i+1)

---
 Eigen/src/Eigenvalues/RealQZ.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index c4715b954..b3a910dd9 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -630,11 +630,11 @@ namespace Eigen {
             internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right);
 
             // Apply resulting Jacobi rotations
-            m_T.applyOnTheLeft(i,i+1,j_left);
-            m_T.applyOnTheRight(i,i+1,j_right);
             m_S.applyOnTheLeft(i,i+1,j_left);
             m_S.applyOnTheRight(i,i+1,j_right);
-            m_T(i,i+1) = Scalar(0);
+            m_T.applyOnTheLeft(i,i+1,j_left);
+            m_T.applyOnTheRight(i,i+1,j_right);
+            m_T(i+1,i) = m_T(i,i+1) = Scalar(0);
 
             if(m_computeQZ) {
               m_Q.applyOnTheRight(i,i+1,j_left.transpose());

From a3a4714abac02ba48a683c3c3967cebee2833188 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 11 Jun 2016 14:41:53 +0200
Subject: [PATCH 30/31] Add debug output.

---
 test/real_qz.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/real_qz.cpp b/test/real_qz.cpp
index a1766c6d9..45ae8d763 100644
--- a/test/real_qz.cpp
+++ b/test/real_qz.cpp
@@ -49,11 +49,20 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
   for (Index i=0; i<A.cols(); i++)
     for (Index j=0; j<i; j++) {
       if (abs(qz.matrixT()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: T(" << i << "," << j << ") = " << qz.matrixT()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j<i-1 && abs(qz.matrixS()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j==i-1 && j>0 && abs(qz.matrixS()(i,j))!=Scalar(0.0) && abs(qz.matrixS()(i-1,j-1))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j)  << " && S(" << i-1 << "," << j-1 << ") = " << qz.matrixS()(i-1,j-1) << std::endl;
         all_zeros = false;
+      }
     }
   VERIFY_IS_EQUAL(all_zeros, true);
   VERIFY_IS_APPROX(qz.matrixQ()*qz.matrixS()*qz.matrixZ(), A);

From 4c61f00838202889045ec9e5ad0d60b79f00fec5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sun, 12 Jun 2016 22:42:13 +0200
Subject: [PATCH 31/31] Add missing explicit scalar conversion

---
 Eigen/src/Eigenvalues/Tridiagonalization.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 2030b5be1..1d102c17b 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -367,10 +367,10 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
     hCoeffs.tail(n-i-1).noalias() = (matA.bottomRightCorner(remainingSize,remainingSize).template selfadjointView<Lower>()
                                   * (conj(h) * matA.col(i).tail(remainingSize)));
 
-    hCoeffs.tail(n-i-1) += (conj(h)*Scalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
+    hCoeffs.tail(n-i-1) += (conj(h)*RealScalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
 
     matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>()
-      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), -1);
+      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), Scalar(-1));
 
     matA.col(i).coeffRef(i+1) = beta;
     hCoeffs.coeffRef(i) = h;