Merge upstream changes

2025-05-09 14:29:05 +08:00 · 2016-08-05 14:34:57 +01:00 · 2016-08-05 14:34:57 +01:00 · 0425118e2a
commit 0425118e2a
parent 9537e8b118 fe4b927e9c
168 changed files with 20927 additions and 2324 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,4 @@
-project(Eigen)
+project(Eigen3)

 cmake_minimum_required(VERSION 2.8.5)

@ -8,6 +8,11 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
  message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()

+# Alias Eigen_*_DIR to Eigen3_*_DIR:
+
+set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
+set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
+
 # guard against bad build-type strings

 if (NOT CMAKE_BUILD_TYPE)
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@ -31,7 +31,8 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Cholesky/LLT_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/Cholesky/LLT_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/Core
+++ b/Eigen/Core
@ -328,7 +328,6 @@ using std::ptrdiff_t;

 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
-#include "src/Core/SpecialFunctions.h"
 #include "src/Core/GenericPacketMath.h"

 #if defined EIGEN_VECTORIZE_AVX
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@ -45,9 +45,10 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Eigenvalues/RealSchur_MKL.h"
-#include "src/Eigenvalues/ComplexSchur_MKL.h"
-#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/Eigenvalues/RealSchur_LAPACKE.h"
+#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/LU
+++ b/Eigen/LU
@ -28,7 +28,8 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/LU/PartialPivLU_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
--- a/Eigen/QR
+++ b/Eigen/QR
@ -36,8 +36,9 @@
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/QR/HouseholderQR_MKL.h"
-#include "src/QR/ColPivHouseholderQR_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/QR/HouseholderQR_LAPACKE.h"
+#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/SVD
+++ b/Eigen/SVD
@ -37,7 +37,8 @@
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "src/SVD/JacobiSVD_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif

 #include "src/Core/util/ReenableStupidWarnings.h"
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@ -43,7 +43,7 @@ namespace Eigen { struct SluMatrix; }
  * - class SuperLU: a supernodal sequential LU factorization.
  * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
  *
-  * \warning This wrapper is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+  * \warning This wrapper requires at least versions 4.0 of SuperLU. The 3.x versions are not supported.
  *
  * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
  *
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@ -43,6 +43,8 @@ namespace internal {
  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
  * decomposition to determine whether a system of equations has a solution.
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
  */
 template<typename _MatrixType, int _UpLo> class LDLT
@ -52,7 +54,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
    enum {
      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
      UpLo = _UpLo
@ -61,7 +62,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
    typedef typename MatrixType::StorageIndex StorageIndex;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;

    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
@ -97,6 +98,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
    /** \brief Constructor with decomposition
      *
      * This calculates the decomposition for the input \a matrix.
+      *
      * \sa LDLT(Index size)
      */
    template<typename InputType>
@ -110,6 +112,23 @@ template<typename _MatrixType, int _UpLo> class LDLT
      compute(matrix.derived());
    }

+    /** \brief Constructs a LDLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa LDLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LDLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_transpositions(matrix.rows()),
+        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
+    }
+
    /** Clear any existing decomposition
     * \sa rankUpdate(w,sigma)
     */
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@ -41,6 +41,8 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
  */
 /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
@ -54,7 +56,6 @@ template<typename _MatrixType, int _UpLo> class LLT
    enum {
      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
    };
    typedef typename MatrixType::Scalar Scalar;
@ -95,6 +96,21 @@ template<typename _MatrixType, int _UpLo> class LLT
      compute(matrix.derived());
    }

+    /** \brief Constructs a LDLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is a Eigen::Ref.
+      *
+      * \sa LLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
+    }
+
    /** \returns a view of the upper triangular matrix U */
    inline typename Traits::MatrixU matrixU() const
    {
--- a/Eigen/src/Cholesky/LLT_LAPACKE.h
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h
@ -25,25 +25,22 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *     LLt decomposition based on LAPACKE_?potrf function.
 ********************************************************************************
 */

-#ifndef EIGEN_LLT_MKL_H
-#define EIGEN_LLT_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-#include <iostream>
+#ifndef EIGEN_LLT_LAPACKE_H
+#define EIGEN_LLT_LAPACKE_H

 namespace Eigen { 

 namespace internal {

-template<typename Scalar> struct mkl_llt;
+template<typename Scalar> struct lapacke_llt;

-#define EIGEN_MKL_LLT(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template<> struct mkl_llt<EIGTYPE> \
+#define EIGEN_LAPACKE_LLT(EIGTYPE, BLASTYPE, LAPACKE_PREFIX) \
+template<> struct lapacke_llt<EIGTYPE> \
 { \
  template<typename MatrixType> \
  static inline Index potrf(MatrixType& m, char uplo) \
@ -59,7 +56,7 @@ template<> struct mkl_llt<EIGTYPE> \
    a = &(m.coeffRef(0,0)); \
    lda = convert_index<lapack_int>(m.outerStride()); \
 \
-    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
+    info = LAPACKE_##LAPACKE_PREFIX##potrf( matrix_order, uplo, size, (BLASTYPE*)a, lda ); \
    info = (info==0) ? -1 : info>0 ? info-1 : size; \
    return info; \
  } \
@ -69,7 +66,7 @@ template<> struct llt_inplace<EIGTYPE, Lower> \
  template<typename MatrixType> \
  static Index blocked(MatrixType& m) \
  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'L'); \
  } \
  template<typename MatrixType, typename VectorType> \
  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@ -80,7 +77,7 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
  template<typename MatrixType> \
  static Index blocked(MatrixType& m) \
  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'U'); \
  } \
  template<typename MatrixType, typename VectorType> \
  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@ -90,13 +87,13 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
  } \
 };

-EIGEN_MKL_LLT(double, double, d)
-EIGEN_MKL_LLT(float, float, s)
-EIGEN_MKL_LLT(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LLT(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LLT(double, double, d)
+EIGEN_LAPACKE_LLT(float, float, s)
+EIGEN_LAPACKE_LLT(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LLT(scomplex, lapack_complex_float, c)

 } // end namespace internal

 } // end namespace Eigen

-#endif // EIGEN_LLT_MKL_H
+#endif // EIGEN_LLT_LAPACKE_H
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@ -88,10 +88,11 @@ private:
      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
         so it's only good for large enough sizes. */
    MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize)
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
      /* slice vectorization can be slow, so we only want it if the slices are big, which is
         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
+         in a fixed-size matrix
+         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
  };

 public:
@ -136,6 +137,11 @@ public:
              : int(Traversal) == int(LinearTraversal)
                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
                                              : int(NoUnrolling) )
+#if EIGEN_UNALIGNED_VECTORIZE
+              : int(Traversal) == int(SliceVectorizedTraversal)
+                ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
+                                         : int(NoUnrolling) )
+#endif
              : int(NoUnrolling)
  };

@ -165,6 +171,7 @@ public:
    EIGEN_DEBUG_VAR(MayLinearVectorize)
    EIGEN_DEBUG_VAR(MaySliceVectorize)
    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
    EIGEN_DEBUG_VAR(UnrollingLimit)
    EIGEN_DEBUG_VAR(MayUnrollCompletely)
    EIGEN_DEBUG_VAR(MayUnrollInner)
@ -276,24 +283,20 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };

-template<typename Kernel, int Index_, int Stop>
+template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling
 {
  typedef typename Kernel::PacketType PacketType;
-  enum {
-    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
-    DstAlignment = Kernel::AssignmentTraits::DstAlignment
-  };
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
  {
    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
-    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
  }
 };

-template<typename Kernel, int Stop>
-struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
+template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
 {
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
 };
@ -422,9 +425,10 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
    
    enum { size = DstXprType::SizeAtCompileTime,
-           packetSize = packet_traits<typename Kernel::Scalar>::size,
+           packetSize =unpacket_traits<PacketType>::size,
           alignedSize = (size/packetSize)*packetSize };

    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
@ -471,9 +475,11 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::AssignmentTraits Traits;
    const Index outerSize = kernel.outerSize();
    for(Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
+                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
  }
 };

@ -553,6 +559,29 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
  }
 };

+#if EIGEN_UNALIGNED_VECTORIZE
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+
+    enum { size = DstXprType::InnerSizeAtCompileTime,
+           packetSize =unpacket_traits<PacketType>::size,
+           vectorizableSize = (size/packetSize)*packetSize };
+
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
+    {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
+    }
+  }
+};
+#endif
+
+
 /***************************************************************************
 * Part 4 : Generic dense assignment kernel
 ***************************************************************************/
@ -680,7 +709,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstX
    
  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
  dense_assignment_loop<Kernel>::run(kernel);
 }

@ -709,7 +738,7 @@ template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Ki
 // This is the main assignment class
 template< typename DstXprType, typename SrcXprType, typename Functor,
          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
-          typename Scalar = typename DstXprType::Scalar>
+          typename EnableIf = void>
 struct Assignment;


@ -816,8 +845,10 @@ void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);

 // Generic Dense to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 {
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
@ -834,8 +865,10 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>

 // Generic assignment through evalTo.
 // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
 {
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@ -81,10 +81,10 @@ class vml_assign_traits

 #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
  template< typename DstXprType, typename SrcXprNested>                                                                         \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,             \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {    \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                             \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                   \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
@ -138,22 +138,24 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)

 #define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
-  template< typename DstXprType, typename SrcXprNested>                                                                       \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,           \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {  \
-    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                          \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                           \
+  template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
+  struct Assignment<DstXprType, CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                       \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> >, assign_op<EIGENTYPE,EIGENTYPE>,    \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                 \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
-      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.functor().m_exponent);                                          \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
      {                                                                                                                       \
-        VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent,                                           \
+        VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
      } else {                                                                                                                \
        const Index outerSize = dst.outerSize();                                                                              \
        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
-          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                           \
-                                                      &(src.nestedExpression().coeffRef(0, outer));                           \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.lhs().coeffRef(outer,0)) :                                        \
+                                                      &(src.lhs().coeffRef(0, outer));                                        \
          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
--- a/Eigen/src/Core/ConditionEstimator.h
+++ b/Eigen/src/Core/ConditionEstimator.h
@ -132,7 +132,8 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
  // Hager's algorithm to vastly underestimate ||matrix||_1.
  Scalar alternating_sign(RealScalar(1));
  for (Index i = 0; i < n; ++i) {
-    v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
+    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
    alternating_sign = -alternating_sign;
  }
  v = dec.solve(v);
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@ -316,8 +316,8 @@ struct Diagonal2Dense {};
 template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };

 // Diagonal matrix to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
 {
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@ -306,7 +306,7 @@ template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a
  // 32-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
 #endif
-#elif !EIGEN_COMP_MSVC
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
  __builtin_prefetch(addr);
 #endif
 }
@ -435,42 +435,6 @@ Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }

-/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
-
-/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
-    
-/** \internal \returns the zeta function of two arguments (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
-
-/** \internal \returns the polygamma function (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
-
-/** \internal \returns the erf(\a a) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet perf(const Packet& a) { using numext::erf; return erf(a); }
-
-/** \internal \returns the erfc(\a a) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
-
-/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
-
-/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
-
-/** \internal \returns the complementary incomplete gamma function betainc(\a a, \a b, \a x) */
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); }
-
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@ -78,6 +78,7 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
@ -102,8 +103,7 @@ namespace Eigen
  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
 #else
  template<typename Derived,typename ScalarExponent>
-  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value)
-                                      && ScalarBinaryOpTraits<typename Derived::Scalar,ScalarExponent,internal::scalar_pow_op<typename Derived::Scalar,ScalarExponent> >::Defined,
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
    return x.derived().pow(exponent);
@ -156,8 +156,7 @@ namespace Eigen
  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
 #else
  template<typename Scalar, typename Derived>
-  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value)
-                                      && ScalarBinaryOpTraits<Scalar,typename Derived::Scalar,internal::scalar_pow_op<Scalar,typename Derived::Scalar> >::Defined,
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
          const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
  {
@ -174,111 +173,6 @@ namespace Eigen
  }
 #endif

-  /** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
-    *
-    * This function computes the coefficient-wise incomplete gamma function.
-    *
-    * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-    * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
-    * type T to be supported.
-    *
-    * \sa Eigen::igammac(), Eigen::lgamma()
-    */
-  template<typename Derived,typename ExponentDerived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
-  igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
-  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
-      a.derived(),
-      x.derived()
-    );
-  }
-
-  /** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
-    *
-    * This function computes the coefficient-wise complementary incomplete gamma function.
-    *
-    * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-    * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
-    * type T to be supported.
-    *
-    * \sa Eigen::igamma(), Eigen::lgamma()
-    */
-  template<typename Derived,typename ExponentDerived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
-  igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
-  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
-      a.derived(),
-      x.derived()
-    );
-  }
-
-  /** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays.
-    *
-    * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x.
-    *
-    * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-    * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
-    * type T to be supported.
-    *
-    * \sa Eigen::digamma()
-    */
-  // * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
-  // * \sa ArrayBase::polygamma()
-  template<typename DerivedN,typename DerivedX>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
-  polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
-  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
-      n.derived(),
-      x.derived()
-    );
-  }
-
-  /** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given arrays.
-    *
-    * This function computes the regularized incomplete beta function (integral).
-    *
-    * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-    * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
-    * type T to be supported.
-    *
-    * \sa Eigen::betainc(), Eigen::lgamma()
-    */
-  template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
-  inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
-  betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
-  {
-    return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
-      a.derived(),
-      b.derived(),
-      x.derived()
-    );
-  }
-
-
-  /** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays.
-    *
-    * It returns the Riemann zeta function of two arguments \a x and \a q:
-    *
-    * \param x is the exposent, it must be > 1
-    * \param q is the shift, it must be > 0
-    *
-    * \note This function supports only float and double scalar types. To support other scalar types, the user has
-    * to provide implementations of zeta(T,T) for any scalar type T to be supported.
-    *
-    * \sa ArrayBase::zeta()
-    */
-  template<typename DerivedX,typename DerivedQ>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
-  zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
-  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
-      x.derived(),
-      q.derived()
-    );
-  }

  namespace internal
  {
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@ -125,29 +125,17 @@ DenseBase<Derived>::format(const IOFormat& fmt) const

 namespace internal {

-template<typename Scalar, bool IsInteger>
-struct significant_decimals_default_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline int run()
-  {
-    return cast<RealScalar,int>(numext::ceil(-numext::log(NumTraits<RealScalar>::epsilon())/numext::log(RealScalar(10))));
-  }
-};
-
-template<typename Scalar>
-struct significant_decimals_default_impl<Scalar, true>
-{
-  static inline int run()
-  {
-    return 0;
-  }
-};
-
+// NOTE: This helper is kept for backward compatibility with previous code specializing
+//       this internal::significant_decimals_impl structure. In the future we should directly
+//       call digits10() which has been introduced in July 2016 in 3.3.
 template<typename Scalar>
 struct significant_decimals_impl
-  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
-{};
+{
+  static inline int run()
+  {
+    return NumTraits<Scalar>::digits10();
+  }
+};

 /** \internal
  * print the matrix \a _m to the output stream \a s using the output format \a fmt */
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@ -12,6 +12,37 @@

 namespace Eigen {

+namespace internal {
+
+// default implementation of digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and log10(epsilon()) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits10_impl
+{
+  static int run() { return std::numeric_limits<T>::digits10; }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,false> // Floating point
+{
+  static int run() {
+    using std::log10;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log10(NumTraits<Real>::epsilon())));
+  }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,true> // Integer
+{
+  static int run() { return 0; }
+};
+
+} // end namespace internal
+
 /** \class NumTraits
  * \ingroup Core_Module
  *
@ -44,10 +75,14 @@ namespace Eigen {
  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
+  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  *     it returns a \a Real instead of a \a T.
  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
  *     value by the fuzzy comparison operators.
  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  *     which is used as the default implementation if specialized.
  */

 template<typename T> struct GenericNumTraits
@ -93,6 +128,13 @@ template<typename T> struct GenericNumTraits
  {
    return numext::numeric_limits<T>::epsilon();
  }
+
+  EIGEN_DEVICE_FUNC
+  static inline int digits10()
+  {
+    return internal::default_digits10_impl<T>::run();
+  }
+
  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision()
  {
@ -161,6 +203,8 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
  static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC
+  static inline int digits10() { return NumTraits<Real>::digits10(); }
 };

 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@ -190,6 +234,27 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 };

+template<> struct NumTraits<std::string>
+  : GenericNumTraits<std::string>
+{
+  enum {
+    RequireInitialization = 1,
+    ReadCost = HugeCost,
+    AddCost  = HugeCost,
+    MulCost  = HugeCost
+  };
+
+  static inline int digits10() { return 0; }
+
+private:
+  static inline std::string epsilon();
+  static inline std::string dummy_precision();
+  static inline std::string lowest();
+  static inline std::string highest();
+  static inline std::string infinity();
+  static inline std::string quiet_NaN();
+};
+
 } // end namespace Eigen

 #endif // EIGEN_NUMTRAITS_H
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@ -134,7 +134,7 @@ protected:
 // Dense = Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
  static EIGEN_STRONG_INLINE
@ -148,7 +148,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
 // Dense += Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
  static EIGEN_STRONG_INLINE
@ -162,7 +162,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
 // Dense -= Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
  static EIGEN_STRONG_INLINE
@ -179,7 +179,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
 // for instance, this is not good for inner products
 template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
 struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense, Scalar>
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
 {
  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
@ -259,7 +259,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>

 // Column major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
  evaluator<Rhs> rhsEval(rhs);
  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
@ -272,7 +272,7 @@ EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, cons

 // Row major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
  evaluator<Lhs> lhsEval(lhs);
  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
@ -489,11 +489,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
      
    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,

-    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit)
-                    && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ),
-
-    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                    && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ),
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),

    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@ -134,7 +134,7 @@ protected:
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<DecType,RhsType> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
@ -146,7 +146,7 @@ struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar

 // Specialization for "dst = dec.transpose().solve(rhs)"
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
@ -158,7 +158,7 @@ struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal:
 // Specialization for "dst = dec.adjoint().solve(rhs)"
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
 struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
-                  internal::assign_op<Scalar,Scalar>, Dense2Dense, Scalar>
+                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@ -812,8 +812,8 @@ template<> struct AssignmentKind<DenseShape,TriangularShape>      { typedef Tria
 template<> struct AssignmentKind<TriangularShape,DenseShape>      { typedef Dense2Triangular      Kind; };


-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
@ -823,8 +823,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar
  }
 };

-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
@ -832,8 +832,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
  }
 };

-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
 {
  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
@ -933,7 +933,7 @@ namespace internal {
  
 // Triangular = Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
@ -945,7 +945,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_

 // Triangular += Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
@ -956,7 +956,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass

 // Triangular -= Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@ -284,6 +284,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
    typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
+    typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
    typedef Reverse<ExpressionType, Direction> ReverseReturnType;

    template<int p> struct LpNormReturnType {
@ -456,7 +457,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
      *
      * \sa DenseBase::reverse() */
    EIGEN_DEVICE_FUNC
-    const ReverseReturnType reverse() const
+    const ConstReverseReturnType reverse() const
+    { return ConstReverseReturnType( _expression() ); }
+
+    /** \returns a writable matrix expression
+      * where each column (or row) are reversed.
+      *
+      * \sa reverse() const */
+    EIGEN_DEVICE_FUNC
+    ReverseReturnType reverse()
    { return ReverseReturnType( _expression() ); }

    typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -15,18 +16,20 @@ namespace Eigen {
 namespace internal {

 static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef _BIG_ENDIAN
+#ifdef __VSX__
+#if defined(_BIG_ENDIAN)
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #else
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #endif
+#endif

 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
  Packet4f  v;
 };
@ -39,6 +42,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
+    HasHalfPacket = 0,

    HasAdd    = 1,
    HasSub    = 1,
@ -49,6 +53,9 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
    HasAbs2   = 0,
    HasMin    = 0,
    HasMax    = 0,
+#ifdef __VSX__
+    HasBlend  = 1,
+#endif
    HasSetLinear = 0
  };
 };
@ -58,7 +65,6 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
  Packet2cf res;
-  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
  if((ptrdiff_t(&from) % 16) == 0)
    res.v = pload<Packet4f>((const float *)&from);
  else
@ -67,26 +73,32 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
  return res;
 }

+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
  std::complex<float> EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
-  return Packet2cf(vec_ld(0, (const float*)af));
+  return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
  std::complex<float> EIGEN_ALIGN16 af[2];
-  vec_st(from.v, 0, (float*)af);
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
 }

-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }

 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@ -100,30 +112,19 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
  v1 = vec_madd(v1, b.v, p4f_ZERO);
  // multiply a_im * b and get the conjugate result
  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
+  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
  // permute back to a proper order
  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
  
-  return Packet2cf(vec_add(v1, v2));
+  return Packet2cf(padd<Packet4f>(v1, v2));
 }

-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }

-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from)
-{
-  return pset1<Packet2cf>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr)    { EIGEN_PPC_PREFETCH(addr); }

 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@ -143,23 +144,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
  Packet4f b;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  b = padd(a.v, b);
-  return pfirst(Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
 }

 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
  Packet4f b1, b2;
 #ifdef _BIG_ENDIAN  
-  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
+  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
 #else
-  b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+  b1 = vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
 #endif
-  b2 = (Packet4f) vec_sld(b2, b2, 8);
-  b2 = padd(b1, b2);
+  b2 = vec_sld(b2, b2, 8);
+  b2 = padd<Packet4f>(b1, b2);

  return Packet2cf(b2);
 }
@ -168,10 +169,10 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
 {
  Packet4f b;
  Packet2cf prod;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  prod = pmul(a, Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));

-  return pfirst(prod);
+  return pfirst<Packet2cf>(prod);
 }

 template<int Offset>
@ -223,12 +224,30 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };

+template<> struct conj_helper<Packet4f, Packet2cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
+  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cf, Packet4f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
+  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
+};
+
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
  // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
-  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
-  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }

 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
@ -243,6 +262,14 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
  kernel.packet[0].v = tmp;
 }

+#ifdef __VSX__
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
 //---------- double ----------
 #ifdef __VSX__
 struct Packet1cd
@ -277,10 +304,10 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits

 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };

-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }

 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
@ -300,10 +327,10 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c
  to[1*stride] = af[1];
 }

-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }

 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
@ -317,23 +344,20 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
  v1 = vec_madd(a_re, b.v, p2d_ZERO);
  // multiply a_im * b and get the conjugate result
  v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
-  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));

-  return Packet1cd(vec_add(v1, v2));
+  return Packet1cd(padd<Packet2d>(v1, v2));
 }

-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }

-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)
-{
-  return pset1<Packet1cd>(*from);
-}
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)  { return pset1<Packet1cd>(*from); }

-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)    { EIGEN_PPC_PREFETCH(addr); }

 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
@ -345,20 +369,10 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac

 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }

-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
-  return pfirst(a);
-}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; }

-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
-  return vecs[0];
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
-  return pfirst(a);
-}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }

 template<int Offset>
 struct palign_impl<Offset,Packet1cd>
@ -402,13 +416,30 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
    return pconj(internal::pmul(a, b));
  }
 };
+template<> struct conj_helper<Packet2d, Packet1cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
+  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet1cd, Packet2d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
+  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
+};

 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
  // TODO optimize it for AltiVec
  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
-  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
-  return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
 }

 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@ -3,6 +3,7 @@
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -19,38 +20,79 @@ namespace Eigen {

 namespace internal {

+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+/* natural logarithm computed for 4 simultaneous float
+  return NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+#ifdef __VSX__
+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+static Packet2l p2l_1023 = { 1023, 1023 };
+static Packet2ul p2ul_52 = { 52, 52 };
+
+#endif
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog<Packet4f>(const Packet4f& _x)
 {
  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  /* the smallest non denormalized float number */
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
-  
-  /* natural logarithm computed for 4 simultaneous float
-    return NaN for x <= 0
-  */
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-

  Packet4i emm0;

@ -112,36 +154,17 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);

  Packet4f tmp, fx;
  Packet4i emm0;

  // clamp x
-  x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);

-  /* express exp(x) as exp(g + n*log(2)) */
+  // express exp(x) as exp(g + n*log(2))
  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);

-  fx = vec_floor(fx);
+  fx = pfloor(fx);

  tmp = pmul(fx, p4f_cephes_exp_C1);
  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
@ -171,14 +194,44 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
                 isnumber_mask);
 }

+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
 #ifdef __VSX__
+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_sqrt(x);
+}
+
 // VSX support varies between different compilers and even different
 // versions of the same compiler.  For gcc version >= 4.9.3, we can use
 // vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
 // a slow version that works with older compilers. 
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
 static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 0) || \
-    (EIGEN_GNUC_AT(4, 9) && __GNUC_PATCHLEVEL__ >= 3)
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
  return vec_cts(x, 0);    // TODO: check clang version.
 #else
  double tmp[2];
@ -194,36 +247,16 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
  Packet2d x = _x;

-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
  Packet2d tmp, fx;
  Packet2l emm0;

  // clamp x
  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);

-  fx = vec_floor(fx);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
+
+  fx = pfloor(fx);

  tmp = pmul(fx, p2d_cephes_exp_C1);
  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
@ -249,9 +282,6 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
  emm0 = ConvertToPacket2l(fx);

 #ifdef __POWER8_VECTOR__ 
-  static const Packet2l p2l_1023 = { 1023, 1023 };
-  static const Packet2ul p2ul_52 = { 52, 52 };
-
  emm0 = vec_add(emm0, p2l_1023);
  emm0 = vec_sl(emm0, p2ul_52);
 #else
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2014 Konstantinos Margaritis <markos@freevec.org>
+// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -42,7 +42,7 @@ typedef __vector unsigned char  Packet16uc;
 // and it doesn't really work to declare them global, so we define macros instead

 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))

 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)
@ -69,13 +69,13 @@ typedef __vector unsigned char  Packet16uc;
 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
-#ifndef __VSX__
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
-#endif
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+#ifndef __VSX__
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+#endif

 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
@ -95,8 +95,10 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
 // Handle endianness properly while loading constants
 // Define global static constants:
 #ifdef _BIG_ENDIAN
-static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
+#ifdef __VSX__
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif
 static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
@ -110,8 +112,8 @@ static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i

 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };

@ -121,6 +123,12 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8
 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 #endif // _BIG_ENDIAN

+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
+
 template<> struct packet_traits<float>  : default_packet_traits
 {
  typedef Packet4f type;
@ -129,15 +137,35 @@ template<> struct packet_traits<float>  : default_packet_traits
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
-    HasHalfPacket=0,
+    HasHalfPacket = 1,

-    // FIXME check the Has*
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
    HasSin  = 0,
    HasCos  = 0,
-    HasLog  = 1,
+    HasLog  = 0,
    HasExp  = 1,
-    HasSqrt = 0
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
  };
 };
 template<> struct packet_traits<int>    : default_packet_traits
@ -145,10 +173,16 @@ template<> struct packet_traits<int>    : default_packet_traits
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
-    // FIXME check the Has*
    Vectorizable = 1,
    AlignedOnScalar = 1,
-    size=4
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
  };
 };

@ -200,41 +234,56 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
  return s;
 }
-/*
-inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
-{
-  union {
-    Packet4bi v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}*/
-

 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}

-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}

 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from;
-  Packet4f vc = pload<Packet4f>(af);
-  vc = vec_splat(vc, 0);
-  return vc;
+  Packet4f v = {from, from, from, from};
+  return v;
 }

 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from;
-  Packet4i vc = pload<Packet4i>(ai);
-  vc = vec_splat(vc, 0);
-  return vc;
+  Packet4i v = {from, from, from, from};
+  return v;
 }
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4f>(const float *a,
@ -294,58 +343,24 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
  to[3*stride] = ai[3];
 }

-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }

-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }

-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }

-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }

 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
-/* Commented out: it's actually slower than processing it scalar
- *
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
-  //Set up constants, variables
-  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }

-  // Get the absolute values
-  a1  = vec_abs(a);
-  b1  = vec_abs(b);
-
-  // Get the signs using xor
-  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
-
-  // Do the multiplication for the asbolute values.
-  bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
-  low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
-  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
-  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
-  prod = vec_add( low_prod, high_prod );
-
-  // NOR the product and select only the negative elements according to the sign mask
-  prod_ = vec_nor(prod, prod);
-  prod_ = vec_sel(p4i_ZERO, prod_, sgn);
-
-  // Add 1 to the result to get the negative numbers
-  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
-  prod_ = vec_add(prod_, v1sel);
-
-  // Merge the results back to the final vector.
-  prod = vec_sel(prod, prod_, sgn);
-
-  return prod;
-}
-*/
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
 #ifndef __VSX__  // VSX actually provides a div instruction
@ -370,8 +385,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 }

 // for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }

 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
@ -391,6 +406,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }

+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
+
 #ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
@ -418,12 +437,12 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_DEBUG_UNALIGNED_LOAD
  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
 }
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_DEBUG_UNALIGNED_LOAD
  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
 }
 #endif
@ -494,16 +513,19 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f&
 }
 #endif

-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }

-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }

-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }

 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
@ -511,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
  Packet4f b, sum;
-  b   = (Packet4f) vec_sld(a, a, 8);
-  sum = vec_add(a, b);
-  b   = (Packet4f) vec_sld(sum, sum, 4);
-  sum = vec_add(sum, b);
+  b   = vec_sld(a, a, 8);
+  sum = a + b;
+  b   = vec_sld(sum, sum, 4);
+  sum += b;
  return pfirst(sum);
 }

@ -537,11 +559,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)

  // Now do the summation:
  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];

  return sum[0];
 }
@ -577,11 +599,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)

  // Now do the summation:
  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];

  return sum[0];
 }
@ -591,8 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
  Packet4f prod;
-  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
-  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
 }

 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@ -716,33 +738,52 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
  kernel.packet[3] = vec_mergel(t1, t3);
 }

+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+

 //---------- double ----------
 #ifdef __VSX__
 typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;
-
-static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO;
-static Packet2d p2d_ONE = { 1.0, 1.0 }; 
-static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO;
-static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
-
-#ifdef _BIG_ENDIAN
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
+#if EIGEN_COMP_CLANG
+typedef Packet2ul                    Packet2bl;
 #else
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
+typedef __vector __bool long         Packet2bl;
 #endif

-static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
+static Packet2l  p2l_ONE  = { 1, 1 };
+static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d  p2d_ZERO_ = { -0.0, -0.0 };
+
+#ifdef _BIG_ENDIAN
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+#else
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+#endif
+
+template<int index> Packet2d vec_splat_dbl(Packet2d& a);
+
+template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
 {
-  switch (index) {
-  case 0:
-    return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
-  case 1:
-    return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
-  }
-  return a;
+  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
 }

 template<> struct packet_traits<double> : default_packet_traits
@ -753,16 +794,41 @@ template<> struct packet_traits<double> : default_packet_traits
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,
-    HasHalfPacket = 0,
+    HasHalfPacket = 1,

+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
    HasExp  = 1,
-    HasSqrt = 0
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
  };
 };

 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };

+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  union {
+    Packet2l   v;
+    int64_t n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}

 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 {
@ -776,28 +842,43 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 }

 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}

-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}

 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
-  double EIGEN_ALIGN16 af[2];
-  af[0] = from;
-  Packet2d vc = pload<Packet2d>(af);
-  vc = vec_splat_dbl(vc, 0);
-  return vc;
+  Packet2d v = {from, from};
+  return v;
 }
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl(a1, 0);
-  a1 = vec_splat_dbl(a1, 1);
+  a0 = vec_splat_dbl<0>(a1);
+  a1 = vec_splat_dbl<1>(a1);
  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl(a3, 0);
-  a3 = vec_splat_dbl(a3, 1);
+  a2 = vec_splat_dbl<0>(a3);
+  a3 = vec_splat_dbl<1>(a3);
 }
+
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
  double EIGEN_ALIGN16 af[2];
@ -812,13 +893,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
  to[0*stride] = af[0];
  to[1*stride] = af[1];
 }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }

-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }

-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }

-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }

 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

@ -840,17 +922,22 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const

 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }

+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
+  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
  Packet2d p;
  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
  else                             p = ploadu<Packet2d>(from);
-  return vec_perm(p, p, p16uc_PSET64_HI);
+  return vec_splat_dbl<0>(p);
 }

 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
@ -859,32 +946,34 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d&
  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
 }

-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }

-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }

+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }

 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
  Packet2d b, sum;
-  b   = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
-  sum = vec_add(a, b);
-  return pfirst(sum);
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+  sum = a + b;
+  return pfirst<Packet2d>(sum);
 }

 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
  Packet2d v[2], sum;
-  v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
-  v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));
+  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
+  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
 
 #ifdef _BIG_ENDIAN
- sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8);
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
-  sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8);
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
 #endif

  return sum;
@ -893,19 +982,19 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 // mul
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }

 // min
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }

 // max
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }

 template<int Offset>
@ -915,9 +1004,9 @@ struct palign_impl<Offset,Packet2d>
  {
    if (Offset == 1)
 #ifdef _BIG_ENDIAN
-      first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
 #else
-      first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
 #endif
  }
 };
@ -931,6 +1020,11 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
  kernel.packet[1] = t1;
 }

+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
 #endif // __VSX__
 } // end namespace internal

--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@ -1,11 +1,3 @@
-// Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// class Eigen::half (inheriting from CUDA's __half struct) with
-// operator overloads such that it behaves basically as an arithmetic
-// type. It will be quite slow on CPUs (so it is recommended to stay
-// in fp32 for CPUs, except for simple parameter conversions, I/O
-// to disk and the likes), but fast on GPUs.
-//
-//
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
@ -32,6 +24,15 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting from CUDA's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+
 #ifndef EIGEN_HALF_CUDA_H
 #define EIGEN_HALF_CUDA_H

@ -42,6 +43,12 @@
 #endif


+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
 #if !defined(EIGEN_HAS_CUDA_FP16)

 // Make our own __half definition that is similar to CUDA's.
@ -53,70 +60,76 @@ struct __half {

 #endif

-namespace Eigen {
-
-namespace internal {
-
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);

-} // end namespace internal
+struct half_base : public __half {
+  EIGEN_DEVICE_FUNC half_base() {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+};
+
+} // namespace half_impl

 // Class definition.
-struct half : public __half {
+struct half : public half_impl::half_base {
+  #if !defined(EIGEN_HAS_CUDA_FP16)
+    typedef half_impl::__half __half;
+  #endif
+
  EIGEN_DEVICE_FUNC half() {}

-  EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}

  explicit EIGEN_DEVICE_FUNC half(bool b)
-      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
  template<class T>
  explicit EIGEN_DEVICE_FUNC half(const T& val)
-      : __half(internal::float_to_half_rtne(static_cast<float>(val))) {}
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
  explicit EIGEN_DEVICE_FUNC half(float f)
-      : __half(internal::float_to_half_rtne(f)) {}
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}

  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
    // +0.0 and -0.0 become false, everything else becomes true.
    return (x & 0x7fff) != 0;
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
-    return static_cast<signed char>(internal::half_to_float(*this));
+    return static_cast<signed char>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
-    return static_cast<unsigned char>(internal::half_to_float(*this));
+    return static_cast<unsigned char>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
-    return static_cast<short>(internal::half_to_float(*this));
+    return static_cast<short>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
-    return static_cast<unsigned short>(internal::half_to_float(*this));
+    return static_cast<unsigned short>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
-    return static_cast<int>(internal::half_to_float(*this));
+    return static_cast<int>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
-    return static_cast<unsigned int>(internal::half_to_float(*this));
+    return static_cast<unsigned int>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
-    return static_cast<long>(internal::half_to_float(*this));
+    return static_cast<long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
-    return static_cast<unsigned long>(internal::half_to_float(*this));
+    return static_cast<unsigned long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
-    return static_cast<long long>(internal::half_to_float(*this));
+    return static_cast<long long>(half_impl::half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
-    return static_cast<unsigned long long>(internal::half_to_float(*this));
+    return static_cast<unsigned long long>(half_to_float(*this));
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
-    return internal::half_to_float(*this);
+    return half_impl::half_to_float(*this);
  }
  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return static_cast<double>(internal::half_to_float(*this));
+    return static_cast<double>(half_impl::half_to_float(*this));
  }

  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
@ -125,6 +138,8 @@ struct half : public __half {
  }
 };

+namespace half_impl {
+
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530

 // Intrinsics for native fp16 support. Note that on current hardware,
@ -246,7 +261,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const hal
 // Division by an index. Do it in full float precision to avoid accuracy
 // issues in converting the denominator to half.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
-  return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
+  return half(static_cast<float>(a) / static_cast<float>(b));
 }

 // Conversion routines, including fallbacks for the host or older CUDA.
@ -254,8 +269,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.

-namespace internal {
-
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
  __half h;
  h.x = x;
@ -351,92 +364,62 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
 #endif
 }

-} // end namespace internal
+// --- standard functions ---

-// Traits.
-
-namespace internal {
-
-template<> struct is_arithmetic<half> { enum { value = true }; };
-
-} // end namespace internal
-
-template<> struct NumTraits<Eigen::half>
-    : GenericNumTraits<Eigen::half>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
-    return internal::raw_uint16_to_half(0x0800);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-2f); }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
-    return internal::raw_uint16_to_half(0x7bff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
-    return internal::raw_uint16_to_half(0xfbff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
-    return internal::raw_uint16_to_half(0x7c00);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
-    return internal::raw_uint16_to_half(0x7c01);
-  }
-};
-
-// Infinity/NaN checks.
-
-namespace numext {
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
  return (a.x & 0x7fff) == 0x7c00;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hisnan(a);
 #else
  return (a.x & 0x7fff) > 0x7c00;
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) {
-  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
 }

-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
-  Eigen::half result;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+  half result;
  result.x = a.x & 0x7FFF;
  return result;
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) {
-  return Eigen::half(::expf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+  return half(::expf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
-  return Eigen::half(::logf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+  return half(::logf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) {
-  return Eigen::half(::sqrtf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
+  return half(::log10f(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) {
-  return Eigen::half(::powf(float(a), float(b)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+  return half(::sqrtf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) {
-  return Eigen::half(::sinf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+  return half(::powf(float(a), float(b)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) {
-  return Eigen::half(::cosf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) {
-  return Eigen::half(::tanf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) {
-  return Eigen::half(::tanhf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
-  return Eigen::half(::floorf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) {
-  return Eigen::half(::ceilf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+  return half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+  return half(::ceilf(float(a)));
 }

-template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hlt(b, a) ? b : a;
 #else
@ -445,7 +428,7 @@ template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::
  return f2 < f1 ? b : a;
 #endif
 }
-template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hlt(a, b) ? b : a;
 #else
@ -455,40 +438,59 @@ template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::
 #endif
 }

-#if EIGEN_HAS_C99_MATH
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
-  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
-}
-#endif
-} // end namespace numext
+
+} // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template<>
+struct random_default_impl<half, false, false>
+{
+  static inline half run(const half& x, const half& y)
+  {
+    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run()
+  {
+    return run(half(-1.f), half(1.f));
+  }
+};
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7c01);
+  }
+};

 } // end namespace Eigen

-// Standard mathematical functions and trancendentals.
+// C-like standard mathematical functions and trancendentals.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
  Eigen::half result;
  result.x = a.x & 0x7FFF;
@ -512,24 +514,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
  return Eigen::half(::ceilf(float(a)));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) {
-  return (Eigen::numext::isnan)(a);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) {
-  return (Eigen::numext::isinf)(a);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a) {
-  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
-}
-

 namespace std {

-EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) {
-  os << static_cast<float>(v);
-  return os;
-}
-
 #if __cplusplus > 199711L
 template <>
 struct hash<Eigen::half> {
@ -552,10 +539,36 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 // ldg() has an overload for __half, but we also need one for Eigen::half.
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
-  return Eigen::internal::raw_uint16_to_half(
+  return Eigen::half_impl::raw_uint16_to_half(
      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
 }
 #endif


+#if defined(__CUDA_ARCH__)
+namespace Eigen {
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::half& h) {
+  return (half_impl::isnan)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::half& h) {
+  return (half_impl::isinf)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::half& h) {
+  return (half_impl::isfinite)(h);
+}
+
+} // namespace Eigen
+}  // namespace numext
+#endif
+
 #endif // EIGEN_HALF_CUDA_H
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@ -27,6 +27,7 @@ float4 plog<float4>(const float4& a)
 template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 plog<double2>(const double2& a)
 {
+  using ::log;
  return make_double2(log(a.x), log(a.y));
 }

@ -39,6 +40,7 @@ float4 pexp<float4>(const float4& a)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pexp<double2>(const double2& a)
 {
+  using ::exp;
  return make_double2(exp(a.x), exp(a.y));
 }

@ -51,6 +53,7 @@ float4 psqrt<float4>(const float4& a)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 psqrt<double2>(const double2& a)
 {
+  using ::sqrt;
  return make_double2(sqrt(a.x), sqrt(a.y));
 }

@ -66,138 +69,6 @@ double2 prsqrt<double2>(const double2& a)
  return make_double2(rsqrt(a.x), rsqrt(a.y));
 }

-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plgamma<float4>(const float4& a)
-{
-  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plgamma<double2>(const double2& a)
-{
-  return make_double2(lgamma(a.x), lgamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pdigamma<float4>(const float4& a)
-{
-  using numext::digamma;
-  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pdigamma<double2>(const double2& a)
-{
-  using numext::digamma;
-  return make_double2(digamma(a.x), digamma(a.y));
-}
-    
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pzeta<float4>(const float4& x, const float4& q)
-{
-    using numext::zeta;
-    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pzeta<double2>(const double2& x, const double2& q)
-{
-    using numext::zeta;
-    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 ppolygamma<float4>(const float4& n, const float4& x)
-{
-    using numext::polygamma;
-    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 ppolygamma<double2>(const double2& n, const double2& x)
-{
-    using numext::polygamma;
-    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perf<float4>(const float4& a)
-{
-  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perf<double2>(const double2& a)
-{
-  return make_double2(erf(a.x), erf(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perfc<float4>(const float4& a)
-{
-  return make_float4(erfcf(a.x), erfcf(a.y), erfcf(a.z), erfcf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perfc<double2>(const double2& a)
-{
-  return make_double2(erfc(a.x), erfc(a.y));
-}
-
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigamma<float4>(const float4& a, const float4& x)
-{
-  using numext::igamma;
-  return make_float4(
-      igamma(a.x, x.x),
-      igamma(a.y, x.y),
-      igamma(a.z, x.z),
-      igamma(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigamma<double2>(const double2& a, const double2& x)
-{
-  using numext::igamma;
-  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigammac<float4>(const float4& a, const float4& x)
-{
-  using numext::igammac;
-  return make_float4(
-      igammac(a.x, x.x),
-      igammac(a.y, x.y),
-      igammac(a.z, x.z),
-      igammac(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigammac<double2>(const double2& a, const double2& x)
-{
-  using numext::igammac;
-  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
-{
-  using numext::betainc;
-  return make_float4(
-      betainc(a.x, b.x, x.x),
-      betainc(a.y, b.y, x.y),
-      betainc(a.z, b.z, x.z),
-      betainc(a.w, b.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
-{
-  using numext::betainc;
-  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
-}

 #endif

--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@ -228,7 +228,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return Eigen::half(internal::raw_uint16_to_half(__float2half_rn(a1 + a2)));
+  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
 #endif
 }

@ -262,7 +262,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(c
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return Eigen::half(internal::raw_uint16_to_half(__float2half_rn(a1 * a2)));
+  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
 #endif
 }

@ -372,7 +372,7 @@ template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from)
 }

 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
-  return raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
 }

 template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
@ -607,7 +607,7 @@ template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from)
 }

 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
-  return raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
 }

 template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
@ -618,17 +618,17 @@ template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const

  Eigen::half h[4];

-  Eigen::half ha = raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = raw_uint16_to_half(static_cast<unsigned short>(b64));
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha + hb;
-  ha = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha + hb;
-  ha = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha + hb;
-  ha = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha + hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
@ -641,17 +641,17 @@ template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const

  Eigen::half h[4];

-  Eigen::half ha = raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = raw_uint16_to_half(static_cast<unsigned short>(b64));
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha * hb;
-  ha = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha * hb;
-  ha = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha * hb;
-  ha = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha * hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 // Heavily based on Gael's SSE version.
 //
 // This Source Code Form is subject to the terms of the Mozilla
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@ -57,21 +57,6 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }

-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
-{
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet1cd>(af);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
-{
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<double> >(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@ -429,57 +429,6 @@ template<> struct functor_traits<scalar_boolean_xor_op> {
  };
 };

-/** \internal
-  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
-  *
-  * \sa class CwiseBinaryOp, Cwise::igamma
-  */
-template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar>
-{
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
-    using numext::igamma; return igamma(a, x);
-  }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
-    return internal::pigamma(a, x);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_igamma_op<Scalar> > {
-  enum {
-    // Guesstimate
-    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasIGamma
-  };
-};
-
-
-/** \internal
-  * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
-  *
-  * \sa class CwiseBinaryOp, Cwise::igammac
-  */
-template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar>
-{
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
-    using numext::igammac; return igammac(a, x);
-  }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
-  {
-    return internal::pigammac(a, x);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_igammac_op<Scalar> > {
-  enum {
-    // Guesstimate
-    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasIGammac
-  };
-};


 //---------- binary functors bound to a constant, thus appearing as a unary functor ----------
--- a/Eigen/src/Core/functors/TernaryFunctors.h
+++ b/Eigen/src/Core/functors/TernaryFunctors.h
@ -16,29 +16,7 @@ namespace internal {

 //---------- associative ternary functors ----------

-/** \internal
-  * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
-  *
-  */
-template<typename Scalar> struct scalar_betainc_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const {
-    using numext::betainc; return betainc(x, a, b);
-  }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const
-  {
-    return internal::pbetainc(x, a, b);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_betainc_op<Scalar> > {
-  enum {
-    // Guesstimate
-    Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBetaInc
-  };
-};
+

 } // end namespace internal

--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@ -472,142 +472,6 @@ struct functor_traits<scalar_asin_op<Scalar> >
 };


-/** \internal
- * \brief Template functor to compute the natural log of the absolute
- * value of Gamma of a scalar
- * \sa class CwiseUnaryOp, Cwise::lgamma()
- */
-template<typename Scalar> struct scalar_lgamma_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::lgamma; return lgamma(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_lgamma_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasLGamma
-  };
-};
-
-/** \internal
- * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
- * \sa class CwiseUnaryOp, Cwise::digamma()
- */
-template<typename Scalar> struct scalar_digamma_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::digamma; return digamma(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_digamma_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasDiGamma
-  };
-};
-    
-/** \internal
- * \brief Template functor to compute the Riemann Zeta function of two arguments.
- * \sa class CwiseUnaryOp, Cwise::zeta()
- */
-template<typename Scalar> struct scalar_zeta_op {
-    EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
-        using numext::zeta; return zeta(x, q);
-    }
-    typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_zeta_op<Scalar> >
-{
-    enum {
-        // Guesstimate
-        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-        PacketAccess = packet_traits<Scalar>::HasZeta
-    };
-};
-
-/** \internal
- * \brief Template functor to compute the polygamma function.
- * \sa class CwiseUnaryOp, Cwise::polygamma()
- */
-template<typename Scalar> struct scalar_polygamma_op {
-    EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
-        using numext::polygamma; return polygamma(n, x);
-    }
-    typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_polygamma_op<Scalar> >
-{
-    enum {
-        // Guesstimate
-        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-        PacketAccess = packet_traits<Scalar>::HasPolygamma
-    };
-};
-
-/** \internal
- * \brief Template functor to compute the Gauss error function of a
- * scalar
- * \sa class CwiseUnaryOp, Cwise::erf()
- */
-template<typename Scalar> struct scalar_erf_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::erf; return erf(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_erf_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasErf
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the Complementary Error Function
- * of a scalar
- * \sa class CwiseUnaryOp, Cwise::erfc()
- */
-template<typename Scalar> struct scalar_erfc_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::erfc; return erfc(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_erfc_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasErfc
-  };
-};
-
-
 /** \internal
  * \brief Template functor to compute the atan of a scalar
  * \sa class CwiseUnaryOp, ArrayBase::atan()
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@ -299,16 +299,6 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kr = 8,
-    mr = Traits::mr,
-    nr = Traits::nr
-  };
-  if (k > kr) k -= k % kr;
-  if (m > mr) m -= m % mr;
-  if (n > nr) n -= n % nr;
 }

 template<typename LhsScalar, typename RhsScalar, typename Index>
@ -1536,12 +1526,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

          // The following piece of code wont work for 512 bit registers
-          // Moreover it assumes that there is a half packet of the same size
+          // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
          // as nr (which is currently 4) for the return type.
          typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
          if ((SwappedTraits::LhsProgress % 4) == 0 &&
              (SwappedTraits::LhsProgress <= 8) &&
-              unpacket_traits<SResPacketHalf>::size==4)
+              (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
          {
            SAccPacket C0, C1, C2, C3;
            straits.initAcc(C0);
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@ -111,7 +111,7 @@ template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::comp
 };

 template<typename From,typename To> struct get_factor {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };

 template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@ -203,15 +203,21 @@ template<typename Scalar> struct scalar_random_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
 template<typename Scalar,bool iscpx> struct scalar_sign_op;
-template<typename Scalar> struct scalar_igamma_op;
-template<typename Scalar> struct scalar_igammac_op;
-template<typename Scalar> struct scalar_betainc_op;
-
 template<typename Scalar,typename ScalarExponent> struct scalar_pow_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;

+// SpecialFunctions module
+template<typename Scalar> struct scalar_lgamma_op;
+template<typename Scalar> struct scalar_digamma_op;
+template<typename Scalar> struct scalar_erf_op;
+template<typename Scalar> struct scalar_erfc_op;
+template<typename Scalar> struct scalar_igamma_op;
+template<typename Scalar> struct scalar_igammac_op;
+template<typename Scalar> struct scalar_zeta_op;
+template<typename Scalar> struct scalar_betainc_op;
+
 } // end namespace internal

 struct IOFormat;
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@ -49,7 +49,7 @@
  #define EIGEN_USE_LAPACKE
 #endif

-#if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML)
  #define EIGEN_USE_MKL
 #endif

@ -72,7 +72,7 @@
 #endif

 #if defined EIGEN_USE_MKL
-#include <mkl_lapacke.h>
+
 #define EIGEN_MKL_VML_THRESHOLD 128

 /* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@ -13,7 +13,7 @@

 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 92
+#define EIGEN_MINOR_VERSION 93

 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@ -71,6 +71,15 @@
  #define EIGEN_COMP_MSVC 0
 #endif

+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
+//  name  ver   MSC_VER
+//  2008    9      1500
+//  2010   10      1600
+//  2012   11      1700
+//  2013   12      1800
+//  2015   14      1900
+//  "15"   15      1900
+
 /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC
 #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
  #define EIGEN_COMP_MSVC_STRICT _MSC_VER
@ -897,6 +906,9 @@ namespace Eigen {
    return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME)(derived(), other.derived()); \
  }

+#define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,TYPEA,TYPEB) \
+  (Eigen::internal::has_ReturnType<Eigen::ScalarBinaryOpTraits<TYPEA,TYPEB,EIGEN_CAT(EIGEN_CAT(Eigen::internal::scalar_,OPNAME),_op)<TYPEA,TYPEB>  > >::value)
+
 #define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR,SCALAR,OPNAME) \
  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<typename internal::traits<EXPR>::Scalar,SCALAR>, const EXPR, \
                const typename internal::plain_constant_type<EXPR,SCALAR>::type>
@ -905,20 +917,27 @@ namespace Eigen {
  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<SCALAR,typename internal::traits<EXPR>::Scalar>, \
                const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>

+// Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
+#if EIGEN_COMP_MSVC_STRICT<=1600
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
+#else
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
+#endif
+
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
  template <typename T> EIGEN_DEVICE_FUNC inline \
-  const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA ScalarBinaryOpTraits<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<Scalar EIGEN_COMMA T> >::Defined>::type,OPNAME) \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
  (METHOD)(const T& scalar) const { \
-    typedef typename internal::promote_scalar_arg<Scalar,T,ScalarBinaryOpTraits<Scalar,T,EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<Scalar,T> >::Defined>::type PromotedT; \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \
           typename internal::plain_constant_type<Derived,PromotedT>::type(derived().rows(), derived().cols(), internal::scalar_constant_op<PromotedT>(scalar))); \
  }

 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
  template <typename T> EIGEN_DEVICE_FUNC inline friend \
-  const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA ScalarBinaryOpTraits<T EIGEN_COMMA Scalar EIGEN_COMMA EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<T EIGEN_COMMA Scalar> >::Defined>::type,Derived,OPNAME) \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
  (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
-    typedef typename internal::promote_scalar_arg<Scalar,T,ScalarBinaryOpTraits<T,Scalar,EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<T,Scalar> >::Defined>::type PromotedT; \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \
           typename internal::plain_constant_type<Derived,PromotedT>::type(matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)), matrix.derived()); \
  }
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@ -16,6 +16,10 @@
 #include <math_constants.h>
 #endif

+#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+#include <cstdint>
+#endif
+
 namespace Eigen {

 namespace internal {
@ -29,7 +33,7 @@ namespace internal {

 // Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
 // and older versions do not provide *intptr_t types.
-#if EIGEN_COMP_ICC>=1600
+#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
 typedef std::intptr_t  IntPtr;
 typedef std::uintptr_t UIntPtr;
 #else
@ -145,7 +149,7 @@ struct is_convertible
 /** \internal Allows to enable/disable an overload
  * according to a compile time condition.
  */
-template<bool Condition, typename T> struct enable_if;
+template<bool Condition, typename T=void> struct enable_if;

 template<typename T> struct enable_if<true,T>
 { typedef T type; };
@ -354,6 +358,19 @@ struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
 };
 #endif

+// Check whether T::ReturnType does exist
+template <typename T>
+struct has_ReturnType
+{
+  typedef char yes[1];
+  typedef char no[2];
+
+  template <typename C> static yes& testFunctor(C const *, typename C::ReturnType const * = 0);
+  static no& testFunctor(...);
+
+  static const bool value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(yes);
+};
+
 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
  * Usage example: \code meta_sqrt<1023>::ret \endcode
  */
@ -431,67 +448,6 @@ T div_ceil(const T &a, const T &b)

 } // end namespace numext

-
-/** \class ScalarBinaryOpTraits
-  * \ingroup Core_Module
-  *
-  * \brief Determines whether the given binary operation of two numeric types is allowed and what the scalar return type is.
-  *
-  * \sa CwiseBinaryOp
-  */
-template<typename ScalarA, typename ScalarB, typename BinaryOp=void>
-struct ScalarBinaryOpTraits
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-  // for backward compatibility, use the hints given by the (deprecated) internal::scalar_product_traits class.
-  : internal::scalar_product_traits<ScalarA,ScalarB>
-#endif // EIGEN_PARSED_BY_DOXYGEN
-{};
-
-template<typename T, typename BinaryOp>
-struct ScalarBinaryOpTraits<T,T,BinaryOp>
-{
-  enum { Defined = 1 };
-  typedef T ReturnType;
-};
-
-// For Matrix * Permutation
-template<typename T, typename BinaryOp>
-struct ScalarBinaryOpTraits<T,void,BinaryOp>
-{
-  enum { Defined = 1 };
-  typedef T ReturnType;
-};
-
-// For Permutation * Matrix
-template<typename T, typename BinaryOp>
-struct ScalarBinaryOpTraits<void,T,BinaryOp>
-{
-  enum { Defined = 1 };
-  typedef T ReturnType;
-};
-
-// for Permutation*Permutation
-template<typename BinaryOp>
-struct ScalarBinaryOpTraits<void,void,BinaryOp>
-{
-  enum { Defined = 1 };
-  typedef void ReturnType;
-};
-
-template<typename T, typename BinaryOp>
-struct ScalarBinaryOpTraits<T,std::complex<T>,BinaryOp>
-{
-  enum { Defined = 1 };
-  typedef std::complex<T> ReturnType;
-};
-
-template<typename T, typename BinaryOp>
-struct ScalarBinaryOpTraits<std::complex<T>, T,BinaryOp>
-{
-  enum { Defined = 1 };
-  typedef std::complex<T> ReturnType;
-};
-
 } // end namespace Eigen

 #endif // EIGEN_META_H
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@ -48,7 +48,7 @@ inline IndexDest convert_index(const IndexSrc& idx) {
 // promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
 //    expression * scalar
 // Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
-// The IsSupported template parameter must be provided by the caller as: ScalarBinaryOpTraits<ExprScalar,T,op>::Defined using the proper order for ExprScalar and T.
+// The IsSupported template parameter must be provided by the caller as: internal::has_ReturnType<ScalarBinaryOpTraits<ExprScalar,T,op> >::value using the proper order for ExprScalar and T.
 // Then the logic is as follows:
 //  - if the operation is natively supported as defined by IsSupported, then the scalar type is not promoted, and T is returned.
 //  - otherwise, NumTraits<ExprScalar>::Literal is returned if T is implicitly convertible to NumTraits<ExprScalar>::Literal AND that this does not imply a float to integer conversion.
@ -708,12 +708,88 @@ std::string demangle_flags(int f)

 } // end namespace internal

+
+/** \class ScalarBinaryOpTraits
+  * \ingroup Core_Module
+  *
+  * \brief Determines whether the given binary operation of two numeric types is allowed and what the scalar return type is.
+  *
+  * This class permits to control the scalar return type of any binary operation performed on two different scalar types through (partial) template specializations.
+  *
+  * For instance, let \c U1, \c U2 and \c U3 be three user defined scalar types for which most operations between instances of \c U1 and \c U2 returns an \c U3.
+  * You can let Eigen knows that by defining:
+    \code
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U1,U2,BinaryOp> { typedef U3 ReturnType;  };
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U2,U1,BinaryOp> { typedef U3 ReturnType;  };
+    \endcode
+  * You can then explicitly disable some particular operations to get more explicit error messages:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_max_op<U1,U2> > {};
+    \endcode
+  * Or customize the return type for individual operation:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_sum_op<U1,U2> > { typedef U1 ReturnType; };
+    \endcode
+  *
+  * \sa CwiseBinaryOp
+  */
+template<typename ScalarA, typename ScalarB, typename BinaryOp=internal::scalar_product_op<ScalarA,ScalarB> >
+struct ScalarBinaryOpTraits
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  // for backward compatibility, use the hints given by the (deprecated) internal::scalar_product_traits class.
+  : internal::scalar_product_traits<ScalarA,ScalarB>
+#endif // EIGEN_PARSED_BY_DOXYGEN
+{};
+
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Matrix * Permutation
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,void,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Permutation * Matrix
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<void,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// for Permutation*Permutation
+template<typename BinaryOp>
+struct ScalarBinaryOpTraits<void,void,BinaryOp>
+{
+  typedef void ReturnType;
+};
+
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,std::complex<T>,BinaryOp>
+{
+  typedef std::complex<T> ReturnType;
+};
+
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<std::complex<T>, T,BinaryOp>
+{
+  typedef std::complex<T> ReturnType;
+};
+
 // We require Lhs and Rhs to have "compatible" scalar types.
 // It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
 // So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
 // add together a float matrix and a double matrix.
 #define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
-  EIGEN_STATIC_ASSERT(int(ScalarBinaryOpTraits<LHS, RHS,BINOP>::Defined), \
+  EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS,BINOP> >::value), \
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
    
 } // end namespace Eigen
--- a/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
@ -25,21 +25,19 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *    Complex Schur needed to complex unsymmetrical eigenvalues/eigenvectors.
 ********************************************************************************
 */

-#ifndef EIGEN_COMPLEX_SCHUR_MKL_H
-#define EIGEN_COMPLEX_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_COMPLEX_SCHUR_LAPACKE_H
+#define EIGEN_COMPLEX_SCHUR_LAPACKE_H

 namespace Eigen { 

-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */

-#define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_SCHUR_COMPLEX(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
 template<> template<typename InputType> inline \
 ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
 ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
@ -61,9 +59,9 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Eigen
      return *this; \
  } \
  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
  char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT1 select = 0; \
  jobvs = (computeU) ? 'V' : 'N'; \
  m_matU.resize(n, n); \
  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
@ -71,7 +69,7 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Eigen
  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
  Matrix<EIGTYPE, Dynamic, Dynamic> w; \
  w.resize(n, 1);\
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)w.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
  if(info == 0) \
    m_info = Success; \
  else \
@ -83,11 +81,11 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Eigen
 \
 }

-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, RowMajor, LAPACK_ROW_MAJOR)

 } // end namespace Eigen

-#endif // EIGEN_COMPLEX_SCHUR_MKL_H
+#endif // EIGEN_COMPLEX_SCHUR_LAPACKE_H
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@ -324,11 +324,12 @@ template<typename MatrixType>
 MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const
 {
  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
  Index n = m_eivalues.rows();
  MatrixType matD = MatrixType::Zero(n,n);
  for (Index i=0; i<n; ++i)
  {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i))))
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i)), precision))
      matD.coeffRef(i,i) = numext::real(m_eivalues.coeff(i));
    else
    {
@ -345,11 +346,12 @@ typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eige
 {
  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
  eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
  Index n = m_eivec.cols();
  EigenvectorsType matV(n,n);
  for (Index j=0; j<n; ++j)
  {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j))) || j+1==n)
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j)), precision) || j+1==n)
    {
      // we have a real eigen value
      matV.col(j) = m_eivec.col(j).template cast<ComplexScalar>();
@ -451,26 +453,6 @@ EigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool comput
  return *this;
 }

-// Complex scalar division.
-template<typename Scalar>
-std::complex<Scalar> cdiv(const Scalar& xr, const Scalar& xi, const Scalar& yr, const Scalar& yi)
-{
-  using std::abs;
-  Scalar r,d;
-  if (abs(yr) > abs(yi))
-  {
-      r = yi/yr;
-      d = yr + r*yi;
-      return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d);
-  }
-  else
-  {
-      r = yr/yi;
-      d = yi + r*yr;
-      return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d);
-  }
-}
-

 template<typename MatrixType>
 void EigenSolver<MatrixType>::doComputeEigenvectors()
@ -557,7 +539,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
      }
      else
      {
-        std::complex<Scalar> cc = cdiv<Scalar>(Scalar(0),-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
+        ComplexScalar cc = ComplexScalar(Scalar(0),-m_matT.coeff(n-1,n)) / ComplexScalar(m_matT.coeff(n-1,n-1)-p,q);
        m_matT.coeffRef(n-1,n-1) = numext::real(cc);
        m_matT.coeffRef(n-1,n) = numext::imag(cc);
      }
@ -580,7 +562,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
          l = i;
          if (m_eivalues.coeff(i).imag() == RealScalar(0))
          {
-            std::complex<Scalar> cc = cdiv(-ra,-sa,w,q);
+            ComplexScalar cc = ComplexScalar(-ra,-sa) / ComplexScalar(w,q);
            m_matT.coeffRef(i,n-1) = numext::real(cc);
            m_matT.coeffRef(i,n) = numext::imag(cc);
          }
@ -594,7 +576,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
            if ((vr == Scalar(0)) && (vi == Scalar(0)))
              vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));

-            std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
+            ComplexScalar cc = ComplexScalar(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra) / ComplexScalar(vr,vi);
            m_matT.coeffRef(i,n-1) = numext::real(cc);
            m_matT.coeffRef(i,n) = numext::imag(cc);
            if (abs(x) > (abs(lastw) + abs(q)))
@ -604,7 +586,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
            }
            else
            {
-              cc = cdiv(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n),lastw,q);
+              cc = ComplexScalar(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n)) / ComplexScalar(lastw,q);
              m_matT.coeffRef(i+1,n-1) = numext::real(cc);
              m_matT.coeffRef(i+1,n) = numext::imag(cc);
            }
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@ -310,6 +310,7 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
  if (m_realQZ.info() == Success)
  {
    m_matS = m_realQZ.matrixS();
+    const MatrixType &matT = m_realQZ.matrixT();
    if (computeEigenvectors)
      m_eivec = m_realQZ.matrixZ().transpose();
  
@ -322,7 +323,7 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
      if (i == A.cols() - 1 || m_matS.coeff(i+1, i) == Scalar(0))
      {
        m_alphas.coeffRef(i) = m_matS.coeff(i, i);
-        m_betas.coeffRef(i)  = m_realQZ.matrixT().coeff(i,i);
+        m_betas.coeffRef(i)  = matT.coeff(i,i);
        ++i;
      }
      else
@ -332,7 +333,9 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp

        // T =  [a 0]
        //      [0 b]
-        RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i+1, i+1);
+        RealScalar a = matT.diagonal().coeff(i),
+                   b = matT.diagonal().coeff(i+1);
+        // ^^ NOTE: using diagonal()(i) instead of coeff(i,i) workarounds a MSVC bug.
        Matrix<RealScalar,2,2> S2 = m_matS.template block<2,2>(i,i) * Matrix<Scalar,2,1>(b,a).asDiagonal();

        Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1));
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@ -236,7 +236,7 @@ template<typename _MatrixType> class RealSchur
    typedef Matrix<Scalar,3,1> Vector3s;

    Scalar computeNormOfT();
-    Index findSmallSubdiagEntry(Index iu);
+    Index findSmallSubdiagEntry(Index iu, const Scalar& maxDiagEntry);
    void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
    void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
    void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
@ -253,19 +253,25 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
  if (maxIters == -1)
    maxIters = m_maxIterationsPerRow * matrix.rows();

+  Scalar scale = matrix.derived().cwiseAbs().maxCoeff();
+
  // Step 1. Reduce to Hessenberg form
-  m_hess.compute(matrix.derived());
+  m_hess.compute(matrix.derived()/scale);

  // Step 2. Reduce to real Schur form  
  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+
+  m_matT *= scale;
  
  return *this;
 }
 template<typename MatrixType>
 template<typename HessMatrixType, typename OrthMatrixType>
 RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU)
-{  
-  m_matT = matrixH; 
+{
+  using std::abs;
+
+  m_matT = matrixH;
  if(computeU)
    m_matU = matrixQ;
  
@ -287,14 +293,18 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa

  if(norm!=0)
  {
+    Scalar maxDiagEntry = m_matT.cwiseAbs().diagonal().maxCoeff();
+
    while (iu >= 0)
    {
-      Index il = findSmallSubdiagEntry(iu);
+      Index il = findSmallSubdiagEntry(iu,maxDiagEntry);

      // Check for convergence
      if (il == iu) // One root found
      {
        m_matT.coeffRef(iu,iu) = m_matT.coeff(iu,iu) + exshift;
+        // keep track of the largest diagonal coefficient
+        maxDiagEntry = numext::maxi<Scalar>(maxDiagEntry,abs(m_matT.coeffRef(iu,iu)));
        if (iu > 0)
          m_matT.coeffRef(iu, iu-1) = Scalar(0);
        iu--;
@ -303,6 +313,8 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
      else if (il == iu-1) // Two roots found
      {
        splitOffTwoRows(iu, computeU, exshift);
+        // keep track of the largest diagonal coefficient
+        maxDiagEntry = numext::maxi<Scalar>(maxDiagEntry,numext::maxi(abs(m_matT.coeff(iu,iu)), abs(m_matT.coeff(iu-1,iu-1))));
        iu -= 2;
        iter = 0;
      }
@ -317,6 +329,8 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
        Index im;
        initFrancisQRStep(il, iu, shiftInfo, im, firstHouseholderVector);
        performFrancisQRStep(il, im, iu, computeU, firstHouseholderVector, workspace);
+        // keep track of the largest diagonal coefficient
+        maxDiagEntry = numext::maxi(maxDiagEntry,m_matT.cwiseAbs().diagonal().segment(im,iu-im).maxCoeff());
      }
    }
  }
@ -346,14 +360,13 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()

 /** \internal Look for single small sub-diagonal element and returns its index */
 template<typename MatrixType>
-inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
+inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& maxDiagEntry)
 {
  using std::abs;
  Index res = iu;
  while (res > 0)
  {
-    Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
-    if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s)
+    if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * maxDiagEntry)
      break;
    res--;
  }
--- a/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
+++ b/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
@ -25,21 +25,19 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *    Real Schur needed to real unsymmetrical eigenvalues/eigenvectors.
 ********************************************************************************
 */

-#ifndef EIGEN_REAL_SCHUR_MKL_H
-#define EIGEN_REAL_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_REAL_SCHUR_LAPACKE_H
+#define EIGEN_REAL_SCHUR_LAPACKE_H

 namespace Eigen { 

-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */

-#define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_SCHUR_REAL(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
 template<> template<typename InputType> inline \
 RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
 RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
@ -47,9 +45,9 @@ RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBas
  eigen_assert(matrix.cols() == matrix.rows()); \
 \
  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
  char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT2 select = 0; \
  jobvs = (computeU) ? 'V' : 'N'; \
  m_matU.resize(n, n); \
  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
@ -57,7 +55,7 @@ RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBas
  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
  Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi; \
  wr.resize(n, 1); wi.resize(n, 1); \
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), (MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)wr.data(), (LAPACKE_TYPE*)wi.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
  if(info == 0) \
    m_info = Success; \
  else \
@ -69,11 +67,11 @@ RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBas
 \
 }

-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)

 } // end namespace Eigen

-#endif // EIGEN_REAL_SCHUR_MKL_H
+#endif // EIGEN_REAL_SCHUR_LAPACKE_H
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@ -492,11 +492,12 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
  
  typedef typename DiagType::RealScalar RealScalar;
  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
  
  while (end>0)
  {
    for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1]))) || abs(subdiag[i]) <= considerAsZero)
+      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1])),precision) || abs(subdiag[i]) <= considerAsZero)
        subdiag[i] = 0;

    // find the largest unreduced block
@ -739,14 +740,18 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
    EigenvectorsType& eivecs = solver.m_eivec;
    VectorType& eivals = solver.m_eivalues;
  
-    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
-    Scalar scale = mat.cwiseAbs().maxCoeff();
-    scale = numext::maxi(scale,Scalar(1));
-    MatrixType scaledMat = mat / scale;
-    
+    // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar shift = mat.trace() / Scalar(2);
+    MatrixType scaledMat = mat;
+    scaledMat.coeffRef(0,1) = mat.coeff(1,0);
+    scaledMat.diagonal().array() -= shift;
+    Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+    if(scale > Scalar(0))
+      scaledMat /= scale;
+
    // Compute the eigenvalues
    computeRoots(scaledMat,eivals);
-    
+
    // compute the eigen vectors
    if(computeEigenvectors)
    {
@ -774,10 +779,11 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
        eivecs.col(0) << eivecs.col(1).unitOrthogonal();
      }
    }
-    
+
    // Rescale back to the original size.
    eivals *= scale;
-    
+    eivals.array() += shift;
+
    solver.m_info = Success;
    solver.m_isInitialized = true;
    solver.m_eigenvectorsOk = computeEigenvectors;
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
@ -25,21 +25,19 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *    Self-adjoint eigenvalues/eigenvectors.
 ********************************************************************************
 */

-#ifndef EIGEN_SAEIGENSOLVER_MKL_H
-#define EIGEN_SAEIGENSOLVER_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_SAEIGENSOLVER_LAPACKE_H
+#define EIGEN_SAEIGENSOLVER_LAPACKE_H

 namespace Eigen { 

-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */

-#define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \
+#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \
 template<> template<typename InputType> inline \
 SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
 SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, int options) \
@ -65,11 +63,11 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
  } \
 \
  lda = internal::convert_index<lapack_int>(m_eivec.outerStride()); \
-  matrix_order=MKLCOLROW; \
+  matrix_order=LAPACKE_COLROW; \
  char jobz, uplo='L'/*, range='A'*/; \
  jobz = computeEigenvectors ? 'V' : 'N'; \
 \
-  info = LAPACKE_##MKLNAME( matrix_order, jobz, uplo, n, (MKLTYPE*)m_eivec.data(), lda, (MKLRTYPE*)m_eivalues.data() ); \
+  info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
  m_info = (info==0) ? Success : NoConvergence; \
  m_isInitialized = true; \
  m_eigenvectorsOk = computeEigenvectors; \
@ -77,15 +75,15 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
 }


-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double,   double,                double, dsyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(float,    float,                 float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)

-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double,   double,                double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(float,    float,                 float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)

 } // end namespace Eigen

--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@ -329,7 +329,7 @@ protected:

 // dense = homogeneous
 template< typename DstXprType, typename ArgType, typename Scalar>
-struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
 {
  typedef Homogeneous<ArgType,Vertical> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
@ -341,7 +341,7 @@ struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op

 // dense = homogeneous
 template< typename DstXprType, typename ArgType, typename Scalar>
-struct Assignment<DstXprType, Homogeneous<ArgType,Horizontal>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Horizontal>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
 {
  typedef Homogeneous<ArgType,Horizontal> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@ -726,7 +726,7 @@ QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerive
  using std::acos;
  using std::sin;
  using std::abs;
-  static const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
+  const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
  Scalar d = this->dot(other);
  Scalar absD = abs(d);

--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@ -44,6 +44,7 @@ public:
  typedef typename internal::traits<SolveWithGuess>::Scalar Scalar;
  typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
  typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type Base;
+  typedef typename internal::ref_selector<SolveWithGuess>::type Nested;
  
  SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
    : m_dec(dec), m_rhs(rhs), m_guess(guess)
@ -81,7 +82,8 @@ struct evaluator<SolveWithGuess<Decomposition,RhsType, GuessType> >
    : m_result(solve.rows(), solve.cols())
  {
    ::new (static_cast<Base*>(this)) Base(m_result);
-    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result, solve().guess());
+    m_result = solve.guess();
+    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);
  }
  
 protected:  
@ -91,7 +93,7 @@ protected:
 // Specialization for "dst = dec.solveWithGuess(rhs)"
 // NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
 template<typename DstXprType, typename DecType, typename RhsType, typename GuessType, typename Scalar>
-struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, internal::assign_op<Scalar,Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef SolveWithGuess<DecType,RhsType,GuessType> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@ -85,7 +85,8 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
  using std::sqrt;
  using std::abs;
  typedef typename NumTraits<Scalar>::Real RealScalar;
-  if(y == Scalar(0))
+  RealScalar deno = RealScalar(2)*abs(y);
+  if(deno < (std::numeric_limits<RealScalar>::min)())
  {
    m_c = Scalar(1);
    m_s = Scalar(0);
@ -93,7 +94,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
  }
  else
  {
-    RealScalar tau = (x-z)/(RealScalar(2)*abs(y));
+    RealScalar tau = (x-z)/deno;
    RealScalar w = sqrt(numext::abs2(tau) + RealScalar(1));
    RealScalar t;
    if(tau>RealScalar(0))
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@ -52,6 +52,8 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
  * \include class_FullPivLU.cpp
  * Output: \verbinclude class_FullPivLU.out
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
  * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
  */
 template<typename _MatrixType> class FullPivLU
@ -97,6 +99,15 @@ template<typename _MatrixType> class FullPivLU
    template<typename InputType>
    explicit FullPivLU(const EigenBase<InputType>& matrix);

+    /** \brief Constructs a LU factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa FullPivLU(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivLU(EigenBase<InputType>& matrix);
+
    /** Computes the LU decomposition of the given matrix.
      *
      * \param matrix the matrix of which to compute the LU decomposition.
@ -105,7 +116,11 @@ template<typename _MatrixType> class FullPivLU
      * \returns a reference to *this
      */
    template<typename InputType>
-    FullPivLU& compute(const EigenBase<InputType>& matrix);
+    FullPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      computeInPlace();
+      return *this;
+    }

    /** \returns the LU decomposition matrix: the upper-triangular part is U, the
      * unit-lower-triangular part is L (at least for square matrices; in the non-square
@ -459,25 +474,28 @@ FullPivLU<MatrixType>::FullPivLU(const EigenBase<InputType>& matrix)

 template<typename MatrixType>
 template<typename InputType>
-FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix)
+FullPivLU<MatrixType>::FullPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_q(matrix.cols()),
+    m_rowsTranspositions(matrix.rows()),
+    m_colsTranspositions(matrix.cols()),
+    m_isInitialized(false),
+    m_usePrescribedThreshold(false)
 {
-  check_template_parameters();
-
-  // the permutations are stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
-
-  m_lu = matrix.derived();
-  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
-
  computeInPlace();
-
-  m_isInitialized = true;
-  return *this;
 }

 template<typename MatrixType>
 void FullPivLU<MatrixType>::computeInPlace()
 {
+  check_template_parameters();
+
+  // the permutations are stored as int indices, so just to be sure:
+  eigen_assert(m_lu.rows()<=NumTraits<int>::highest() && m_lu.cols()<=NumTraits<int>::highest());
+
+  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+
  const Index size = m_lu.diagonalSize();
  const Index rows = m_lu.rows();
  const Index cols = m_lu.cols();
@ -557,6 +575,8 @@ void FullPivLU<MatrixType>::computeInPlace()
    m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));

  m_det_pq = (number_of_transpositions%2) ? -1 : 1;
+
+  m_isInitialized = true;
 }

 template<typename MatrixType>
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@ -26,6 +26,17 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
  };
 };

+template<typename T,typename Derived>
+struct enable_if_ref;
+// {
+//   typedef Derived type;
+// };
+
+template<typename T,typename Derived>
+struct enable_if_ref<Ref<T>,Derived> {
+  typedef Derived type;
+};
+
 } // end namespace internal

 /** \ingroup LU_Module
@ -57,6 +68,8 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
  *
  * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
  * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
  */
 template<typename _MatrixType> class PartialPivLU
@ -102,8 +115,22 @@ template<typename _MatrixType> class PartialPivLU
    template<typename InputType>
    explicit PartialPivLU(const EigenBase<InputType>& matrix);

+    /** Constructor for \link InplaceDecomposition inplace decomposition \endlink
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *
+      * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+      * If you need to deal with non-full rank, use class FullPivLU instead.
+      */
    template<typename InputType>
-    PartialPivLU& compute(const EigenBase<InputType>& matrix);
+    explicit PartialPivLU(EigenBase<InputType>& matrix);
+
+    template<typename InputType>
+    PartialPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      compute();
+      return *this;
+    }

    /** \returns the LU decomposition matrix: the upper-triangular part is U, the
      * unit-lower-triangular part is L (at least for square matrices; in the non-square
@ -251,6 +278,8 @@ template<typename _MatrixType> class PartialPivLU
      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
    }

+    void compute();
+
    MatrixType m_lu;
    PermutationType m_p;
    TranspositionType m_rowsTranspositions;
@ -284,7 +313,7 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size)
 template<typename MatrixType>
 template<typename InputType>
 PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
-  : m_lu(matrix.rows(), matrix.rows()),
+  : m_lu(matrix.rows(),matrix.cols()),
    m_p(matrix.rows()),
    m_rowsTranspositions(matrix.rows()),
    m_l1_norm(0),
@ -294,6 +323,19 @@ PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
  compute(matrix.derived());
 }

+template<typename MatrixType>
+template<typename InputType>
+PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_rowsTranspositions(matrix.rows()),
+    m_l1_norm(0),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+  compute();
+}
+
 namespace internal {

 /** \internal This is the blocked version of fullpivlu_unblocked() */
@ -470,19 +512,17 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t
 } // end namespace internal

 template<typename MatrixType>
-template<typename InputType>
-PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix)
+void PartialPivLU<MatrixType>::compute()
 {
  check_template_parameters();

  // the row permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<NumTraits<int>::highest());
+  eigen_assert(m_lu.rows()<NumTraits<int>::highest());

-  m_lu = matrix.derived();
  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();

-  eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
-  const Index size = matrix.rows();
+  eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
+  const Index size = m_lu.rows();

  m_rowsTranspositions.resize(size);

@ -493,7 +533,6 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu
  m_p = m_rowsTranspositions;

  m_isInitialized = true;
-  return *this;
 }

 template<typename MatrixType>
--- a/Eigen/src/LU/PartialPivLU_LAPACKE.h
+++ b/Eigen/src/LU/PartialPivLU_LAPACKE.h
@ -25,7 +25,7 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *     LU decomposition with partial pivoting based on LAPACKE_?getrf function.
 ********************************************************************************
 */
@ -33,15 +33,13 @@
 #ifndef EIGEN_PARTIALLU_LAPACK_H
 #define EIGEN_PARTIALLU_LAPACK_H

-#include "Eigen/src/Core/util/MKL_support.h"
-
 namespace Eigen { 

 namespace internal {

-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */

-#define EIGEN_MKL_LU_PARTPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_LAPACKE_LU_PARTPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
 template<int StorageOrder> \
 struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
 { \
@ -61,7 +59,7 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
    n = convert_index<lapack_int>(cols); \
    nb_transpositions = 0; \
 \
-    info = LAPACKE_##MKLPREFIX##getrf( matrix_order, m, n, (MKLTYPE*)a, lda, ipiv ); \
+    info = LAPACKE_##LAPACKE_PREFIX##getrf( matrix_order, m, n, (LAPACKE_TYPE*)a, lda, ipiv ); \
 \
    for(int i=0;i<m;i++) { ipiv[i]--; if (ipiv[i]!=i) nb_transpositions++; } \
 \
@ -73,10 +71,10 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
  } \
 };

-EIGEN_MKL_LU_PARTPIV(double, double, d)
-EIGEN_MKL_LU_PARTPIV(float, float, s)
-EIGEN_MKL_LU_PARTPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LU_PARTPIV(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LU_PARTPIV(double, double, d)
+EIGEN_LAPACKE_LU_PARTPIV(float, float, s)
+EIGEN_LAPACKE_LU_PARTPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LU_PARTPIV(scomplex, lapack_complex_float,  c)

 } // end namespace internal

--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@ -41,6 +41,8 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
  * This decomposition performs column pivoting in order to be rank-revealing and improve
  * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
  * \sa MatrixBase::colPivHouseholderQr()
  */
 template<typename _MatrixType> class ColPivHouseholderQR
@ -51,7 +53,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
    enum {
      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
    };
@ -59,7 +60,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
    typedef typename MatrixType::RealScalar RealScalar;
    // FIXME should be int
    typedef typename MatrixType::StorageIndex StorageIndex;
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
    typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
@ -135,6 +135,27 @@ template<typename _MatrixType> class ColPivHouseholderQR
      compute(matrix.derived());
    }

+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa ColPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit ColPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_colsPermutation(PermIndexType(matrix.cols())),
+        m_colsTranspositions(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_colNormsUpdated(matrix.cols()),
+        m_colNormsDirect(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
+    }
+
    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
      * *this is the QR decomposition, if any exists.
      *
@ -453,21 +474,19 @@ template<typename MatrixType>
 template<typename InputType>
 ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
-  check_template_parameters();
-
-  // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.cols()<=NumTraits<int>::highest());
-
-  m_qr = matrix;
-
+  m_qr = matrix.derived();
  computeInPlace();
-
  return *this;
 }

 template<typename MatrixType>
 void ColPivHouseholderQR<MatrixType>::computeInPlace()
 {
+  check_template_parameters();
+
+  // the column permutation is stored as int indices, so just to be sure:
+  eigen_assert(m_qr.cols()<=NumTraits<int>::highest());
+
  using std::abs;

  Index rows = m_qr.rows();
@ -598,7 +617,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
 namespace internal {

 template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar,Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef ColPivHouseholderQR<MatrixType> QrType;
  typedef Inverse<QrType> SrcXprType;
--- a/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@ -25,22 +25,20 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *    Householder QR decomposition of a matrix with column pivoting based on
 *    LAPACKE_?geqp3 function.
 ********************************************************************************
 */

-#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H

 namespace Eigen { 

-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */

-#define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_QR_COLPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
 template<> template<typename InputType> inline \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
@ -66,9 +64,9 @@ ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynami
  m_colsPermutation.indices().setZero(); \
 \
  lapack_int lda = internal::convert_index<lapack_int,Index>(m_qr.outerStride()); \
-  lapack_int matrix_order = MKLCOLROW; \
-  LAPACKE_##MKLPREFIX##geqp3( matrix_order, internal::convert_index<lapack_int,Index>(rows), internal::convert_index<lapack_int,Index>(cols), \
-                              (MKLTYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (MKLTYPE*)m_hCoeffs.data()); \
+  lapack_int matrix_order = LAPACKE_COLROW; \
+  LAPACKE_##LAPACKE_PREFIX##geqp3( matrix_order, internal::convert_index<lapack_int,Index>(rows), internal::convert_index<lapack_int,Index>(cols), \
+                              (LAPACKE_TYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (LAPACKE_TYPE*)m_hCoeffs.data()); \
  m_isInitialized = true; \
  m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
  m_hCoeffs.adjointInPlace(); \
@ -84,16 +82,16 @@ ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynami
  return *this; \
 }

-EIGEN_MKL_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, ColMajor, LAPACK_COL_MAJOR)

-EIGEN_MKL_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, RowMajor, LAPACK_ROW_MAJOR)

 } // end namespace Eigen

-#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
+#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@ -29,16 +29,19 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
  *
  * \param MatrixType the type of the matrix of which we are computing the COD.
  *
-  * This class performs a rank-revealing complete ortogonal decomposition of a
+  * This class performs a rank-revealing complete orthogonal decomposition of a
  * matrix  \b A into matrices \b P, \b Q, \b T, and \b Z such that
  * \f[
-  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \begin{matrix} \mathbf{T} &
-  *  \mathbf{0} \\ \mathbf{0} & \mathbf{0} \end{matrix} \, \mathbf{Z}
+  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \,
+  *                     \begin{bmatrix} \mathbf{T} &  \mathbf{0} \\
+  *                                     \mathbf{0} & \mathbf{0} \end{bmatrix} \, \mathbf{Z}
  * \f]
  * by using Householder transformations. Here, \b P is a permutation matrix,
  * \b Q and \b Z are unitary matrices and \b T an upper triangular matrix of
  * size rank-by-rank. \b A may be rank deficient.
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
  * \sa MatrixBase::completeOrthogonalDecomposition()
  */
 template <typename _MatrixType>
@ -48,16 +51,12 @@ class CompleteOrthogonalDecomposition {
  enum {
    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    Options = MatrixType::Options,
    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
  };
  typedef typename MatrixType::Scalar Scalar;
  typedef typename MatrixType::RealScalar RealScalar;
  typedef typename MatrixType::StorageIndex StorageIndex;
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options,
-                 MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-      MatrixQType;
  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
      PermutationType;
@ -114,12 +113,29 @@ class CompleteOrthogonalDecomposition {
  explicit CompleteOrthogonalDecomposition(const EigenBase<InputType>& matrix)
      : m_cpqr(matrix.rows(), matrix.cols()),
        m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
-        m_temp(matrix.cols()) {
+        m_temp(matrix.cols())
+  {
    compute(matrix.derived());
  }

+  /** \brief Constructs a complete orthogonal decomposition from a given matrix
+    *
+    * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+    *
+    * \sa CompleteOrthogonalDecomposition(const EigenBase&)
+    */
+  template<typename InputType>
+  explicit CompleteOrthogonalDecomposition(EigenBase<InputType>& matrix)
+    : m_cpqr(matrix.derived()),
+      m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
+      m_temp(matrix.cols())
+  {
+    computeInPlace();
+  }
+
+
  /** This method computes the minimum-norm solution X to a least squares
-   * problem \f[\mathrm{minimize} ||A X - B|| \f], where \b A is the matrix of
+   * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
   * which \c *this is the complete orthogonal decomposition.
   *
   * \param B the right-hand sides of the problem to solve.
@ -165,7 +181,12 @@ class CompleteOrthogonalDecomposition {
  const MatrixType& matrixT() const { return m_cpqr.matrixQR(); }

  template <typename InputType>
-  CompleteOrthogonalDecomposition& compute(const EigenBase<InputType>& matrix);
+  CompleteOrthogonalDecomposition& compute(const EigenBase<InputType>& matrix) {
+    // Compute the column pivoted QR factorization A P = Q R.
+    m_cpqr.compute(matrix);
+    computeInPlace();
+    return *this;
+  }

  /** \returns a const reference to the column permutation matrix */
  const PermutationType& colsPermutation() const {
@ -354,6 +375,8 @@ class CompleteOrthogonalDecomposition {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
  }

+  void computeInPlace();
+
  /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
   */
  template <typename Rhs>
@ -384,20 +407,16 @@ CompleteOrthogonalDecomposition<MatrixType>::logAbsDeterminant() const {
 * CompleteOrthogonalDecomposition(const MatrixType&)
 */
 template <typename MatrixType>
-template <typename InputType>
-CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition<
-    MatrixType>::compute(const EigenBase<InputType>& matrix) {
+void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
+{
  check_template_parameters();

  // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.cols() <= NumTraits<int>::highest());
-
-  // Compute the column pivoted QR factorization A P = Q R.
-  m_cpqr.compute(matrix);
+  eigen_assert(m_cpqr.cols() <= NumTraits<int>::highest());

  const Index rank = m_cpqr.rank();
-  const Index cols = matrix.cols();
-  const Index rows = matrix.rows();
+  const Index cols = m_cpqr.cols();
+  const Index rows = m_cpqr.rows();
  m_zCoeffs.resize((std::min)(rows, cols));
  m_temp.resize(cols);

@ -443,7 +462,6 @@ CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition<
      }
    }
  }
-  return *this;
 }

 template <typename MatrixType>
@ -509,12 +527,12 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(

 namespace internal {

-template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<Scalar,Scalar>, Dense2Dense, Scalar>
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
 {
  typedef CompleteOrthogonalDecomposition<MatrixType> CodType;
  typedef Inverse<CodType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
  {
    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows()));
  }
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@ -50,6 +50,8 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
  * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
  * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
  * \sa MatrixBase::fullPivHouseholderQr()
  */
 template<typename _MatrixType> class FullPivHouseholderQR
@ -60,7 +62,6 @@ template<typename _MatrixType> class FullPivHouseholderQR
    enum {
      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
    };
@ -135,6 +136,26 @@ template<typename _MatrixType> class FullPivHouseholderQR
      compute(matrix.derived());
    }

+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is a Eigen::Ref.
+      *
+      * \sa FullPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_permutation(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
+    }
+
    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
      * \c *this is the QR decomposition.
      *
@ -430,18 +451,16 @@ template<typename MatrixType>
 template<typename InputType>
 FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
-  check_template_parameters();
-  
  m_qr = matrix.derived();
-  
  computeInPlace();
-  
  return *this;
 }

 template<typename MatrixType>
 void FullPivHouseholderQR<MatrixType>::computeInPlace()
 {
+  check_template_parameters();
+
  using std::abs;
  Index rows = m_qr.rows();
  Index cols = m_qr.cols();
@ -560,7 +579,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
 namespace internal {
  
 template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar,Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
  typedef FullPivHouseholderQR<MatrixType> QrType;
  typedef Inverse<QrType> SrcXprType;
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@ -37,6 +37,8 @@ namespace Eigen {
  * This Householder QR decomposition is faster, but less numerically stable and less feature-full than
  * FullPivHouseholderQR or ColPivHouseholderQR.
  *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
  * \sa MatrixBase::householderQr()
  */
 template<typename _MatrixType> class HouseholderQR
@ -47,7 +49,6 @@ template<typename _MatrixType> class HouseholderQR
    enum {
      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
    };
@ -102,6 +103,24 @@ template<typename _MatrixType> class HouseholderQR
      compute(matrix.derived());
    }

+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is a Eigen::Ref.
+      *
+      * \sa HouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit HouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_temp(matrix.cols()),
+        m_isInitialized(false)
+    {
+      computeInPlace();
+    }
+
    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
      * *this is the QR decomposition, if any exists.
      *
@ -151,7 +170,11 @@ template<typename _MatrixType> class HouseholderQR
    }

    template<typename InputType>
-    HouseholderQR& compute(const EigenBase<InputType>& matrix);
+    HouseholderQR& compute(const EigenBase<InputType>& matrix) {
+      m_qr = matrix.derived();
+      computeInPlace();
+      return *this;
+    }

    /** \returns the absolute value of the determinant of the matrix of which
      * *this is the QR decomposition. It has only linear complexity
@ -203,6 +226,8 @@ template<typename _MatrixType> class HouseholderQR
    {
      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
    }
+
+    void computeInPlace();
    
    MatrixType m_qr;
    HCoeffsType m_hCoeffs;
@ -354,16 +379,14 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
  * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
  */
 template<typename MatrixType>
-template<typename InputType>
-HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+void HouseholderQR<MatrixType>::computeInPlace()
 {
  check_template_parameters();
  
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
  Index size = (std::min)(rows,cols);

-  m_qr = matrix.derived();
  m_hCoeffs.resize(size);

  m_temp.resize(cols);
@ -371,7 +394,6 @@ HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const EigenBase<In
  internal::householder_qr_inplace_blocked<MatrixType, HCoeffsType>::run(m_qr, m_hCoeffs, 48, m_temp.data());

  m_isInitialized = true;
-  return *this;
 }

 #ifndef __CUDACC__
--- a/Eigen/src/QR/HouseholderQR_LAPACKE.h
+++ b/Eigen/src/QR/HouseholderQR_LAPACKE.h
@ -25,24 +25,22 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *    Householder QR decomposition of a matrix w/o pivoting based on
 *    LAPACKE_?geqrf function.
 ********************************************************************************
 */

-#ifndef EIGEN_QR_MKL_H
-#define EIGEN_QR_MKL_H
-
-#include "../Core/util/MKL_support.h"
+#ifndef EIGEN_QR_LAPACKE_H
+#define EIGEN_QR_LAPACKE_H

 namespace Eigen { 

 namespace internal {

-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */

-#define EIGEN_MKL_QR_NOPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_LAPACKE_QR_NOPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
 template<typename MatrixQR, typename HCoeffs> \
 struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
 { \
@ -53,18 +51,18 @@ struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
    lapack_int n = (lapack_int) mat.cols(); \
    lapack_int lda = (lapack_int) mat.outerStride(); \
    lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    LAPACKE_##MKLPREFIX##geqrf( matrix_order, m, n, (MKLTYPE*)mat.data(), lda, (MKLTYPE*)hCoeffs.data()); \
+    LAPACKE_##LAPACKE_PREFIX##geqrf( matrix_order, m, n, (LAPACKE_TYPE*)mat.data(), lda, (LAPACKE_TYPE*)hCoeffs.data()); \
    hCoeffs.adjointInPlace(); \
  } \
 };

-EIGEN_MKL_QR_NOPIV(double, double, d)
-EIGEN_MKL_QR_NOPIV(float, float, s)
-EIGEN_MKL_QR_NOPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_QR_NOPIV(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_QR_NOPIV(double, double, d)
+EIGEN_LAPACKE_QR_NOPIV(float, float, s)
+EIGEN_LAPACKE_QR_NOPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_QR_NOPIV(scomplex, lapack_complex_float, c)

 } // end namespace internal

 } // end namespace Eigen

-#endif // EIGEN_QR_MKL_H
+#endif // EIGEN_QR_LAPACKE_H
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@ -1052,7 +1052,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
  
  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
  RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
-  RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
+  RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
  RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
  
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@ -713,7 +713,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
        {
          finished = false;
          // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          // the complex to real operation returns true is the updated 2x2 block is not already diagonal
+          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
          if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry))
          {
            JacobiRotation<RealScalar> j_left, j_right;
@ -727,7 +727,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
            if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);

            // keep track of the largest diagonal coefficient
-            maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
+            maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
          }
        }
      }
@ -738,9 +738,22 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig

  for(Index i = 0; i < m_diagSize; ++i)
  {
-    RealScalar a = abs(m_workMatrix.coeff(i,i));
-    m_singularValues.coeffRef(i) = a;
-    if(computeU() && (a!=RealScalar(0))) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    // For a complex matrix, some diagonal coefficients might note have been
+    // treated by svd_precondition_2x2_block_to_be_real, and the imaginary part
+    // of some diagonal entry might not be null.
+    if(NumTraits<Scalar>::IsComplex && abs(numext::imag(m_workMatrix.coeff(i,i)))>considerAsZero)
+    {
+      RealScalar a = abs(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU()) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    }
+    else
+    {
+      // m_workMatrix.coeff(i,i) is already real, no difficulty:
+      RealScalar a = numext::real(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU() && (a<RealScalar(0))) m_matrixU.col(i) = -m_matrixU.col(i);
+    }
  }
  
  m_singularValues *= scale;
--- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h
+++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@ -25,21 +25,19 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
 *    Singular Value Decomposition - SVD.
 ********************************************************************************
 */

-#ifndef EIGEN_JACOBISVD_MKL_H
-#define EIGEN_JACOBISVD_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_JACOBISVD_LAPACKE_H
+#define EIGEN_JACOBISVD_LAPACKE_H

 namespace Eigen { 

-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */

-#define EIGEN_MKL_SVD(EIGTYPE, MKLTYPE, MKLRTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
 template<> inline \
 JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>& \
 JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) \
@ -52,41 +50,41 @@ JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPiv
  /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
  m_nonzeroSingularValues = m_diagSize; \
 \
-  lapack_int lda = matrix.outerStride(), ldu, ldvt; \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
  char jobu, jobvt; \
-  MKLTYPE *u, *vt, dummy; \
+  LAPACKE_TYPE *u, *vt, dummy; \
  jobu  = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
  jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
  if (computeU()) { \
-    ldu  = m_matrixU.outerStride(); \
-    u    = (MKLTYPE*)m_matrixU.data(); \
+    ldu  = internal::convert_index<lapack_int>(m_matrixU.outerStride()); \
+    u    = (LAPACKE_TYPE*)m_matrixU.data(); \
  } else { ldu=1; u=&dummy; }\
  MatrixType localV; \
-  ldvt = (m_computeFullV) ? m_cols : (m_computeThinV) ? m_diagSize : 1; \
+  ldvt = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
  if (computeV()) { \
    localV.resize(ldvt, m_cols); \
-    vt   = (MKLTYPE*)localV.data(); \
+    vt   = (LAPACKE_TYPE*)localV.data(); \
  } else { ldvt=1; vt=&dummy; }\
-  Matrix<MKLRTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
+  Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
  MatrixType m_temp; m_temp = matrix; \
-  LAPACKE_##MKLPREFIX##gesvd( matrix_order, jobu, jobvt, m_rows, m_cols, (MKLTYPE*)m_temp.data(), lda, (MKLRTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
+  LAPACKE_##LAPACKE_PREFIX##gesvd( matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(m_rows), internal::convert_index<lapack_int>(m_cols), (LAPACKE_TYPE*)m_temp.data(), lda, (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
  if (computeV()) m_matrixV = localV.adjoint(); \
 /* for(int i=0;i<m_diagSize;i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
  m_isInitialized = true; \
  return *this; \
 }

-EIGEN_MKL_SVD(double,   double,        double, d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(double,   double,                double, d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(float,    float,                 float , s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float,  float , c, ColMajor, LAPACK_COL_MAJOR)

-EIGEN_MKL_SVD(double,   double,        double, d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(double,   double,                double, d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(float,    float,                 float , s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float,  float , c, RowMajor, LAPACK_ROW_MAJOR)

 } // end namespace Eigen

-#endif // EIGEN_JACOBISVD_MKL_H
+#endif // EIGEN_JACOBISVD_LAPACKE_H
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@ -130,10 +130,9 @@ public:
  inline Index rank() const
  {
    using std::abs;
-    using std::max;
    eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
    if(m_singularValues.size()==0) return 0;
-    RealScalar premultiplied_threshold = (max)(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
+    RealScalar premultiplied_threshold = numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
    Index i = m_nonzeroSingularValues-1;
    while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
    return i+1;
--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@ -124,8 +124,8 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
 }

 // Generic Sparse to Sparse assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
 {
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
@ -134,8 +134,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse, Scalar>
 };

 // Generic Sparse to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense>
 {
  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
@ -156,7 +156,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Scalar>
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Sparse2Sparse, Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Sparse2Sparse>
 {
  typedef Solve<DecType,RhsType> SrcXprType;
  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
@ -169,10 +169,11 @@ struct Diagonal2Sparse {};

 template<> struct AssignmentKind<SparseShape,DiagonalShape> { typedef Diagonal2Sparse Kind; };

-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
 {
  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef typename DstXprType::Scalar Scalar;
  typedef Array<StorageIndex,Dynamic,1> ArrayXI;
  typedef Array<Scalar,Dynamic,1> ArrayXS;
  template<int Options>
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@ -504,6 +504,7 @@ template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::InnerVectorInnerIterator
 : public EvalIterator
 {
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
  const XprType& m_block;
  Index m_end;
 public:
@ -528,6 +529,7 @@ public:
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::OuterVectorInnerIterator
 {
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
  const unary_evaluator& m_eval;
  Index m_outerPos;
  Index m_innerIndex;
--- a/Eigen/src/SparseCore/SparseMap.h
+++ b/Eigen/src/SparseCore/SparseMap.h
@ -186,7 +186,7 @@ class SparseMapBase<Derived,WriteAccessors>
      Index end = Base::isCompressed() ? Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer];
      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
-      Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
+      StorageIndex* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
      const Index id = r - &Base::m_innerIndices[0];
      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
      return const_cast<Scalar*>(Base::m_values)[id];
--- a/Eigen/src/SparseCore/SparseProduct.h
+++ b/Eigen/src/SparseCore/SparseProduct.h
@ -45,7 +45,7 @@ struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>

  // dense += sparse * sparse
  template<typename Dest,typename ActualLhs>
-  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, int* = typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type(0) )
+  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
  {
    typedef typename nested_eval<ActualLhs,Dynamic>::type LhsNested;
    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
@ -57,7 +57,7 @@ struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>

  // dense -= sparse * sparse
  template<typename Dest>
-  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, int* = typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type(0) )
+  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
  {
    addTo(dst, -lhs, rhs);
  }
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@ -218,8 +218,8 @@ struct SparseSelfAdjoint2Sparse {};
 template<> struct AssignmentKind<SparseShape,SparseSelfAdjointShape> { typedef SparseSelfAdjoint2Sparse Kind; };
 template<> struct AssignmentKind<SparseSelfAdjointShape,SparseShape> { typedef Sparse2Sparse Kind; };

-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
 {
  typedef typename DstXprType::StorageIndex StorageIndex;
  template<typename DestScalar,int StorageOrder>
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@ -10,15 +10,16 @@
 #ifndef EIGEN_SUPERLUSUPPORT_H
 #define EIGEN_SUPERLUSUPPORT_H

-namespace Eigen { 
+namespace Eigen {

+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
 #define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
    extern "C" {                                                                                          \
      extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
                                char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
                                void *, int, SuperMatrix *, SuperMatrix *,                                \
                                FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
-                                mem_usage_t *, SuperLUStat_t *, int *);                           \
+                                GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *);                     \
    }                                                                                                     \
    inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
         int *perm_c, int *perm_r, int *etree, char *equed,                                               \
@ -28,12 +29,37 @@ namespace Eigen {
         FLOATTYPE *recip_pivot_growth,                                                                   \
         FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
         SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
-    mem_usage_t mem_usage;                                                                        \
+    mem_usage_t mem_usage;                                                                                \
+    GlobalLU_t gLU;                                                                                       \
+    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
+         U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
+         ferr, berr, &gLU, &mem_usage, stats, info);                                                      \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
+  }
+#else // version < 5.0
+#define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
+    extern "C" {                                                                                          \
+      extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
+                                char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
+                                void *, int, SuperMatrix *, SuperMatrix *,                                \
+                                FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
+                                mem_usage_t *, SuperLUStat_t *, int *);                                   \
+    }                                                                                                     \
+    inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
+         int *perm_c, int *perm_r, int *etree, char *equed,                                               \
+         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                                      \
+         SuperMatrix *U, void *work, int lwork,                                                           \
+         SuperMatrix *B, SuperMatrix *X,                                                                  \
+         FLOATTYPE *recip_pivot_growth,                                                                   \
+         FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
+         SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
+    mem_usage_t mem_usage;                                                                                \
    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
         U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
         ferr, berr, &mem_usage, stats, info);                                                            \
    return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
  }
+#endif

 DECL_GSSVX(s,float,float)
 DECL_GSSVX(c,float,std::complex<float>)
--- a/Eigen/src/misc/RealSvd2x2.h
+++ b/Eigen/src/misc/RealSvd2x2.h
@ -28,7 +28,8 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
  JacobiRotation<RealScalar> rot1;
  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(d == RealScalar(0))
+
+  if(abs(d) < (std::numeric_limits<RealScalar>::min)())
  {
    rot1.s() = RealScalar(0);
    rot1.c() = RealScalar(1);
--- a/Eigen/src/misc/lapacke.h
+++ b/Eigen/src/misc/lapacke.h
--- a/Eigen/src/misc/lapacke_mangling.h
+++ b/Eigen/src/misc/lapacke_mangling.h
@ -0,0 +1,17 @@
+#ifndef LAPACK_HEADER_INCLUDED
+#define LAPACK_HEADER_INCLUDED
+
+#ifndef LAPACK_GLOBAL
+#if defined(LAPACK_GLOBAL_PATTERN_LC) || defined(ADD_)
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#elif defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER)
+#define LAPACK_GLOBAL(lcname,UCNAME)  UCNAME
+#elif defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE)
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname
+#else
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#endif
+#endif
+
+#endif
+
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@ -329,6 +329,8 @@ operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 // NOTE disabled until we agree on argument order
 #if 0
 /** \cpp11 \returns an expression of the coefficient-wise polygamma function.
+  *
+  * \specialfunctions_module
  *
  * It returns the \a n -th derivative of the digamma(psi) evaluated at \c *this.
  *
@ -345,6 +347,8 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
 #endif

 /** \returns an expression of the coefficient-wise zeta function.
+  *
+  * \specialfunctions_module
  *
  * It returns the Riemann zeta function of two arguments \c *this and \a q:
  *
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@ -22,10 +22,6 @@ typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturn
 typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
 typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
 typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
-typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
-typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
-typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
-typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
 typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
 typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
 typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
@ -40,7 +36,7 @@ typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> IsFini
  * Example: \include Cwise_abs.cpp
  * Output: \verbinclude Cwise_abs.out
  *
-  * \sa abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs">Math functions</a>, abs2()
  */
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const AbsReturnType
@ -68,7 +64,7 @@ arg() const
  * Example: \include Cwise_abs2.cpp
  * Output: \verbinclude Cwise_abs2.out
  *
-  * \sa abs(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs2">Math functions</a>, abs(), square()
  */
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const Abs2ReturnType
@ -85,7 +81,7 @@ abs2() const
  * Example: \include Cwise_exp.cpp
  * Output: \verbinclude Cwise_exp.out
  *
-  * \sa pow(), log(), sin(), cos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, pow(), log(), sin(), cos()
  */
 EIGEN_DEVICE_FUNC
 inline const ExpReturnType
@ -102,7 +98,7 @@ exp() const
  * Example: \include Cwise_log.cpp
  * Output: \verbinclude Cwise_log.out
  *
-  * \sa exp()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, exp()
  */
 EIGEN_DEVICE_FUNC
 inline const LogReturnType
@ -116,7 +112,7 @@ log() const
  * In exact arithmetic, \c x.log() is equivalent to \c (x+1).log(),
  * however, with finite precision, this function is much more accurate when \c x is close to zero.
  *
-  * \sa log()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log1p">Math functions</a>, log()
  */
 EIGEN_DEVICE_FUNC
 inline const Log1pReturnType
@ -132,7 +128,7 @@ log1p() const
  * Example: \include Cwise_log10.cpp
  * Output: \verbinclude Cwise_log10.out
  *
-  * \sa log()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log10">Math functions</a>, log()
  */
 EIGEN_DEVICE_FUNC
 inline const Log10ReturnType
@ -149,7 +145,7 @@ log10() const
  * Example: \include Cwise_sqrt.cpp
  * Output: \verbinclude Cwise_sqrt.out
  *
-  * \sa pow(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square()
  */
 EIGEN_DEVICE_FUNC
 inline const SqrtReturnType
@ -199,7 +195,7 @@ sign() const
  * Example: \include Cwise_cos.cpp
  * Output: \verbinclude Cwise_cos.out
  *
-  * \sa sin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cos">Math functions</a>, sin(), acos()
  */
 EIGEN_DEVICE_FUNC
 inline const CosReturnType
@ -217,7 +213,7 @@ cos() const
  * Example: \include Cwise_sin.cpp
  * Output: \verbinclude Cwise_sin.out
  *
-  * \sa cos(), asin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sin">Math functions</a>, cos(), asin()
  */
 EIGEN_DEVICE_FUNC
 inline const SinReturnType
@ -231,7 +227,7 @@ sin() const
  * Example: \include Cwise_tan.cpp
  * Output: \verbinclude Cwise_tan.out
  *
-  * \sa cos(), sin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tan">Math functions</a>, cos(), sin()
  */
 EIGEN_DEVICE_FUNC
 inline const TanReturnType
@ -245,7 +241,7 @@ tan() const
  * Example: \include Cwise_atan.cpp
  * Output: \verbinclude Cwise_atan.out
  *
-  * \sa tan(), asin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atan">Math functions</a>, tan(), asin(), acos()
  */
 EIGEN_DEVICE_FUNC
 inline const AtanReturnType
@ -259,7 +255,7 @@ atan() const
  * Example: \include Cwise_acos.cpp
  * Output: \verbinclude Cwise_acos.out
  *
-  * \sa cos(), asin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acos">Math functions</a>, cos(), asin()
  */
 EIGEN_DEVICE_FUNC
 inline const AcosReturnType
@ -273,7 +269,7 @@ acos() const
  * Example: \include Cwise_asin.cpp
  * Output: \verbinclude Cwise_asin.out
  *
-  * \sa sin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asin">Math functions</a>, sin(), acos()
  */
 EIGEN_DEVICE_FUNC
 inline const AsinReturnType
@ -287,7 +283,7 @@ asin() const
  * Example: \include Cwise_tanh.cpp
  * Output: \verbinclude Cwise_tanh.out
  *
-  * \sa tan(), sinh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tanh">Math functions</a>, tan(), sinh(), cosh()
  */
 EIGEN_DEVICE_FUNC
 inline const TanhReturnType
@ -301,7 +297,7 @@ tanh() const
  * Example: \include Cwise_sinh.cpp
  * Output: \verbinclude Cwise_sinh.out
  *
-  * \sa sin(), tanh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sinh">Math functions</a>, sin(), tanh(), cosh()
  */
 EIGEN_DEVICE_FUNC
 inline const SinhReturnType
@ -315,7 +311,7 @@ sinh() const
  * Example: \include Cwise_cosh.cpp
  * Output: \verbinclude Cwise_cosh.out
  *
-  * \sa tan(), sinh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tan(), sinh(), cosh()
  */
 EIGEN_DEVICE_FUNC
 inline const CoshReturnType
@ -324,77 +320,6 @@ cosh() const
  return CoshReturnType(derived());
 }

-/** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
-  *
-  * Example: \include Cwise_lgamma.cpp
-  * Output: \verbinclude Cwise_lgamma.out
-  *
-  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
-  * type T to be supported.
-  *
-  * \sa digamma()
-  */
-EIGEN_DEVICE_FUNC
-inline const LgammaReturnType
-lgamma() const
-{
-  return LgammaReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
-  *
-  * \note This function supports only float and double scalar types. To support other scalar types,
-  * the user has to provide implementations of digamma(T) for any scalar
-  * type T to be supported.
-  *
-  * \sa Eigen::digamma(), Eigen::polygamma(), lgamma()
-  */
-EIGEN_DEVICE_FUNC
-inline const DigammaReturnType
-digamma() const
-{
-  return DigammaReturnType(derived());
-}
-
-/** \cpp11 \returns an expression of the coefficient-wise Gauss error
-  * function of *this.
-  *
-  * Example: \include Cwise_erf.cpp
-  * Output: \verbinclude Cwise_erf.out
-  *
-  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
-  * type T to be supported.
-  *
-  * \sa erfc()
-  */
-EIGEN_DEVICE_FUNC
-inline const ErfReturnType
-erf() const
-{
-  return ErfReturnType(derived());
-}
-
-/** \cpp11 \returns an expression of the coefficient-wise Complementary error
-  * function of *this.
-  *
-  * Example: \include Cwise_erfc.cpp
-  * Output: \verbinclude Cwise_erfc.out
-  *
-  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
-  * type T to be supported.
-  *
-  * \sa erf()
-  */
-EIGEN_DEVICE_FUNC
-inline const ErfcReturnType
-erfc() const
-{
-  return ErfcReturnType(derived());
-}
-
 /** \returns an expression of the coefficient-wise inverse of *this.
  *
  * Example: \include Cwise_inverse.cpp
@ -414,7 +339,7 @@ inverse() const
  * Example: \include Cwise_square.cpp
  * Output: \verbinclude Cwise_square.out
  *
-  * \sa operator/(), operator*(), abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_squareE">Math functions</a>, abs2(), cube(), pow()
  */
 EIGEN_DEVICE_FUNC
 inline const SquareReturnType
@ -428,7 +353,7 @@ square() const
  * Example: \include Cwise_cube.cpp
  * Output: \verbinclude Cwise_cube.out
  *
-  * \sa square(), pow()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cube">Math functions</a>, square(), pow()
  */
 EIGEN_DEVICE_FUNC
 inline const CubeReturnType
@ -442,7 +367,7 @@ cube() const
  * Example: \include Cwise_round.cpp
  * Output: \verbinclude Cwise_round.out
  *
-  * \sa ceil(), floor()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_round">Math functions</a>, ceil(), floor()
  */
 EIGEN_DEVICE_FUNC
 inline const RoundReturnType
@ -456,7 +381,7 @@ round() const
  * Example: \include Cwise_floor.cpp
  * Output: \verbinclude Cwise_floor.out
  *
-  * \sa ceil(), round()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_floor">Math functions</a>, ceil(), round()
  */
 EIGEN_DEVICE_FUNC
 inline const FloorReturnType
@ -470,7 +395,7 @@ floor() const
  * Example: \include Cwise_ceil.cpp
  * Output: \verbinclude Cwise_ceil.out
  *
-  * \sa floor(), round()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ceil">Math functions</a>, floor(), round()
  */
 EIGEN_DEVICE_FUNC
 inline const CeilReturnType
@ -538,3 +463,90 @@ operator!() const
                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
  return BooleanNotReturnType(derived());
 }
+
+
+// --- SpecialFunctions module ---
+
+typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
+typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+
+/** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_lgamma.cpp
+  * Output: \verbinclude Cwise_lgamma.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_lgamma">Math functions</a>, digamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const LgammaReturnType
+lgamma() const
+{
+  return LgammaReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types,
+  * the user has to provide implementations of digamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_digamma">Math functions</a>, Eigen::digamma(), Eigen::polygamma(), lgamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const DigammaReturnType
+digamma() const
+{
+  return DigammaReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Gauss error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_erf.cpp
+  * Output: \verbinclude Cwise_erf.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erf">Math functions</a>, erfc()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfReturnType
+erf() const
+{
+  return ErfReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_erfc.cpp
+  * Output: \verbinclude Cwise_erfc.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erfc">Math functions</a>, erf()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfcReturnType
+erfc() const
+{
+  return ErfcReturnType(derived());
+}
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h
@ -62,7 +62,7 @@ cast() const

 /** \returns an expression of the complex conjugate of \c *this.
  *
-  * \sa adjoint() */
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint() */
 EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
--- a/bench/benchCholesky.cpp
+++ b/bench/benchCholesky.cpp
@ -31,7 +31,7 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
  int rows = m.rows();
  int cols = m.cols();

-  int cost = 0;
+  double cost = 0;
  for (int j=0; j<rows; ++j)
  {
    int r = std::max(rows - j -1,0);
@ -78,10 +78,10 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
  else
    std::cout << "fixed ";
  std::cout << covMat.rows() << " \t"
-            << (timerNoSqrt.value() * REPEAT) / repeats << "s "
-            << "(" << 1e-6 * cost*repeats/timerNoSqrt.value() << " MFLOPS)\t"
-            << (timerSqrt.value() * REPEAT) / repeats << "s "
-            << "(" << 1e-6 * cost*repeats/timerSqrt.value() << " MFLOPS)\n";
+            << (timerNoSqrt.best()) / repeats << "s "
+            << "(" << 1e-9 * cost*repeats/timerNoSqrt.best() << " GFLOPS)\t"
+            << (timerSqrt.best()) / repeats << "s "
+            << "(" << 1e-9 * cost*repeats/timerSqrt.best() << " GFLOPS)\n";


  #ifdef BENCH_GSL
@ -119,13 +119,13 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)

 int main(int argc, char* argv[])
 {
-  const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,0};
-  std::cout << "size            no sqrt                           standard";
+  const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,1500,0};
+  std::cout << "size            LDLT                            LLT";
 //   #ifdef BENCH_GSL
 //   std::cout << "       GSL (standard + double + ATLAS)  ";
 //   #endif
  std::cout << "\n";
-  for (uint i=0; dynsizes[i]>0; ++i)
+  for (int i=0; dynsizes[i]>0; ++i)
    benchLLT(Matrix<Scalar,Dynamic,Dynamic>(dynsizes[i],dynsizes[i]));

  benchLLT(Matrix<Scalar,2,2>());
--- a/bench/dense_solvers.cpp
+++ b/bench/dense_solvers.cpp
@ -2,47 +2,74 @@
 #include "BenchTimer.h"
 #include <Eigen/Dense>
 #include <map>
+#include <vector>
 #include <string>
+#include <sstream>
 using namespace Eigen;

-std::map<std::string,Array<float,1,4> > results;
+std::map<std::string,Array<float,1,8,DontAlign|RowMajor> > results;
+std::vector<std::string> labels;
+std::vector<Array2i> sizes;
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute_norm_equation(Solver &solver, const MatrixType &A) {
+  if(A.rows()!=A.cols())
+    solver.compute(A.transpose()*A);
+  else
+    solver.compute(A);
+}
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute(Solver &solver, const MatrixType &A) {
+  solver.compute(A);
+}

 template<typename Scalar,int Size>
-void bench(int id, int size = Size)
+void bench(int id, int rows, int size = Size)
 {
-  typedef Matrix<Scalar,Size,Size> Mat;
-  Mat A(size,size);
+  typedef Matrix<Scalar,Dynamic,Size> Mat;
+  typedef Matrix<Scalar,Dynamic,Dynamic> MatDyn;
+  typedef Matrix<Scalar,Size,Size> MatSquare;
+  Mat A(rows,size);
  A.setRandom();
-  A = A*A.adjoint();
+  if(rows==size)
+    A = A*A.adjoint();
  BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd;
+
+  int svd_opt = ComputeThinU|ComputeThinV;
  
-  int tries = 3;
+  int tries = 5;
  int rep = 1000/size;
  if(rep==0) rep = 1;
 //   rep = rep*rep;
  
-  LLT<Mat> llt(A);
-  LDLT<Mat> ldlt(A);
-  PartialPivLU<Mat> lu(A);
-  FullPivLU<Mat> fplu(A);
-  HouseholderQR<Mat> qr(A);
-  ColPivHouseholderQR<Mat> cpqr(A);
-  CompleteOrthogonalDecomposition<Mat> cod(A);
-  FullPivHouseholderQR<Mat> fpqr(A);
-  JacobiSVD<Mat> jsvd(A.rows(),A.cols());
-  BDCSVD<Mat> bdcsvd(A.rows(),A.cols());
+  LLT<MatSquare> llt(size);
+  LDLT<MatSquare> ldlt(size);
+  PartialPivLU<MatSquare> lu(size);
+  FullPivLU<MatSquare> fplu(size,size);
+  HouseholderQR<Mat> qr(A.rows(),A.cols());
+  ColPivHouseholderQR<Mat> cpqr(A.rows(),A.cols());
+  CompleteOrthogonalDecomposition<Mat> cod(A.rows(),A.cols());
+  FullPivHouseholderQR<Mat> fpqr(A.rows(),A.cols());
+  JacobiSVD<MatDyn> jsvd(A.rows(),A.cols());
+  BDCSVD<MatDyn> bdcsvd(A.rows(),A.cols());
  
-  BENCH(t_llt, tries, rep, llt.compute(A));
-  BENCH(t_ldlt, tries, rep, ldlt.compute(A));
-  BENCH(t_lu, tries, rep, lu.compute(A));
-  BENCH(t_fplu, tries, rep, fplu.compute(A));
-  BENCH(t_qr, tries, rep, qr.compute(A));
-  BENCH(t_cpqr, tries, rep, cpqr.compute(A));
-  BENCH(t_cod, tries, rep, cod.compute(A));
-  BENCH(t_fpqr, tries, rep, fpqr.compute(A));
+  BENCH(t_llt, tries, rep, compute_norm_equation(llt,A));
+  BENCH(t_ldlt, tries, rep, compute_norm_equation(ldlt,A));
+  BENCH(t_lu, tries, rep, compute_norm_equation(lu,A));
+  if(size<=1000)
+    BENCH(t_fplu, tries, rep, compute_norm_equation(fplu,A));
+  BENCH(t_qr, tries, rep, compute(qr,A));
+  BENCH(t_cpqr, tries, rep, compute(cpqr,A));
+  BENCH(t_cod, tries, rep, compute(cod,A));
+  if(size*rows<=10000000)
+    BENCH(t_fpqr, tries, rep, compute(fpqr,A));
  if(size<500) // JacobiSVD is really too slow for too large matrices
-    BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV));
-  BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,ComputeFullU|ComputeFullV));
+    BENCH(t_jsvd, tries, rep, jsvd.compute(A,svd_opt));
+//   if(size*rows<=20000000)
+    BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,svd_opt));
  
  results["LLT"][id] = t_llt.best();
  results["LDLT"][id] = t_ldlt.best();
@ -52,33 +79,108 @@ void bench(int id, int size = Size)
  results["ColPivHouseholderQR"][id] = t_cpqr.best();
  results["CompleteOrthogonalDecomposition"][id] = t_cod.best();
  results["FullPivHouseholderQR"][id] = t_fpqr.best();
-  results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0;
+  results["JacobiSVD"][id] = t_jsvd.best();
  results["BDCSVD"][id] = t_bdcsvd.best();
 }

+
 int main()
 {
+  labels.push_back("LLT");
+  labels.push_back("LDLT");
+  labels.push_back("PartialPivLU");
+  labels.push_back("FullPivLU");
+  labels.push_back("HouseholderQR");
+  labels.push_back("ColPivHouseholderQR");
+  labels.push_back("CompleteOrthogonalDecomposition");
+  labels.push_back("FullPivHouseholderQR");
+  labels.push_back("JacobiSVD");
+  labels.push_back("BDCSVD");
+
+  for(int i=0; i<labels.size(); ++i)
+    results[labels[i]].fill(-1);
+
  const int small = 8;
-  const int medium = 100;
-  const int large = 1000;
-  const int xl = 4000;
-  
-  bench<float,small>(0);
-  bench<float,Dynamic>(1,medium);
-  bench<float,Dynamic>(2,large);
-  bench<float,Dynamic>(3,xl);
-  
-  IOFormat fmt(3, 0, " \t", "\n", "", "");
-  
-  std::cout << "solver/size                           " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
-  std::cout << "LLT                             (ms)  " << (results["LLT"]/1000.).format(fmt) << "\n";
-  std::cout << "LDLT                             (%)  " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "PartialPivLU                     (%)  " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "FullPivLU                        (%)  " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "HouseholderQR                    (%)  " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "ColPivHouseholderQR              (%)  " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "CompleteOrthogonalDecomposition  (%)  " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "FullPivHouseholderQR             (%)  " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "JacobiSVD                        (%)  " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "BDCSVD                           (%)  " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
+  sizes.push_back(Array2i(small,small));
+  sizes.push_back(Array2i(100,100));
+  sizes.push_back(Array2i(1000,1000));
+  sizes.push_back(Array2i(4000,4000));
+  sizes.push_back(Array2i(10000,small));
+  sizes.push_back(Array2i(10000,100));
+  sizes.push_back(Array2i(10000,1000));
+  sizes.push_back(Array2i(10000,4000));
+
+  using namespace std;
+
+  for(int k=0; k<sizes.size(); ++k)
+  {
+    cout << sizes[k](0) << "x" << sizes[k](1) << "...\n";
+    bench<float,Dynamic>(k,sizes[k](0),sizes[k](1));
+  }
+
+  cout.width(32);
+  cout << "solver/size";
+  cout << "  ";
+  for(int k=0; k<sizes.size(); ++k)
+  {
+    std::stringstream ss;
+    ss << sizes[k](0) << "x" << sizes[k](1);
+    cout.width(10); cout << ss.str(); cout << " ";
+  }
+  cout << endl;
+
+
+  for(int i=0; i<labels.size(); ++i)
+  {
+    cout.width(32); cout << labels[i]; cout << "  ";
+    ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+    for(int k=0; k<sizes.size(); ++k)
+    {
+      cout.width(10);
+      if(r(k)>=1e6)  cout << "-";
+      else           cout << r(k);
+      cout << " ";
+    }
+    cout << endl;
+  }
+
+  // HTML output
+  cout << "<table class=\"manual\">" << endl;
+  cout << "<tr><th>solver/size</th>" << endl;
+  for(int k=0; k<sizes.size(); ++k)
+    cout << "  <th>" << sizes[k](0) << "x" << sizes[k](1) << "</th>";
+  cout << "</tr>" << endl;
+  for(int i=0; i<labels.size(); ++i)
+  {
+    cout << "<tr";
+    if(i%2==1) cout << " class=\"alt\"";
+    cout << "><td>" << labels[i] << "</td>";
+    ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+    for(int k=0; k<sizes.size(); ++k)
+    {
+      if(r(k)>=1e6) cout << "<td>-</td>";
+      else
+      {
+        cout << "<td>" << r(k);
+        if(i>0)
+          cout << " (x" << numext::round(10.f*results[labels[i]](k)/results["LLT"](k))/10.f << ")";
+        if(i<4 && sizes[k](0)!=sizes[k](1))
+          cout << " <sup><a href=\"#note_ls\">*</a></sup>";
+        cout << "</td>";
+      }
+    }
+    cout << "</tr>" << endl;
+  }
+  cout << "</table>" << endl;
+
+//   cout << "LLT                             (ms)  " << (results["LLT"]*1000.).format(fmt) << "\n";
+//   cout << "LDLT                             (%)  " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "PartialPivLU                     (%)  " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "FullPivLU                        (%)  " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "HouseholderQR                    (%)  " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "ColPivHouseholderQR              (%)  " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "CompleteOrthogonalDecomposition  (%)  " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "FullPivHouseholderQR             (%)  " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "JacobiSVD                        (%)  " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "BDCSVD                           (%)  " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
 }
--- a/bench/perf_monitoring/gemm/changesets.txt
+++ b/bench/perf_monitoring/gemm/changesets.txt
@ -42,10 +42,20 @@ before-evaluators
 6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
 6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
 7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+7015:8aad8f35c955   # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
+7016:a58d253e8c91   # Polish lookup tables generation
+7018:9b27294a8186   # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
+7019:c758b1e2c073   # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
+7085:627e039fba68   # Bug 986: add support for coefficient-based product with 0 depth.
+7098:b6f1db9cf9ec   # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
 7591:09a8e2186610   # 3.3-alpha1
 7650:b0f3c8f43025   # help clang inlining
-8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+#8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
 8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
 8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
 8985:d935df21a082   # Remove the rotating kernel.
-
+8988:6c2dc56e73b3   # Bug 256: enable vectorization with unaligned loads/stores.
+9148:b8b8c421e36c   # Relax mixing-type constraints for binary coefficient-wise operators
+9174:d228bc282ac9   # merge
+9212:c90098affa7b   # Fix performance regression introduced in changeset 8aad8f35c955
+9213:9f1c14e4694b   # Fix performance regression in dgemm introduced by changeset 81d53c711775
--- a/bench/perf_monitoring/gemm/lazy_gemm.cpp
+++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp
@ -12,12 +12,13 @@ using namespace Eigen;
 typedef SCALAR Scalar;

 template<typename MatA, typename MatB, typename MatC>
-inline void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
+EIGEN_DONT_INLINE
+void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
 {
-  escape((void*)A.data());
-  escape((void*)B.data());
+//   escape((void*)A.data());
+//   escape((void*)B.data());
  C.noalias() += A.lazyProduct(B);
-  escape((void*)C.data());
+//   escape((void*)C.data());
 }

 template<int m, int n, int k, int TA>
--- a/bench/perf_monitoring/gemm/make_plot.sh
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@ -25,7 +25,7 @@ echo "set xtics rotate 1" >> $WHAT.gnuplot
 echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
 echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot

-col=`cat settings.txt | wc -l`
+col=`cat $bench"_settings.txt" | wc -l`
 echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
 echo " " >>  $WHAT.gnuplot

--- a/bench/perf_monitoring/gemm/run.sh
+++ b/bench/perf_monitoring/gemm/run.sh
@ -138,15 +138,15 @@ do
 done

 echo "Float:"
-cat $PREFIX"s"$bench.out"
-echo ""
+cat $PREFIX"s""$bench.out"
+echo " "

 echo "Double:"
-cat $PREFIX"d"$bench.out"
+cat $PREFIX"d""$bench.out"
 echo ""

 echo "Complex:"
-cat $PREFIX"c"$bench.out"
+cat $PREFIX"c""$bench.out"
 echo ""

 ./make_plot.sh $PREFIX"s"$bench $bench
--- a/cmake/FindSuperLU.cmake
+++ b/cmake/FindSuperLU.cmake
@ -17,7 +17,10 @@ find_path(SUPERLU_INCLUDES
  SRC
 )

-find_library(SUPERLU_LIBRARIES NAMES "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu" PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR} PATH_SUFFIXES lib)
+find_library(SUPERLU_LIBRARIES
+  NAMES "superlu_5.2.1" "superlu_5.2" "superlu_5.1.1" "superlu_5.1" "superlu_5.0" "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu"
+  PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR}
+  PATH_SUFFIXES lib)

 if(SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)

@ -48,11 +51,25 @@ int main() {
 }"
 SUPERLU_HAS_CLEAN_ENUMS)

-if(SUPERLU_HAS_CLEAN_ENUMS)
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <slu_util.h>
+int main(void)
+{
+  GlobalLU_t glu;
+  return 0;
+}"
+SUPERLU_HAS_GLOBALLU_T)
+
+if(SUPERLU_HAS_GLOBALLU_T)
+  # at least 5.0
+  set(SUPERLU_VERSION_VAR "5.0")
+elseif(SUPERLU_HAS_CLEAN_ENUMS)
  # at least 4.3
  set(SUPERLU_VERSION_VAR "4.3")
 elseif(SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
-  # at least 4.3
+  # at least 4.0
  set(SUPERLU_VERSION_VAR "4.0")
 else()
  set(SUPERLU_VERSION_VAR "3.0")
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@ -78,6 +78,8 @@ add_custom_target(
  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/html/
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/AsciiQuickReference.txt          ${CMAKE_CURRENT_BINARY_DIR}/html/
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
@ -88,6 +90,8 @@ add_custom_target(
  COMMAND ${CMAKE_COMMAND} -E make_directory ${Eigen_BINARY_DIR}/doc/html/unsupported
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
  WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc
 )

--- a/doc/CoeffwiseMathFunctionsTable.dox
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@ -3,11 +3,11 @@ namespace Eigen {
 /** \eigenManualPage CoeffwiseMathFunctions Catalog of coefficient-wise math functions


-<span style="font-size:300%; color:red; font-weight: 900;">!WORK IN PROGRESS!</span>
+<!-- <span style="font-size:300%; color:red; font-weight: 900;">!WORK IN PROGRESS!</span> -->

 This table presents a catalog of the coefficient-wise math functions supported by %Eigen.
 In this table, \c a, \c b, refer to Array objects or expressions, and \c m refers to a linear algebra Matrix/Vector object. Standard scalar types are abbreviated as follows:
-  - \c int: \c ui32
+  - \c int: \c i32
  - \c float: \c f
  - \c double: \c d
  - \c std::complex<float>: \c cf
@ -43,7 +43,35 @@ This also means that, unless specified, if the function \c std::foo is available
  using <a href="http://en.cppreference.com/w/cpp/numeric/math/fabs">std::abs</a>; \n
  abs(a[i]);
  </td>
-  <td>SSE2, AVX (ui32,f,d)</td>
+  <td>SSE2, AVX (i32,f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_inverse
+  a.\link ArrayBase::inverse inverse\endlink(); \n
+  \link Eigen::inverse inverse\endlink(a); \n
+  m.\link MatrixBase::cwiseInverse cwiseInverse\endlink();
+  </td>
+  <td>inverse value (\f$ 1/a_i \f$) </td>
+  <td class="code">
+  1/a[i];
+  </td>
+  <td>All engines (f,d,fc,fd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_conj
+  a.\link ArrayBase::conjugate conjugate\endlink(); \n
+  \link Eigen::conj conj\endlink(a); \n
+  m.\link MatrixBase::conjugate conjugate();
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Complex_conjugate">complex conjugate</a> (\f$ \bar{a_i} \f$),\n
+  no-op for real </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/complex/conj">std::conj</a>; \n
+  conj(a[i]);
+  </td>
+  <td>All engines (fc,fd)</td>
 </tr>
 <tr>
 <th colspan="4">Exponential functions</th>
@ -67,12 +95,12 @@ This also means that, unless specified, if the function \c std::foo is available
  a.\link ArrayBase::log log\endlink(); \n
  \link Eigen::log log\endlink(a);
  </td>
-  <td>natural (base \f$ e \f$) logarithm (\f$ ln({a_i}) \f$)</td>
+  <td>natural (base \f$ e \f$) logarithm (\f$ \ln({a_i}) \f$)</td>
  <td class="code">
  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log">std::log</a>; \n
  log(a[i]);
  </td>
-  <td>SSE2, AVX (f,d)</td>
+  <td>SSE2, AVX (f)</td>
 </tr>
 <tr>
  <td class="code">
@ -80,23 +108,298 @@ This also means that, unless specified, if the function \c std::foo is available
  a.\link ArrayBase::log1p log1p\endlink(); \n
  \link Eigen::log1p log1p\endlink(a);
  </td>
-  <td>natural (base \f$ e \f$) logarithm of 1 plus \n the given number (\f$ ln({1+a_i}) \f$)</td>
+  <td>natural (base \f$ e \f$) logarithm of 1 plus \n the given number (\f$ \ln({1+a_i}) \f$)</td>
  <td>built-in generic implementation based on \c log,\n
  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/log1p">\c std::log1p </a>; \cpp11</td>
  <td></td>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log10
+  a.\link ArrayBase::log10 log10\endlink(); \n
+  \link Eigen::log10 log10\endlink(a);
+  </td>
+  <td>base 10 logarithm (\f$ \log_{10}({a_i}) \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log10">std::log10</a>; \n
+  log10(a[i]);
+  </td>
+  <td></td>
+</tr>
 <tr>
 <th colspan="4">Power functions</th>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_pow
+  a.\link ArrayBase::pow pow\endlink(b); \n
+  \link Eigen::pow pow\endlink(a,b);
+  </td>
+  <td>raises a number to the given power (\f$ a_i ^ {b_i} \f$) \n \c a and \c b can be either an array or scalar.</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/pow">std::pow</a>; \n
+  pow(a[i],b[i]);\n
+  (plus builtin for integer types)</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sqrt
+  a.\link ArrayBase::sqrt sqrt\endlink(); \n
+  \link Eigen::sqrt sqrt\endlink(a);\n
+  m.\link MatrixBase::cwiseSqrt cwiseSqrt\endlink();
+  </td>
+  <td>computes square root (\f$ \sqrt a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  sqrt(a[i]);</td>
+  <td>SSE2, AVX (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_rsqrt
+  a.\link ArrayBase::rsqrt rsqrt\endlink(); \n
+  \link Eigen::rsqrt rsqrt\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Fast_inverse_square_root">reciprocal square root</a> (\f$ 1/{\sqrt a_i} \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  1/sqrt(a[i]); \n
+  </td>
+  <td>SSE2, AVX, AltiVec, ZVector (f,d)\n
+  (approx + 1 Newton iteration)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_square
+  a.\link ArrayBase::square square\endlink(); \n
+  \link Eigen::square square\endlink(a);
+  </td>
+  <td>computes square power (\f$ a_i^2 \f$)</td>
+  <td class="code">
+  a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cube
+  a.\link ArrayBase::cube cube\endlink(); \n
+  \link Eigen::cube cube\endlink(a);
+  </td>
+  <td>computes cubic power (\f$ a_i^3 \f$)</td>
+  <td class="code">
+  a[i]*a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_abs2
+  a.\link ArrayBase::abs2 abs2\endlink(); \n
+  \link Eigen::abs2 abs2\endlink(a);\n
+  m.\link MatrixBase::cwiseAbs2 cwiseAbs2\endlink();
+  </td>
+  <td>computes the squared absolute value (\f$ |a_i|^2 \f$)</td>
+  <td class="code">
+  real:    a[i]*a[i] \n
+  complex:  real(a[i])*real(a[i]) \n
+  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; + imag(a[i])*imag(a[i])</td>
+  <td>All (i32,f,d)</td>
+</tr>
 <tr>
 <th colspan="4">Trigonometric functions</th>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sin
+  a.\link ArrayBase::sin sin\endlink(); \n
+  \link Eigen::sin sin\endlink(a);
+  </td>
+  <td>computes sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sin">std::sin</a>; \n
+  sin(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cos
+  a.\link ArrayBase::cos cos\endlink(); \n
+  \link Eigen::cos cos\endlink(a);
+  </td>
+  <td>computes cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cos">std::cos</a>; \n
+  cos(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tan
+  a.\link ArrayBase::tan tan\endlink(); \n
+  \link Eigen::tan tan\endlink(a);
+  </td>
+  <td>computes tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tan">std::tan</a>; \n
+  tan(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_asin
+  a.\link ArrayBase::asin asin\endlink(); \n
+  \link Eigen::asin asin\endlink(a);
+  </td>
+  <td>computes arc sine (\f$ \sin^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/asin">std::asin</a>; \n
+  asin(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_acos
+  a.\link ArrayBase::acos acos\endlink(); \n
+  \link Eigen::acos acos\endlink(a);
+  </td>
+  <td>computes arc cosine  (\f$ \cos^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/acos">std::acos</a>; \n
+  acos(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_atan
+  a.\link ArrayBase::atan tan\endlink(); \n
+  \link Eigen::atan atan\endlink(a);
+  </td>
+  <td>computes arc tangent (\f$ \tan^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/atan">std::atan</a>; \n
+  atan(a[i]);</td>
+  <td></td>
+</tr>
 <tr>
 <th colspan="4">Hyperbolic functions</th>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sinh
+  a.\link ArrayBase::sinh sinh\endlink(); \n
+  \link Eigen::sinh sinh\endlink(a);
+  </td>
+  <td>computes hyperbolic sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sinh">std::sinh</a>; \n
+  sinh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cosh
+  a.\link ArrayBase::cosh cohs\endlink(); \n
+  \link Eigen::cosh cosh\endlink(a);
+  </td>
+  <td>computes hyperbolic cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cosh">std::cosh</a>; \n
+  cosh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tanh
+  a.\link ArrayBase::tanh tanh\endlink(); \n
+  \link Eigen::tanh tanh\endlink(a);
+  </td>
+  <td>computes hyperbolic tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tanh">std::tanh</a>; \n
+  tanh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Nearest integer floating point operations</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_ceil
+  a.\link ArrayBase::ceil ceil\endlink(); \n
+  \link Eigen::ceil ceil\endlink(a);
+  </td>
+  <td>nearest integer not less than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/ceil">std::ceil</a>; \n
+  ceil(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_floor
+  a.\link ArrayBase::floor floor\endlink(); \n
+  \link Eigen::floor floor\endlink(a);
+  </td>
+  <td>nearest integer not greater than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/floor">std::floor</a>; \n
+  floor(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_round
+  a.\link ArrayBase::round round\endlink(); \n
+  \link Eigen::round round\endlink(a);
+  </td>
+  <td>nearest integer, \n rounding away from zero in halfway cases</td>
+  <td>built-in generic implementation \n based on \c floor and \c ceil,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/round">\c std::round </a>; \cpp11</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+<th colspan="4">Floating point manipulation functions</th>
+</tr>
+<tr>
+<th colspan="4">Classification and comparison</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isfinite
+  a.\link ArrayBase::isfinite isfinite\endlink(); \n
+  \link Eigen::isfinite isfinite\endlink(a);
+  </td>
+  <td>checks if the given number has finite value</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isfinite">\c std::isfinite </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isinf
+  a.\link ArrayBase::isinf isinf\endlink(); \n
+  \link Eigen::isinf isinf\endlink(a);
+  </td>
+  <td>checks if the given number is infinite</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isinf">\c std::isinf </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isnan
+  a.\link ArrayBase::isnan isnan\endlink(); \n
+  \link Eigen::isnan isnan\endlink(a);
+  </td>
+  <td>checks if the given number is not a number</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isnan">\c std::isnan </a>; \cpp11</td>
+  <td></td>
+</tr>
 <tr>
 <th colspan="4">Error and gamma functions</th>
 </tr>
+<tr> <td colspan="4">  Require \c #include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
 <tr>
  <td class="code">
  \anchor cwisetable_erf
@ -124,24 +427,89 @@ This also means that, unless specified, if the function \c std::foo is available
  <td></td>
 </tr>
 <tr>
-<th colspan="4">Nearest integer floating point operations</th>
+  <td class="code">
+  \anchor cwisetable_lgamma
+  a.\link ArrayBase::lgamma lgamma\endlink(); \n
+  \link Eigen::lgamma lgamma\endlink(a);
+  </td>
+  <td>natural logarithm of the gamma function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/lgamma">std::lgamma</a>; \cpp11 \n
+  lgamma(a[i]);
+  </td>
+  <td></td>
 </tr>
 <tr>
-<th colspan="4">Floating point manipulation functions</th>
+  <td class="code">
+  \anchor cwisetable_digamma
+  a.\link ArrayBase::digamma digamma\endlink(); \n
+  \link Eigen::digamma digamma\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Digamma_function">logarithmic derivative of the gamma function</a></td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
 </tr>
 <tr>
-<th colspan="4">Classification and comparison</th>
+  <td class="code">
+  \anchor cwisetable_igamma
+  \link Eigen::igamma igamma\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">lower incomplete gamma integral</a>
+  \n \f$ \gamma(a_i,x_i)= \frac{1}{|a_i|} \int_{0}^{x_i}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
 </tr>
 <tr>
-<th colspan="4">Miscellaneous</th>
+  <td class="code">
+  \anchor cwisetable_igammac
+  \link Eigen::igammac igammac\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">upper incomplete gamma integral</a>
+  \n \f$ \Gamma(a_i,x_i) = \frac{1}{|a_i|} \int_{x_i}^{\infty}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Special functions</th>
+</tr>
+<tr> <td colspan="4">  Require \c #include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_polygamma
+  \link Eigen::polygamma polygamma\endlink(n,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Polygamma_function">n-th derivative of digamma at x</a></td>
+  <td>
+  built-in generic based on\n <a href="#cwisetable_lgamma">\c lgamma </a>,
+  <a href="#cwisetable_digamma"> \c digamma </a>
+  and <a href="#cwisetable_zeta">\c zeta </a>.
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_betainc
+  \link Eigen::betainc betainc\endlink(a,b,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function">Incomplete beta function</a></td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
 </tr>
 <tr>
  <td class="code">
  \anchor cwisetable_zeta
-  a.\link ArrayBase::zeta zeta\endlink(b); \n
  \link Eigen::zeta zeta\endlink(a,b);
  </td>
-  <td><a href="https://en.wikipedia.org/wiki/Hurwitz_zeta_function">Hurwitz zeta function</a> \n \f$ \zeta(a_i,b_i)=\sum_{k=0}^{\infty}(b_i+k)^{\text{-}a_i} \f$</td>
+  <td><a href="https://en.wikipedia.org/wiki/Hurwitz_zeta_function">Hurwitz zeta function</a>
+  \n \f$ \zeta(a_i,b_i)=\sum_{k=0}^{\infty}(b_i+k)^{\text{-}a_i} \f$</td>
  <td>
  built-in for float and double
  </td>
--- a/doc/CustomizingEigen.dox
+++ b/doc/CustomizingEigen.dox
@ -165,8 +165,7 @@ This other example adds support for the \c mpq_class type from <a href="https://
 #include <boost/operators.hpp>

 namespace Eigen {
-  template<class> struct NumTraits;
-  template<> struct NumTraits<mpq_class>
+  template<> struct NumTraits<mpq_class> : GenericNumTraits<mpq_class>
  {
    typedef mpq_class Real;
    typedef mpq_class NonInteger;
@ -174,6 +173,7 @@ namespace Eigen {

    static inline Real epsilon() { return 0; }
    static inline Real dummy_precision() { return 0; }
+    static inline Real digits10() { return 0; }

    enum {
      IsInteger = 0,
@ -187,31 +187,25 @@ namespace Eigen {
  };

  namespace internal {
-    template<>
-      struct significant_decimals_impl<mpq_class>
-      {
-	// Infinite precision when printing
-	static inline int run() { return 0; }
-      };

    template<> struct scalar_score_coeff_op<mpq_class> {
      struct result_type : boost::totally_ordered1<result_type> {
-	std::size_t len;
-	result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
-	result_type(mpq_class const& q) :
-	  len(mpz_size(q.get_num_mpz_t())+
-	      mpz_size(q.get_den_mpz_t())-1) {}
-	friend bool operator<(result_type x, result_type y) {
-	  // 0 is the worst possible pivot
-	  if (x.len == 0) return y.len > 0;
-	  if (y.len == 0) return false;
-	  // Prefer a pivot with a small representation
-	  return x.len > y.len;
-	}
-	friend bool operator==(result_type x, result_type y) {
-	  // Only used to test if the score is 0
-	  return x.len == y.len;
-	}
+        std::size_t len;
+        result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
+        result_type(mpq_class const& q) :
+          len(mpz_size(q.get_num_mpz_t())+
+              mpz_size(q.get_den_mpz_t())-1) {}
+        friend bool operator<(result_type x, result_type y) {
+          // 0 is the worst possible pivot
+          if (x.len == 0) return y.len > 0;
+          if (y.len == 0) return false;
+          // Prefer a pivot with a small representation
+          return x.len > y.len;
+        }
+        friend bool operator==(result_type x, result_type y) {
+          // Only used to test if the score is 0
+          return x.len == y.len;
+        }
      };
      result_type operator()(mpq_class const& x) const { return x; }
    };
--- a/doc/DenseDecompositionBenchmark.dox
+++ b/doc/DenseDecompositionBenchmark.dox
@ -0,0 +1,42 @@
+namespace Eigen {
+
+/** \eigenManualPage DenseDecompositionBenchmark Benchmark of dense decompositions
+
+This page presents a speed comparison of the dense matrix decompositions offered by %Eigen for a wide range of square matrices and overconstrained problems.
+
+For a more general overview on the features and numerical robustness of linear solvers and decompositions, check this \link TopicLinearAlgebraDecompositions table \endlink.
+
+This benchmark has been run on a laptop equipped with an Intel core i7 \@ 2,6 GHz, and compiled with clang with \b AVX and \b FMA instruction sets enabled but without multi-threading.
+It uses \b single \b precision \b float numbers. For double, you can get a good estimate by multiplying the timings by a factor 2.
+
+The square matrices are symmetric, and for the overconstrained matrices, the reported timmings include the cost to compute the symmetric covariance matrix \f$ A^T A \f$ for the first four solvers based on Cholesky and LU, as denoted by the \b * symbol (top-right corner part of the table).
+Timings are in \b milliseconds, and factors are relative to the LLT decomposition which is the fastest but also the least general and robust.
+
+<table class="manual">
+<tr><th>solver/size</th>
+  <th>8x8</th>  <th>100x100</th>  <th>1000x1000</th>  <th>4000x4000</th>  <th>10000x8</th>  <th>10000x100</th>  <th>10000x1000</th>  <th>10000x4000</th></tr>
+<tr><td>LLT</td><td>0.05</td><td>0.42</td><td>5.83</td><td>374.55</td><td>6.79 <sup><a href="#note_ls">*</a></sup></td><td>30.15 <sup><a href="#note_ls">*</a></sup></td><td>236.34 <sup><a href="#note_ls">*</a></sup></td><td>3847.17 <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>LDLT</td><td>0.07 (x1.3)</td><td>0.65 (x1.5)</td><td>26.86 (x4.6)</td><td>2361.18 (x6.3)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.91 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>252.61 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>5807.66 (x1.5) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr><td>PartialPivLU</td><td>0.08 (x1.5)</td><td>0.69 (x1.6)</td><td>15.63 (x2.7)</td><td>709.32 (x1.9)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.32 (x1) <sup><a href="#note_ls">*</a></sup></td><td>241.68 (x1) <sup><a href="#note_ls">*</a></sup></td><td>4270.48 (x1.1) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>FullPivLU</td><td>0.1 (x1.9)</td><td>4.48 (x10.6)</td><td>281.33 (x48.2)</td><td>-</td><td>6.83 (x1) <sup><a href="#note_ls">*</a></sup></td><td>32.67 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>498.25 (x2.1) <sup><a href="#note_ls">*</a></sup></td><td>-</td></tr>
+<tr><td>HouseholderQR</td><td>0.19 (x3.5)</td><td>2.18 (x5.2)</td><td>23.42 (x4)</td><td>1337.52 (x3.6)</td><td>34.26 (x5)</td><td>129.01 (x4.3)</td><td>377.37 (x1.6)</td><td>4839.1 (x1.3)</td></tr>
+<tr class="alt"><td>ColPivHouseholderQR</td><td>0.23 (x4.3)</td><td>2.23 (x5.3)</td><td>103.34 (x17.7)</td><td>9987.16 (x26.7)</td><td>36.05 (x5.3)</td><td>163.18 (x5.4)</td><td>2354.08 (x10)</td><td>37860.5 (x9.8)</td></tr>
+<tr><td>CompleteOrthogonalDecomposition</td><td>0.23 (x4.3)</td><td>2.22 (x5.2)</td><td>99.44 (x17.1)</td><td>10555.3 (x28.2)</td><td>35.75 (x5.3)</td><td>169.39 (x5.6)</td><td>2150.56 (x9.1)</td><td>36981.8 (x9.6)</td></tr>
+<tr class="alt"><td>FullPivHouseholderQR</td><td>0.23 (x4.3)</td><td>4.64 (x11)</td><td>289.1 (x49.6)</td><td>-</td><td>69.38 (x10.2)</td><td>446.73 (x14.8)</td><td>4852.12 (x20.5)</td><td>-</td></tr>
+<tr><td>JacobiSVD</td><td>1.01 (x18.6)</td><td>71.43 (x168.4)</td><td>-</td><td>-</td><td>113.81 (x16.7)</td><td>1179.66 (x39.1)</td><td>-</td><td>-</td></tr>
+<tr class="alt"><td>BDCSVD</td><td>1.07 (x19.7)</td><td>21.83 (x51.5)</td><td>331.77 (x56.9)</td><td>18587.9 (x49.6)</td><td>110.53 (x16.3)</td><td>397.67 (x13.2)</td><td>2975 (x12.6)</td><td>48593.2 (x12.6)</td></tr>
+</table>
+
+<a name="note_ls">\b *: </a> This decomposition do not support direct least-square solving for over-constrained problems, and the reported timing include the cost to form the symmetric covariance matrix \f$ A^T A \f$.
+
+\b Observations:
+ + LLT is always the fastest solvers.
+ + For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
+ + For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
+ + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
+
+The above table has been generated by the <a href="https://bitbucket.org/eigen/eigen/raw/default/bench/dense_solvers.cpp">bench/dense_solvers.cpp</a> file, feel-free to hack it to generate a table matching your hardware, compiler, and favorite problem sizes.
+
+*/
+
+}
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@ -216,6 +216,7 @@ ALIASES                = "only_for_vectors=This is only for vectors (either row-
                         "lu_module=This is defined in the %LU module. \code #include <Eigen/LU> \endcode" \
                         "qr_module=This is defined in the %QR module. \code #include <Eigen/QR> \endcode" \
                         "svd_module=This is defined in the %SVD module. \code #include <Eigen/SVD> \endcode" \
+                         "specialfunctions_module=This is defined in the \b unsupported SpecialFunctions module. \code #include <Eigen/SpecialFunctions> \endcode" \
                         "label=\bug" \
                         "matrixworld=<a href='#matrixonly' style='color:green;text-decoration: none;'>*</a>" \
                         "arrayworld=<a href='#arrayonly' style='color:blue;text-decoration: none;'>*</a>" \
--- a/doc/InplaceDecomposition.dox
+++ b/doc/InplaceDecomposition.dox
@ -0,0 +1,115 @@
+namespace Eigen {
+
+/** \eigenManualPage InplaceDecomposition Inplace matrix decompositions
+
+Starting from %Eigen 3.3, the LU, Cholesky, and QR decompositions can operate \em inplace, that is, directly within the given input matrix.
+This feature is especially useful when dealing with huge matrices, and or when the available memory is very limited (embedded systems).
+
+To this end, the respective decomposition class must be instantiated with a Ref<> matrix type, and the decomposition object must be constructed with the input matrix as argument. As an example, let us consider an inplace LU decomposition with partial pivoting.
+
+Let's start with the basic inclusions, and declaration of a 2x2 matrix \c A:
+
+<table class="example">
+<tr><th>code</th><th>output</th></tr>
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp init
+  </td>
+  <td>\snippet TutorialInplaceLU.out init
+  </td>
+</tr>
+</table>
+
+No surprise here! Then, let's declare our inplace LU object \c lu, and check the content of the matrix \c A:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp declaration
+  </td>
+  <td>\snippet TutorialInplaceLU.out declaration
+  </td>
+</tr>
+</table>
+
+Here, the \c lu object computes and stores the \c L and \c U factors within the memory held by the matrix \c A.
+The coefficients of \c A have thus been destroyed during the factorization, and replaced by the L and U factors as one can verify:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp matrixLU
+  </td>
+  <td>\snippet TutorialInplaceLU.out matrixLU
+  </td>
+</tr>
+</table>
+
+Then, one can use the \c lu object as usual, for instance to solve the Ax=b problem:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp solve
+  </td>
+  <td>\snippet TutorialInplaceLU.out solve
+  </td>
+</tr>
+</table>
+
+Here, since the content of the original matrix \c A has been lost, we had to declared a new matrix \c A0 to verify the result.
+
+Since the memory is shared between \c A and \c lu, modifying the matrix \c A will make \c lu invalid.
+This can easily be verified by modifying the content of \c A and trying to solve the initial problem again:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp modifyA
+  </td>
+  <td>\snippet TutorialInplaceLU.out modifyA
+  </td>
+</tr>
+</table>
+
+Note that there is no shared pointer under the hood, it is the \b responsibility \b of \b the \b user to keep the input matrix \c A in life as long as \c lu is living.
+
+If one wants to update the factorization with the modified A, one has to call the compute method as usual:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute
+  </td>
+</tr>
+</table>
+
+Note that calling compute does not change the memory which is referenced by the \c lu object. Therefore, if the compute method is called with another matrix \c A1 different than \c A, then the content of \c A1 won't be modified. This is still the content of \c A that will be used to store the L and U factors of the matrix \c A1.
+This can easily be verified as follows:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis0
+ </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis0
+ </td>
+</tr>
+</table>
+The matrix \c A1 is unchanged, and one can thus solve A1*x=b, and directly check the residual without any copy of \c A1:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis1
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis1
+ </td>
+</tr>
+</table>
+
+
+Here is the list of matrix decompositions supporting this inplace mechanism:
+
+- class LLT
+- class LDLT
+- class PartialPivLU
+- class FullPivLU
+- class HouseholderQR
+- class ColPivHouseholderQR
+- class FullPivHouseholderQR
+- class CompleteOrthogonalDecomposition
+
+*/
+
+}
--- a/doc/Manual.dox
+++ b/doc/Manual.dox
@ -10,6 +10,7 @@ namespace Eigen {
  - \subpage TopicAssertions
  - \subpage TopicCustomizingEigen
  - \subpage TopicMultiThreading
+  - \subpage TopicUsingBlasLapack
  - \subpage TopicUsingIntelMKL
  - \subpage TopicCUDA
  - \subpage TopicPitfalls
@ -106,6 +107,10 @@ namespace Eigen {
    \ingroup DenseLinearSolvers_chapter */
 /** \addtogroup LeastSquares 
    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup InplaceDecomposition
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup DenseDecompositionBenchmark
+    \ingroup DenseLinearSolvers_chapter */

 /** \addtogroup DenseLinearSolvers_Reference
    \ingroup DenseLinearSolvers_chapter */
--- a/doc/MatrixfreeSolverExample.dox
+++ b/doc/MatrixfreeSolverExample.dox
@ -6,12 +6,12 @@ namespace Eigen {
 \eigenManualPage MatrixfreeSolverExample Matrix-free solvers

 Iterative solvers such as ConjugateGradient and BiCGSTAB can be used in a matrix free context. To this end, user must provide a wrapper class inheriting EigenBase<> and implementing the following methods:
- - Index rows() and Index cols(): returns number of rows and columns respectively
- - operator* with and %Eigen dense column vector (its actual implementation goes in a specialization of the internal::generic_product_impl class)
+ - \c Index \c rows() and \c Index \c cols(): returns number of rows and columns respectively
+ - \c operator* with your type and an %Eigen dense column vector (its actual implementation goes in a specialization of the internal::generic_product_impl class)

-Eigen::internal::traits<> must also be specialized for the wrapper type.
+\c Eigen::internal::traits<> must also be specialized for the wrapper type.

-Here is a complete example wrapping a Eigen::SparseMatrix:
+Here is a complete example wrapping an Eigen::SparseMatrix:
 \include matrixfree_cg.cpp
 Output: \verbinclude matrixfree_cg.out

--- a/doc/SparseLinearSystems.dox
+++ b/doc/SparseLinearSystems.dox
@ -76,6 +76,9 @@ They are summarized in the following tables:
 <tr><td>SPQR</td><td>\link SPQRSupport_Module SPQRSupport \endlink  </td> <td> QR factorization </td> 
    <td> Any, rectangular</td><td>fill-in reducing, multithreaded, fast dense algebra</td>
    <td> requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
+<tr><td>PardisoLLT \n PardisoLDLT \n PardisoLU</td><td>\link PardisoSupport_Module PardisoSupport \endlink</td><td>Direct LLt, LDLt, LU factorizations</td><td>SPD \n SPD \n Square</td><td>Fill-in reducing, Leverage fast dense algebra, Multithreading</td>
+    <td>Requires the <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel MKL</a> package, \b Proprietary </td>
+    <td>optimized for tough problems patterns, see also \link TopicUsingIntelMKL using MKL with Eigen \endlink</td></tr>
 </table>

 Here \c SPD means symmetric positive definite.
--- a/doc/TopicLinearAlgebraDecompositions.dox
+++ b/doc/TopicLinearAlgebraDecompositions.dox
@ -4,6 +4,7 @@ namespace Eigen {

 This page presents a catalogue of the dense matrix decompositions offered by Eigen.
 For an introduction on linear solvers and decompositions, check this \link TutorialLinearAlgebra page \endlink.
+To get an overview of the true relative speed of the different decomposition, check this \link DenseDecompositionBenchmark benchmark \endlink.

 \section TopicLinAlgBigTable Catalogue of decompositions offered by Eigen

@ -256,6 +257,7 @@ For an introduction on linear solvers and decompositions, check this \link Tutor
    <dd></dd>
 </dl>

+
 */

 }
--- a/doc/UsingBlasLapackBackends.dox
+++ b/doc/UsingBlasLapackBackends.dox
@ -0,0 +1,133 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2011-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Documentation on the use of BLAS/LAPACK libraries through Eigen
+ ********************************************************************************
+*/
+
+namespace Eigen {
+
+/** \page TopicUsingBlasLapack Using BLAS/LAPACK from %Eigen
+
+
+Since %Eigen version 3.3 and later, any F77 compatible BLAS or LAPACK libraries can be used as backends for dense matrix products and dense matrix decompositions.
+For instance, one can use <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel® MKL</a>, Apple's Accelerate framework on OSX, <a href="http://www.openblas.net/">OpenBLAS</a>, <a href="http://www.netlib.org/lapack">Netlib LAPACK</a>, etc.
+
+Do not miss this \link TopicUsingIntelMKL page \endlink for further discussions on the specific use of Intel® MKL (also includes VML, PARDISO, etc.)
+
+In order to use an external BLAS and/or LAPACK library, you must link you own application to the respective libraries and their dependencies.
+For LAPACK, you must also link to the standard <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> library, which is used as a convenient think layer between %Eigen's C++ code and LAPACK F77 interface. Then you must activate their usage by defining one or multiple of the following macros (\b before including any %Eigen's header):
+
+\note For Mac users, in order to use the lapack version shipped with the Accelerate framework, you also need the lapacke library.
+Using <a href="https://www.macports.org/">MacPorts</a>, this is as easy as:
+\code
+sudo port install lapack
+\endcode
+and then use the following link flags: \c -framework \c Accelerate \c /opt/local/lib/lapack/liblapacke.dylib
+
+<table class="manual">
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface)</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack (compatible with any F77 LAPACK interface)</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower numerical robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+</table>
+
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to BLAS or LAPACK routines.
+These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
+Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
+
+The breadth of %Eigen functionality that can be substituted is listed in the table below.
+<table class="manual">
+<tr><th>Functional domain</th><th>Code example</th><th>BLAS/LAPACK routines</th></tr>
+<tr><td>Matrix-matrix operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1*m2.transpose();
+m1.selfadjointView<Lower>()*m2;
+m1*m2.triangularView<Upper>();
+m1.selfadjointView<Lower>().rankUpdate(m2,1.0);
+\endcode</td><td>\code
+?gemm
+?symm/?hemm
+?trmm
+dsyrk/ssyrk
+\endcode</td></tr>
+<tr class="alt"><td>Matrix-vector operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1.adjoint()*b;
+m1.selfadjointView<Lower>()*b;
+m1.triangularView<Upper>()*b;
+\endcode</td><td>\code
+?gemv
+?symv/?hemv
+?trmv
+\endcode</td></tr>
+<tr><td>LU decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m1.lu().solve(v2);
+\endcode</td><td>\code
+?getrf
+\endcode</td></tr>
+<tr class="alt"><td>Cholesky decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m2.selfadjointView<Upper>().llt().solve(v2);
+\endcode</td><td>\code
+?potrf
+\endcode</td></tr>
+<tr><td>QR decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+m1.householderQr();
+m1.colPivHouseholderQr();
+\endcode</td><td>\code
+?geqrf
+?geqp3
+\endcode</td></tr>
+<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE </td><td>\code
+JacobiSVD<MatrixXd> svd;
+svd.compute(m1, ComputeThinV);
+\endcode</td><td>\code
+?gesvd
+\endcode</td></tr>
+<tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+EigenSolver<MatrixXd> es(m1);
+ComplexEigenSolver<MatrixXcd> ces(m1);
+SelfAdjointEigenSolver<MatrixXd> saes(m1+m1.transpose());
+GeneralizedSelfAdjointEigenSolver<MatrixXd>
+    gsaes(m1+m1.transpose(),m2+m2.transpose());
+\endcode</td><td>\code
+?gees
+?gees
+?syev/?heev
+?syev/?heev,
+?potrf
+\endcode</td></tr>
+<tr class="alt"><td>Schur decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+RealSchur<MatrixXd> schurR(m1);
+ComplexSchur<MatrixXcd> schurC(m1);
+\endcode</td><td>\code
+?gees
+\endcode</td></tr>
+</table>
+In the examples, m1 and m2 are dense matrices and v1 and v2 are dense vectors.
+
+*/
+
+}
--- a/doc/UsingIntelMKL.dox
+++ b/doc/UsingIntelMKL.dox
@ -32,107 +32,45 @@

 namespace Eigen {

-/** \page TopicUsingIntelMKL Using Intel® Math Kernel Library from Eigen
+/** \page TopicUsingIntelMKL Using Intel® MKL from %Eigen

-\section TopicUsingIntelMKL_Intro Eigen and Intel® Math Kernel Library (Intel® MKL)
+<!-- \section TopicUsingIntelMKL_Intro Eigen and Intel® Math Kernel Library (Intel® MKL) -->
+
+Since %Eigen version 3.1 and later, users can benefit from built-in Intel® Math Kernel Library (MKL) optimizations with an installed copy of Intel MKL 10.3 (or later).

-Since Eigen version 3.1 and later, users can benefit from built-in Intel MKL optimizations with an installed copy of Intel MKL 10.3 (or later).
 <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php"> Intel MKL </a> provides highly optimized multi-threaded mathematical routines for x86-compatible architectures.
 Intel MKL is available on Linux, Mac and Windows for both Intel64 and IA32 architectures.

 \note
 Intel® MKL is a proprietary software and it is the responsibility of users to buy or register for community (free) Intel MKL licenses for their products. Moreover, the license of the user product has to allow linking to proprietary software that excludes any unmodified versions of the GPL.

-Using Intel MKL through Eigen is easy:
-# define the \c EIGEN_USE_MKL_ALL macro before including any Eigen's header
+Using Intel MKL through %Eigen is easy:
+-# define the \c EIGEN_USE_MKL_ALL macro before including any %Eigen's header
 -# link your program to MKL libraries (see the <a href="http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/">MKL linking advisor</a>)
 -# on a 64bits system, you must use the LP64 interface (not the ILP64 one)

-When doing so, a number of Eigen's algorithms are silently substituted with calls to Intel MKL routines.
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to Intel MKL routines.
 These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
 Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.

 In addition you can choose which parts will be substituted by defining one or multiple of the following macros:

 <table class="manual">
-<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface, not only Intel MKL)</td></tr>
-<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Intel Lapacke</a> C interface to Lapack (currently works with Intel MKL only)</td></tr>
-<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithm of lower robustness are disabled. This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithm of lower robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
 <tr class="alt"><td>\c EIGEN_USE_MKL_VML </td><td>Enables the use of Intel VML (vector operations)</td></tr>
 <tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
 </table>

+Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
+
 Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.

-
-\section TopicUsingIntelMKL_SupportedFeatures List of supported features
-
-The breadth of Eigen functionality covered by Intel MKL is listed in the table below.
+The following table summarizes the list of functions covered by \c EIGEN_USE_MKL_VML:
 <table class="manual">
-<tr><th>Functional domain</th><th>Code example</th><th>MKL routines</th></tr>
-<tr><td>Matrix-matrix operations \n \c EIGEN_USE_BLAS </td><td>\code
-m1*m2.transpose();
-m1.selfadjointView<Lower>()*m2;
-m1*m2.triangularView<Upper>();
-m1.selfadjointView<Lower>().rankUpdate(m2,1.0);
-\endcode</td><td>\code
-?gemm
-?symm/?hemm
-?trmm
-dsyrk/ssyrk
-\endcode</td></tr>
-<tr class="alt"><td>Matrix-vector operations \n \c EIGEN_USE_BLAS </td><td>\code
-m1.adjoint()*b;
-m1.selfadjointView<Lower>()*b;
-m1.triangularView<Upper>()*b;
-\endcode</td><td>\code
-?gemv
-?symv/?hemv
-?trmv
-\endcode</td></tr>
-<tr><td>LU decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-v1 = m1.lu().solve(v2);
-\endcode</td><td>\code
-?getrf
-\endcode</td></tr>
-<tr class="alt"><td>Cholesky decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-v1 = m2.selfadjointView<Upper>().llt().solve(v2);
-\endcode</td><td>\code
-?potrf
-\endcode</td></tr>
-<tr><td>QR decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-m1.householderQr();
-m1.colPivHouseholderQr();
-\endcode</td><td>\code
-?geqrf
-?geqp3
-\endcode</td></tr>
-<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE </td><td>\code
-JacobiSVD<MatrixXd> svd;
-svd.compute(m1, ComputeThinV);
-\endcode</td><td>\code
-?gesvd
-\endcode</td></tr>
-<tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-EigenSolver<MatrixXd> es(m1);
-ComplexEigenSolver<MatrixXcd> ces(m1);
-SelfAdjointEigenSolver<MatrixXd> saes(m1+m1.transpose());
-GeneralizedSelfAdjointEigenSolver<MatrixXd>
-    gsaes(m1+m1.transpose(),m2+m2.transpose());
-\endcode</td><td>\code
-?gees
-?gees
-?syev/?heev
-?syev/?heev,
-?potrf
-\endcode</td></tr>
-<tr class="alt"><td>Schur decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-RealSchur<MatrixXd> schurR(m1);
-ComplexSchur<MatrixXcd> schurC(m1);
-\endcode</td><td>\code
-?gees
-\endcode</td></tr>
-<tr><td>Vector Math \n \c EIGEN_USE_MKL_VML </td><td>\code
+<tr><th>Code example</th><th>MKL routines</th></tr>
+<tr><td>\code
 v2=v1.array().sin();
 v2=v1.array().asin();
 v2=v1.array().cos();
@ -156,7 +94,7 @@ v?Sqr
 v?Powx
 \endcode</td></tr>
 </table>
-In the examples, m1 and m2 are dense matrices and v1 and v2 are dense vectors.
+In the examples, v1 and v2 are dense vectors.


 \section TopicUsingIntelMKL_Links Links
--- a/doc/examples/Cwise_erf.cpp
+++ b/doc/examples/Cwise_erf.cpp
@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);
+  std::cout << v.erf() << std::endl;
+}
--- a/doc/examples/Cwise_erfc.cpp
+++ b/doc/examples/Cwise_erfc.cpp
@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);
+  std::cout << v.erfc() << std::endl;
+}
--- a/doc/examples/Cwise_lgamma.cpp
+++ b/doc/examples/Cwise_lgamma.cpp
@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(0.5,10,0,-1);
+  std::cout << v.lgamma() << std::endl;
+}
--- a/Show More
+++ b/Show More