diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d37f94f7..9a95b684b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-project(Eigen)
+project(Eigen3)
 
 cmake_minimum_required(VERSION 2.8.5)
 
@@ -8,6 +8,11 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
   message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()
 
+# Alias Eigen_*_DIR to Eigen3_*_DIR:
+
+set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
+set(Eigen_BINARY_DIR ${Eigen3_BINARY_DIR})
+
 # guard against bad build-type strings
 
 if (NOT CMAKE_BUILD_TYPE)
@@ -93,9 +98,11 @@ else()
 endif()
 
 option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
-if(NOT WIN32)
+
+# Disable pkgconfig only for native Windows builds
+if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
   option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
-endif(NOT WIN32)
+endif()
 
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
@@ -120,7 +127,7 @@ endmacro(ei_add_cxx_compiler_flag)
 if(NOT MSVC)
   # We assume that other compilers are partly compatible with GNUCC
 
-  # clang outputs some warnings for unknwon flags that are not caught by check_cxx_compiler_flag
+  # clang outputs some warnings for unknown flags that are not caught by check_cxx_compiler_flag
   # adding -Werror turns such warnings into errors
   check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
   if(COMPILER_SUPPORT_WERROR)
@@ -142,8 +149,11 @@ if(NOT MSVC)
   ei_add_cxx_compiler_flag("-Wwrite-strings")
   ei_add_cxx_compiler_flag("-Wformat-security")
   ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
+  ei_add_cxx_compiler_flag("-Wlogical-op")
   ei_add_cxx_compiler_flag("-Wenum-conversion")
   ei_add_cxx_compiler_flag("-Wc++11-extensions")
+  ei_add_cxx_compiler_flag("-Wdouble-promotion")
+#  ei_add_cxx_compiler_flag("-Wconversion")
   
   # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
   # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
@@ -159,7 +169,7 @@ if(NOT MSVC)
   ei_add_cxx_compiler_flag("-fno-common")
   ei_add_cxx_compiler_flag("-fstrict-aliasing")
   ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
-  ei_add_cxx_compiler_flag("-wd2304")                   # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
+  ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
   
   
   # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
@@ -402,7 +412,7 @@ if(EIGEN_BUILD_PKGCONFIG)
     install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
         DESTINATION ${PKGCONFIG_INSTALL_DIR}
         )
-endif(EIGEN_BUILD_PKGCONFIG)
+endif()
 
 add_subdirectory(Eigen)
 
diff --git a/Eigen/CMakeLists.txt b/Eigen/CMakeLists.txt
index a92dd6f6c..9eb502b79 100644
--- a/Eigen/CMakeLists.txt
+++ b/Eigen/CMakeLists.txt
@@ -16,4 +16,4 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
   )
 
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index 705a04cc4..369d1f5ec 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -31,7 +31,8 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Cholesky/LLT_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/Cholesky/LLT_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/Core b/Eigen/Core
index d67cb67af..3fabc5a43 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -164,6 +164,7 @@
       #if EIGEN_COMP_ICC >= 1110
         #include <immintrin.h>
       #else
+        #include <mmintrin.h>
         #include <emmintrin.h>
         #include <xmmintrin.h>
         #ifdef  EIGEN_VECTORIZE_SSE3
@@ -259,6 +260,11 @@
 // for min/max:
 #include <algorithm>
 
+// for std::is_nothrow_move_assignable
+#ifdef EIGEN_INCLUDE_TYPE_TRAITS
+#include <type_traits>
+#endif
+
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
@@ -332,8 +338,8 @@ using std::ptrdiff_t;
 
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
-#include "src/Core/SpecialFunctions.h"
 #include "src/Core/GenericPacketMath.h"
+#include "src/Core/MathFunctionsImpl.h"
 
 #if defined EIGEN_VECTORIZE_AVX512
   #include "src/Core/arch/SSE/PacketMath.h"
@@ -368,23 +374,29 @@ using std::ptrdiff_t;
   #include "src/Core/arch/ZVector/Complex.h"
 #endif
 
+// Half float support
 #include "src/Core/arch/CUDA/Half.h"
+#include "src/Core/arch/CUDA/PacketMathHalf.h"
+#include "src/Core/arch/CUDA/TypeCasting.h"
 
 #if defined EIGEN_VECTORIZE_CUDA
   #include "src/Core/arch/CUDA/PacketMath.h"
-  #include "src/Core/arch/CUDA/PacketMathHalf.h"
   #include "src/Core/arch/CUDA/MathFunctions.h"
-  #include "src/Core/arch/CUDA/TypeCasting.h"
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
 
+#include "src/Core/functors/TernaryFunctors.h"
 #include "src/Core/functors/BinaryFunctors.h"
 #include "src/Core/functors/UnaryFunctors.h"
 #include "src/Core/functors/NullaryFunctors.h"
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"
 
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#include "src/Core/arch/CUDA/Complex.h"
+
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -411,6 +423,7 @@ using std::ptrdiff_t;
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
+#include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index ea93eb303..009e529e1 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -32,6 +32,7 @@
   * \endcode
   */
 
+#include "src/misc/RealSvd2x2.h"
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
@@ -44,9 +45,10 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Eigenvalues/RealSchur_MKL.h"
-#include "src/Eigenvalues/ComplexSchur_MKL.h"
-#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/Eigenvalues/RealSchur_LAPACKE.h"
+#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/LU b/Eigen/LU
index 2d70c92de..6f6c55629 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -28,7 +28,8 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/LU/PartialPivLU_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
diff --git a/Eigen/QR b/Eigen/QR
index 25c781cc1..80838e3bd 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -36,8 +36,9 @@
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/QR/HouseholderQR_MKL.h"
-#include "src/QR/ColPivHouseholderQR_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/QR/HouseholderQR_LAPACKE.h"
+#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/SVD b/Eigen/SVD
index b353f3f54..86143c23d 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -31,12 +31,14 @@
   * \endcode
   */
 
+#include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "src/SVD/JacobiSVD_MKL.h"
+#include "src/misc/lapacke.h"
+#include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/SuperLUSupport b/Eigen/SuperLUSupport
index 113f58ee5..59312a82d 100644
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@@ -43,7 +43,7 @@ namespace Eigen { struct SluMatrix; }
   * - class SuperLU: a supernodal sequential LU factorization.
   * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
   *
-  * \warning This wrapper is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+  * \warning This wrapper requires at least version 4.0 of SuperLU. The 3.x versions are not supported.
   *
   * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
   *
diff --git a/Eigen/src/CMakeLists.txt b/Eigen/src/CMakeLists.txt
deleted file mode 100644
index c326f374d..000000000
--- a/Eigen/src/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB Eigen_src_subdirectories "*")
-escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
-foreach(f ${Eigen_src_subdirectories})
-  if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" )
-    add_subdirectory(${f})
-  endif()
-endforeach()
diff --git a/Eigen/src/Cholesky/CMakeLists.txt b/Eigen/src/Cholesky/CMakeLists.txt
deleted file mode 100644
index d01488b41..000000000
--- a/Eigen/src/Cholesky/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Cholesky_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Cholesky_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Cholesky COMPONENT Devel
-  )
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 538aff956..fcee7b2e3 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -43,6 +43,8 @@ namespace internal {
   * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
   * decomposition to determine whether a system of equations has a solution.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
@@ -52,7 +54,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
       UpLo = _UpLo
@@ -61,7 +62,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
     typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
     typedef typename MatrixType::StorageIndex StorageIndex;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
+    typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
 
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
@@ -97,6 +98,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
     /** \brief Constructor with decomposition
       *
       * This calculates the decomposition for the input \a matrix.
+      *
       * \sa LDLT(Index size)
       */
     template<typename InputType>
@@ -110,6 +112,23 @@ template<typename _MatrixType, int _UpLo> class LDLT
       compute(matrix.derived());
     }
 
+    /** \brief Constructs an LDLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is an Eigen::Ref.
+      *
+      * \sa LDLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LDLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_transpositions(matrix.rows()),
+        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
+    }
+
     /** Clear any existing decomposition
      * \sa rankUpdate(w,sigma)
      */
@@ -234,7 +253,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Success;
+      return m_info;
     }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -262,6 +281,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
     TmpMatrixType m_temporary;
     internal::SignMatrix m_sign;
     bool m_isInitialized;
+    ComputationInfo m_info;
 };
 
 namespace internal {
@@ -279,6 +299,8 @@ template<> struct ldlt_inplace<Lower>
     typedef typename TranspositionType::StorageIndex IndexType;
     eigen_assert(mat.rows()==mat.cols());
     const Index size = mat.rows();
+    bool found_zero_pivot = false;
+    bool ret = true;
 
     if (size <= 1)
     {
@@ -337,9 +359,27 @@ template<> struct ldlt_inplace<Lower>
       // we should only make sure that we do not introduce INF or NaN values.
       // Remark that LAPACK also uses 0 as the cutoff value.
       RealScalar realAkk = numext::real(mat.coeffRef(k,k));
-      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+      bool pivot_is_valid = (abs(realAkk) > RealScalar(0));
+
+      if(k==0 && !pivot_is_valid)
+      {
+        // The entire diagonal is zero, there is nothing more to do
+        // except filling the transpositions, and checking whether the matrix is zero.
+        sign = ZeroSign;
+        for(Index j = 0; j<size; ++j)
+        {
+          transpositions.coeffRef(j) = IndexType(j);
+          ret = ret && (mat.col(j).tail(size-j-1).array()==Scalar(0)).all();
+        }
+        return ret;
+      }
+
+      if((rs>0) && pivot_is_valid)
         A21 /= realAkk;
 
+      if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
+      else if(!pivot_is_valid) found_zero_pivot = true;
+
       if (sign == PositiveSemiDef) {
         if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite;
       } else if (sign == NegativeSemiDef) {
@@ -350,7 +390,7 @@ template<> struct ldlt_inplace<Lower>
       }
     }
 
-    return true;
+    return ret;
   }
 
   // Reference for the algorithm: Davis and Hager, "Multiple Rank
@@ -474,7 +514,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputTyp
   m_temporary.resize(size);
   m_sign = internal::ZeroSign;
 
-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
+  m_info = internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign) ? Success : NumericalIssue;
 
   m_isInitialized = true;
   return *this;
@@ -602,7 +642,6 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
   return res;
 }
 
-#ifndef __CUDACC__
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
   * \sa MatrixBase::ldlt()
@@ -624,7 +663,6 @@ MatrixBase<Derived>::ldlt() const
 {
   return LDLT<PlainObject>(derived());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 19578b216..ddf4875ab 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -41,6 +41,8 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
   * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
  /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
@@ -54,7 +56,6 @@ template<typename _MatrixType, int _UpLo> class LLT
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
     typedef typename MatrixType::Scalar Scalar;
@@ -95,6 +96,21 @@ template<typename _MatrixType, int _UpLo> class LLT
       compute(matrix.derived());
     }
 
+    /** \brief Constructs an LLT factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is an Eigen::Ref.
+      *
+      * \sa LLT(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit LLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_isInitialized(false)
+    {
+      compute(matrix.derived());
+    }
+
     /** \returns a view of the upper triangular matrix U */
     inline typename Traits::MatrixU matrixU() const
     {
@@ -491,7 +507,6 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
   return matrixL() * matrixL().adjoint().toDenseMatrix();
 }
 
-#ifndef __CUDACC__
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
   * \sa SelfAdjointView::llt()
@@ -513,7 +528,6 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
 {
   return LLT<PlainObject,UpLo>(m_matrix);
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Cholesky/LLT_MKL.h b/Eigen/src/Cholesky/LLT_LAPACKE.h
similarity index 81%
rename from Eigen/src/Cholesky/LLT_MKL.h
rename to Eigen/src/Cholesky/LLT_LAPACKE.h
index 0d42cb5bc..bc6489e69 100644
--- a/Eigen/src/Cholesky/LLT_MKL.h
+++ b/Eigen/src/Cholesky/LLT_LAPACKE.h
@@ -25,25 +25,22 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *     LLt decomposition based on LAPACKE_?potrf function.
  ********************************************************************************
 */
 
-#ifndef EIGEN_LLT_MKL_H
-#define EIGEN_LLT_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-#include <iostream>
+#ifndef EIGEN_LLT_LAPACKE_H
+#define EIGEN_LLT_LAPACKE_H
 
 namespace Eigen { 
 
 namespace internal {
 
-template<typename Scalar> struct mkl_llt;
+template<typename Scalar> struct lapacke_llt;
 
-#define EIGEN_MKL_LLT(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template<> struct mkl_llt<EIGTYPE> \
+#define EIGEN_LAPACKE_LLT(EIGTYPE, BLASTYPE, LAPACKE_PREFIX) \
+template<> struct lapacke_llt<EIGTYPE> \
 { \
   template<typename MatrixType> \
   static inline Index potrf(MatrixType& m, char uplo) \
@@ -53,13 +50,13 @@ template<> struct mkl_llt<EIGTYPE> \
     EIGTYPE* a; \
     eigen_assert(m.rows()==m.cols()); \
     /* Set up parameters for ?potrf */ \
-    size = m.rows(); \
+    size = convert_index<lapack_int>(m.rows()); \
     StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \
     matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
     a = &(m.coeffRef(0,0)); \
-    lda = m.outerStride(); \
+    lda = convert_index<lapack_int>(m.outerStride()); \
 \
-    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
+    info = LAPACKE_##LAPACKE_PREFIX##potrf( matrix_order, uplo, size, (BLASTYPE*)a, lda ); \
     info = (info==0) ? -1 : info>0 ? info-1 : size; \
     return info; \
   } \
@@ -69,7 +66,7 @@ template<> struct llt_inplace<EIGTYPE, Lower> \
   template<typename MatrixType> \
   static Index blocked(MatrixType& m) \
   { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'L'); \
   } \
   template<typename MatrixType, typename VectorType> \
   static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@@ -80,7 +77,7 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
   template<typename MatrixType> \
   static Index blocked(MatrixType& m) \
   { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
+    return lapacke_llt<EIGTYPE>::potrf(m, 'U'); \
   } \
   template<typename MatrixType, typename VectorType> \
   static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
@@ -90,13 +87,13 @@ template<> struct llt_inplace<EIGTYPE, Upper> \
   } \
 };
 
-EIGEN_MKL_LLT(double, double, d)
-EIGEN_MKL_LLT(float, float, s)
-EIGEN_MKL_LLT(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LLT(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LLT(double, double, d)
+EIGEN_LAPACKE_LLT(float, float, s)
+EIGEN_LAPACKE_LLT(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LLT(scomplex, lapack_complex_float, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_LLT_MKL_H
+#endif // EIGEN_LLT_LAPACKE_H
diff --git a/Eigen/src/CholmodSupport/CMakeLists.txt b/Eigen/src/CholmodSupport/CMakeLists.txt
deleted file mode 100644
index 814dfa613..000000000
--- a/Eigen/src/CholmodSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CholmodSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_CholmodSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/CholmodSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 7480d1e24..0d34269fd 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -37,7 +37,7 @@ struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : tra
   * storage layout.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
   *
   * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
   */
@@ -147,9 +147,9 @@ class Array
     }
 #endif
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    Array(Array&& other)
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
       : Base(std::move(other))
     {
       Base::_check_template_params();
@@ -157,7 +157,7 @@ class Array
         Base::_set_noalias(other);
     }
     EIGEN_DEVICE_FUNC
-    Array& operator=(Array&& other)
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
       other.swap(*this);
       return *this;
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index 0443e3032..f0232f65e 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -32,7 +32,7 @@ template<typename ExpressionType> class MatrixWrapper;
   * \tparam Derived is the derived type, e.g., an array or an expression type.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
   *
   * \sa class MatrixBase, \ref TopicClassHierarchy
   */
@@ -52,8 +52,6 @@ template<typename Derived> class ArrayBase
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
     typedef DenseBase<Derived> Base;
-    using Base::operator*;
-    using Base::operator/;
     using Base::RowsAtCompileTime;
     using Base::ColsAtCompileTime;
     using Base::SizeAtCompileTime;
@@ -89,6 +87,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
@@ -99,6 +98,7 @@ template<typename Derived> class ArrayBase
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -178,7 +178,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -191,7 +191,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -217,7 +217,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index b1193e421..b7cc7c0e9 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -75,23 +75,24 @@ private:
     DstIsRowMajor = DstFlags&RowMajorBit,
     SrcIsRowMajor = SrcFlags&RowMajorBit,
     StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
-    MightVectorize = StorageOrdersAgree
+    MightVectorize = bool(StorageOrdersAgree)
                   && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
-                  && (functor_traits<AssignFunc>::PacketAccess),
+                  && bool(functor_traits<AssignFunc>::PacketAccess),
     MayInnerVectorize  = MightVectorize
                        && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
                        && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
-                       && int(JointAlignment)>=int(InnerRequiredAlignment),
-    MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
+    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
       /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
          so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize)
+    MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
       /* slice vectorization can be slow, so we only want it if the slices are big, which is
          indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
+         in a fixed-size matrix.
+         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
   };
 
 public:
@@ -116,9 +117,9 @@ private:
                         : 1,
     UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
     MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
-                       && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
+                       && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
     MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit)
+                       && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
   };
 
 public:
@@ -130,11 +131,17 @@ public:
                                              : int(NoUnrolling)
                   )
               : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling)
-                                                                                             : int(NoUnrolling) )
+                ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
+                          ? int(CompleteUnrolling)
+                          : int(NoUnrolling) )
               : int(Traversal) == int(LinearTraversal)
                 ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
                                               : int(NoUnrolling) )
+#if EIGEN_UNALIGNED_VECTORIZE
+              : int(Traversal) == int(SliceVectorizedTraversal)
+                ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
+                                         : int(NoUnrolling) )
+#endif
               : int(NoUnrolling)
   };
 
@@ -156,6 +163,7 @@ public:
     EIGEN_DEBUG_VAR(InnerMaxSize)
     EIGEN_DEBUG_VAR(LinearPacketSize)
     EIGEN_DEBUG_VAR(InnerPacketSize)
+    EIGEN_DEBUG_VAR(ActualPacketSize)
     EIGEN_DEBUG_VAR(StorageOrdersAgree)
     EIGEN_DEBUG_VAR(MightVectorize)
     EIGEN_DEBUG_VAR(MayLinearize)
@@ -163,6 +171,7 @@ public:
     EIGEN_DEBUG_VAR(MayLinearVectorize)
     EIGEN_DEBUG_VAR(MaySliceVectorize)
     std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
     EIGEN_DEBUG_VAR(UnrollingLimit)
     EIGEN_DEBUG_VAR(MayUnrollCompletely)
     EIGEN_DEBUG_VAR(MayUnrollInner)
@@ -256,13 +265,13 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
   enum {
     outer = Index / DstXprType::InnerSizeAtCompileTime,
     inner = Index % DstXprType::InnerSizeAtCompileTime,
-    JointAlignment = Kernel::AssignmentTraits::JointAlignment,
-    DefaultAlignment = unpacket_traits<PacketType>::alignment
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
   };
 
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
-    kernel.template assignPacketByOuterInner<DefaultAlignment, JointAlignment, PacketType>(outer, inner);
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
     enum { NextIndex = Index + unpacket_traits<PacketType>::size };
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
   }
@@ -274,23 +283,20 @@ struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
 
-template<typename Kernel, int Index_, int Stop>
+template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
 struct copy_using_evaluator_innervec_InnerUnrolling
 {
   typedef typename Kernel::PacketType PacketType;
-  enum {
-    DefaultAlignment = unpacket_traits<PacketType>::alignment
-  };
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
   {
-    kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, Index_);
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
     enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
-    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
   }
 };
 
-template<typename Kernel, int Stop>
-struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
+template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
 {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
 };
@@ -419,9 +425,10 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
     
     enum { size = DstXprType::SizeAtCompileTime,
-           packetSize = packet_traits<typename Kernel::Scalar>::size,
+           packetSize =unpacket_traits<PacketType>::size,
            alignedSize = (size/packetSize)*packetSize };
 
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
@@ -438,7 +445,8 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
 {
   typedef typename Kernel::PacketType PacketType;
   enum {
-    DefaultAlignment = unpacket_traits<PacketType>::alignment
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
   };
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
@@ -447,7 +455,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
     const Index packetSize = unpacket_traits<PacketType>::size;
     for(Index outer = 0; outer < outerSize; ++outer)
       for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, inner);
+        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
   }
 };
 
@@ -467,9 +475,11 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::AssignmentTraits Traits;
     const Index outerSize = kernel.outerSize();
     for(Index outer = 0; outer < outerSize; ++outer)
-      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
+                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
   }
 };
 
@@ -518,7 +528,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
                                : int(Kernel::AssignmentTraits::DstAlignment)
     };
     const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
-    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
+    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
     {
       // the pointer is not aligend-on scalar, so alignment is not possible
       return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
@@ -549,6 +559,29 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
   }
 };
 
+#if EIGEN_UNALIGNED_VECTORIZE
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+
+    enum { size = DstXprType::InnerSizeAtCompileTime,
+           packetSize =unpacket_traits<PacketType>::size,
+           vectorizableSize = (size/packetSize)*packetSize };
+
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
+    {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
+    }
+  }
+};
+#endif
+
+
 /***************************************************************************
 * Part 4 : Generic dense assignment kernel
 ***************************************************************************/
@@ -676,14 +709,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstX
     
   typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
   dense_assignment_loop<Kernel>::run(kernel);
 }
 
 template<typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
 {
-  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
 }
 
 /***************************************************************************
@@ -705,7 +738,7 @@ template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Ki
 // This is the main assignment class
 template< typename DstXprType, typename SrcXprType, typename Functor,
           typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
-          typename Scalar = typename DstXprType::Scalar>
+          typename EnableIf = void>
 struct Assignment;
 
 
@@ -718,13 +751,13 @@ template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment(Dst& dst, const Src& src)
 {
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment(const Dst& dst, const Src& src)
 {
-  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
                      
 // Deal with "assume-aliasing"
@@ -783,7 +816,7 @@ template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment_no_alias(Dst& dst, const Src& src)
 {
-  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 
 template<typename Dst, typename Src, typename Func>
@@ -805,15 +838,17 @@ template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
 {
-  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
 
 // forward declaration
 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
 
 // Generic Dense to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
@@ -830,11 +865,13 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
 
 // Generic assignment through evalTo.
 // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
 {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
     src.evalTo(dst);
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
old mode 100644
new mode 100755
index 897187a30..6c2ab9264
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -81,10 +81,10 @@ class vml_assign_traits
 
 #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
   template< typename DstXprType, typename SrcXprNested>                                                                         \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,             \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {    \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
     typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                             \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                   \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
         VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
@@ -138,22 +138,24 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
 
 #define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
-  template< typename DstXprType, typename SrcXprNested>                                                                       \
-  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,           \
-                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {  \
-    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                          \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                           \
+  template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
+  struct Assignment<DstXprType, CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                       \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> >, assign_op<EIGENTYPE,EIGENTYPE>,    \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                 \
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
-      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.functor().m_exponent);                                          \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
       {                                                                                                                       \
-        VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent,                                           \
+        VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
               (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
       } else {                                                                                                                \
         const Index outerSize = dst.outerSize();                                                                              \
         for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
-          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                           \
-                                                      &(src.nestedExpression().coeffRef(0, outer));                           \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.lhs().coeffRef(outer,0)) :                                        \
+                                                      &(src.lhs().coeffRef(0, outer));                                        \
           EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
           VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
                  (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
diff --git a/Eigen/src/Core/CMakeLists.txt b/Eigen/src/Core/CMakeLists.txt
deleted file mode 100644
index 38c3afde9..000000000
--- a/Eigen/src/Core/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-FILE(GLOB Eigen_Core_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(products)
-ADD_SUBDIRECTORY(util)
-ADD_SUBDIRECTORY(arch)
-ADD_SUBDIRECTORY(functors)
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index 2abc6605c..d218e9814 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -80,9 +80,7 @@ struct CommaInitializer
   EIGEN_DEVICE_FUNC
   CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
   {
-    if(other.cols()==0 || other.rows()==0)
-      return *this;
-    if (m_col==m_xpr.cols())
+    if (m_col==m_xpr.cols() && (other.cols()!=0 || other.rows()!=m_currentBlockRows))
     {
       m_row+=m_currentBlockRows;
       m_col = 0;
@@ -90,15 +88,11 @@ struct CommaInitializer
       eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
         && "Too many rows passed to comma initializer (operator<<)");
     }
-    eigen_assert(m_col<m_xpr.cols()
+    eigen_assert((m_col + other.cols() <= m_xpr.cols())
       && "Too many coefficients passed to comma initializer (operator<<)");
     eigen_assert(m_currentBlockRows==other.rows());
-    if (OtherDerived::SizeAtCompileTime != Dynamic)
-      m_xpr.template block<OtherDerived::RowsAtCompileTime != Dynamic ? OtherDerived::RowsAtCompileTime : 1,
-                              OtherDerived::ColsAtCompileTime != Dynamic ? OtherDerived::ColsAtCompileTime : 1>
-                    (m_row, m_col) = other;
-    else
-      m_xpr.block(m_row, m_col, other.rows(), other.cols()) = other;
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>
+                    (m_row, m_col, other.rows(), other.cols()) = other;
     m_col += other.cols();
     return *this;
   }
@@ -109,9 +103,7 @@ struct CommaInitializer
   EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
 #endif
   {
-    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
-         && m_col == m_xpr.cols()
-         && "Too few coefficients passed to comma initializer (operator<<)");
+      finished();
   }
 
   /** \returns the built matrix once all its coefficients have been set.
@@ -122,7 +114,12 @@ struct CommaInitializer
     * \endcode
     */
   EIGEN_DEVICE_FUNC
-  inline XprType& finished() { return m_xpr; }
+  inline XprType& finished() {
+      eigen_assert(((m_row+m_currentBlockRows) == m_xpr.rows() || m_xpr.cols() == 0)
+           && m_col == m_xpr.cols()
+           && "Too few coefficients passed to comma initializer (operator<<)");
+      return m_xpr;
+  }
 
   XprType& m_xpr;           // target expression
   Index m_row;              // current row id
diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h
index 68c5e918e..aa7efdc76 100644
--- a/Eigen/src/Core/ConditionEstimator.h
+++ b/Eigen/src/Core/ConditionEstimator.h
@@ -32,33 +32,6 @@ struct rcond_compute_sign<Vector, Vector, false> {
   }
 };
 
-/** \brief Reciprocal condition number estimator.
-  *
-  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
-  * this method estimates the condition number quickly and reliably in O(n^2)
-  * operations.
-  *
-  * \returns an estimate of the reciprocal condition number
-  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
-  * its decomposition. Supports the following decompositions: FullPivLU,
-  * PartialPivLU, LDLT, and LLT.
-  *
-  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
-  */
-template <typename Decomposition>
-typename Decomposition::RealScalar
-rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
-{
-  typedef typename Decomposition::RealScalar RealScalar;
-  eigen_assert(dec.rows() == dec.cols());
-  if (dec.rows() == 0)              return RealScalar(1);
-  if (matrix_norm == RealScalar(0)) return RealScalar(0);
-  if (dec.rows() == 1)              return RealScalar(1);
-  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
-  return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
-                                               : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
-}
-
 /**
   * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
   * \a matrix that implements .solve() and .adjoint().solve() methods.
@@ -94,7 +67,15 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
   if (n == 0)
     return 0;
 
+  // Disable Index to float conversion warning
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
   Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
 
   // lower_bound is a lower bound on
   //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
@@ -151,7 +132,8 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
   // Hager's algorithm to vastly underestimate ||matrix||_1.
   Scalar alternating_sign(RealScalar(1));
   for (Index i = 0; i < n; ++i) {
-    v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    // The static_cast is needed when Scalar is a complex type and RealScalar implements expression templates
+    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
     alternating_sign = -alternating_sign;
   }
   v = dec.solve(v);
@@ -159,6 +141,33 @@ typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomp
   return numext::maxi(lower_bound, alternate_lower_bound);
 }
 
+/** \brief Reciprocal condition number estimator.
+  *
+  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
+  * this method estimates the condition number quickly and reliably in O(n^2)
+  * operations.
+  *
+  * \returns an estimate of the reciprocal condition number
+  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
+  * its decomposition. Supports the following decompositions: FullPivLU,
+  * PartialPivLU, LDLT, and LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar
+rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
+{
+  typedef typename Decomposition::RealScalar RealScalar;
+  eigen_assert(dec.rows() == dec.cols());
+  if (dec.rows() == 0)              return RealScalar(1);
+  if (matrix_norm == RealScalar(0)) return RealScalar(0);
+  if (dec.rows() == 1)              return RealScalar(1);
+  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
+  return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
+                                               : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
+}
+
 }  // namespace internal
 
 }  // namespace Eigen
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 932178f53..00c079bd8 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -41,10 +41,19 @@ template<> struct storage_kind_to_shape<TranspositionsStorage>  { typedef Transp
 // We currently distinguish the following kind of evaluators:
 // - unary_evaluator    for expressions taking only one arguments (CwiseUnaryOp, CwiseUnaryView, Transpose, MatrixWrapper, ArrayWrapper, Reverse, Replicate)
 // - binary_evaluator   for expression taking two arguments (CwiseBinaryOp)
+// - ternary_evaluator   for expression taking three arguments (CwiseTernaryOp)
 // - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires additional tags for dispatching.
 // - mapbase_evaluator  for Map, Block, Ref
 // - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
 
+template< typename T,
+          typename Arg1Kind   = typename evaluator_traits<typename T::Arg1>::Kind,
+          typename Arg2Kind   = typename evaluator_traits<typename T::Arg2>::Kind,
+          typename Arg3Kind   = typename evaluator_traits<typename T::Arg3>::Kind,
+          typename Arg1Scalar = typename traits<typename T::Arg1>::Scalar,
+          typename Arg2Scalar = typename traits<typename T::Arg2>::Scalar,
+          typename Arg3Scalar = typename traits<typename T::Arg3>::Scalar> struct ternary_evaluator;
+
 template< typename T,
           typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
           typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
@@ -328,6 +337,120 @@ protected:
 // Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
 // Likewise, there is not need to more sophisticated dispatching here.
 
+template<typename Scalar,typename NullaryOp,
+         bool has_nullary = has_nullary_operator<NullaryOp>::value,
+         bool has_unary   = has_unary_operator<NullaryOp>::value,
+         bool has_binary  = has_binary_operator<NullaryOp>::value>
+struct nullary_wrapper
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { return op(i,j); }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { return op.template packetOp<T>(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,false,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType=0, IndexType=0) const { return op(); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType=0, IndexType=0) const { return op.template packetOp<T>(); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j=0) const { return op(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j=0) const { return op.template packetOp<T>(i,j); }
+};
+
+// We need the following specialization for vector-only functors assigned to a runtime vector,
+// for instance, using linspace and assigning a RowVectorXd to a MatrixXd or even a row of a MatrixXd.
+// In this case, i==0 and j is used for the actual iteration.
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,true,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op(i+j);
+  }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op.template packetOp<T>(i+j);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,false> {};
+
+#if 0 && EIGEN_COMP_MSVC>0
+// Disable this ugly workaround. This is now handled in traits<Ref>::match,
+// but this piece of code might still become handy if some other weird compilation
+// errors pop up again.
+
+// MSVC exhibits a weird compilation error when
+// compiling:
+//    Eigen::MatrixXf A = MatrixXf::Random(3,3);
+//    Ref<const MatrixXf> R = 2.f*A;
+// and that has_*ary_operator<scalar_constant_op<float>> have not been instantiated yet.
+// The "problem" is that evaluator<2.f*A> is instantiated by traits<Ref>::match<2.f*A>
+// and at that time has_*ary_operator<T> returns true regardless of T.
+// Then nullary_wrapper is badly instantiated as nullary_wrapper<.,.,true,true,true>.
+// The trick is thus to defer the proper instantiation of nullary_wrapper when coeff(),
+// and packet() are really instantiated as implemented below:
+
+// This is a simple wrapper around Index to enforce the re-instantiation of
+// has_*ary_operator when needed.
+template<typename T> struct nullary_wrapper_workaround_msvc {
+  nullary_wrapper_workaround_msvc(const T&);
+  operator T()const;
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i,j);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i);
+  }
+
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i,j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i);
+  }
+};
+#endif // MSVC workaround
+
 template<typename NullaryOp, typename PlainObjectType>
 struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
   : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
@@ -347,41 +470,44 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
   };
 
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
-    : m_functor(n.functor()) 
+    : m_functor(n.functor()), m_wrapper()
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
+  template <typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  CoeffReturnType coeff(Index row, Index col) const
+  CoeffReturnType coeff(IndexType row, IndexType col) const
   {
-    return m_functor(row, col);
+    return m_wrapper(m_functor, row, col);
   }
 
+  template <typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  CoeffReturnType coeff(Index index) const
+  CoeffReturnType coeff(IndexType index) const
   {
-    return m_functor(index);
+    return m_wrapper(m_functor,index);
   }
 
-  template<int LoadMode, typename PacketType>
+  template<int LoadMode, typename PacketType, typename IndexType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index row, Index col) const
+  PacketType packet(IndexType row, IndexType col) const
   {
-    return m_functor.template packetOp<Index,PacketType>(row, col);
+    return m_wrapper.template packetOp<PacketType>(m_functor, row, col);
   }
 
-  template<int LoadMode, typename PacketType>
+  template<int LoadMode, typename PacketType, typename IndexType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index index) const
+  PacketType packet(IndexType index) const
   {
-    return m_functor.template packetOp<Index,PacketType>(index);
+    return m_wrapper.template packetOp<PacketType>(m_functor, index);
   }
 
 protected:
   const NullaryOp m_functor;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
 };
 
 // -------------------- CwiseUnaryOp --------------------
@@ -442,6 +568,96 @@ protected:
   evaluator<ArgType> m_argImpl;
 };
 
+// -------------------- CwiseTernaryOp --------------------
+
+// this is a ternary expression
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+  : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
+  : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<Arg1>::CoeffReadCost + evaluator<Arg2>::CoeffReadCost + evaluator<Arg3>::CoeffReadCost + functor_traits<TernaryOp>::Cost,
+    
+    Arg1Flags = evaluator<Arg1>::Flags,
+    Arg2Flags = evaluator<Arg2>::Flags,
+    Arg3Flags = evaluator<Arg3>::Flags,
+    SameType = is_same<typename Arg1::Scalar,typename Arg2::Scalar>::value && is_same<typename Arg1::Scalar,typename Arg3::Scalar>::value,
+    StorageOrdersAgree = (int(Arg1Flags)&RowMajorBit)==(int(Arg2Flags)&RowMajorBit) && (int(Arg1Flags)&RowMajorBit)==(int(Arg3Flags)&RowMajorBit),
+    Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) & (
+        HereditaryBits
+        | (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) &
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<TernaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(
+        EIGEN_PLAIN_ENUM_MIN(evaluator<Arg1>::Alignment, evaluator<Arg2>::Alignment),
+        evaluator<Arg3>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_arg1Impl(xpr.arg1()), 
+      m_arg2Impl(xpr.arg2()), 
+      m_arg3Impl(xpr.arg3())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(row, col),
+                              m_arg2Impl.template packet<LoadMode,PacketType>(row, col),
+                              m_arg3Impl.template packet<LoadMode,PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(index),
+                              m_arg2Impl.template packet<LoadMode,PacketType>(index),
+                              m_arg3Impl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+  const TernaryOp m_functor;
+  evaluator<Arg1> m_arg1Impl;
+  evaluator<Arg2> m_arg2Impl;
+  evaluator<Arg3> m_arg3Impl;
+};
+
 // -------------------- CwiseBinaryOp --------------------
 
 // this is a binary expression
@@ -601,73 +817,79 @@ struct mapbase_evaluator : evaluator_base<Derived>
     ColsAtCompileTime = XprType::ColsAtCompileTime,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
-  
+
   EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
-    : m_data(const_cast<PointerType>(map.data())),  
-      m_xpr(map)
+    : m_data(const_cast<PointerType>(map.data())),
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
   {
     EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
- 
+
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index row, Index col) const 
+  PacketType packet(Index row, Index col) const
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::ploadt<PacketType, LoadMode>(ptr);
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index index) const 
+  PacketType packet(Index index) const
   {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index row, Index col, const PacketType& x) 
+  void writePacket(Index row, Index col, const PacketType& x)
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketType& x) 
+  void writePacket(Index index, const PacketType& x)
   {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
- 
 protected:
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+
   PointerType m_data;
-  const XprType& m_xpr;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
 template<typename PlainObjectType, int MapOptions, typename StrideType> 
@@ -755,9 +977,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
     OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
                              ? int(outer_stride_at_compile_time<ArgType>::ret)
                              : int(inner_stride_at_compile_time<ArgType>::ret),
-    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
-                       && (InnerStrideAtCompileTime == 1)
-                        ? PacketAccessBit : 0,
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
     
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
     FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@@ -884,7 +1104,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
     : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
   {
     // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
-    eigen_assert(((size_t(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+    eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
   }
 };
 
@@ -1325,7 +1545,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
     
-    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit,
+    Flags = (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
     
     Alignment = 0
   };
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 39820fd7d..aa3297354 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -160,7 +160,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -173,7 +173,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 3c6508cd0..25c3ef3d7 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -20,7 +20,8 @@ struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectT
     Flags = traits<PlainObjectType>::Flags & RowMajorBit
   };
 };
-}
+
+} // namespace internal
 
 /** \class CwiseNullaryOp
   * \ingroup Core_Module
@@ -37,7 +38,23 @@ struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectT
   * However, if you want to write a function returning such an expression, you
   * will need to use this class.
   *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
+  * The functor NullaryOp must expose one of the following methods:
+    <table class="manual">
+    <tr            ><td>\c operator()() </td><td>if the procedural generation does not depend on the coefficient entries (e.g., random numbers)</td></tr>
+    <tr class="alt"><td>\c operator()(Index i)</td><td>if the procedural generation makes sense for vectors only and depends on the coefficient index \c i (e.g., linspace) </td></tr>
+    <tr            ><td>\c operator()(Index i,Index j)</td><td>if the procedural generation depends on the matrix coordinates \c i, \c j (e.g., to generate a checkerboard with 0 and 1)</td></tr>
+    </table>
+  * It is also possible to expose the last two operators if the generation makes sense for matrices but can be optimized for vectors.
+  *
+  * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
+  * C++11 random number generators.
+  *
+  * A nullary expression can also be used to implement custom sophisticated matrix manipulations
+  * that cannot be covered by the existing set of natively supported matrix manipulations.
+  * See this \ref TopicCustomizing_NullaryExpr "page" for some examples and additional explanations
+  * on the behavior of CwiseNullaryOp.
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr
   */
 template<typename NullaryOp, typename PlainObjectType>
 class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
@@ -62,30 +79,6 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return m_functor(rowId, colId);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_functor.packetOp(rowId, colId);
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return m_functor(index);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return m_functor.packetOp(index);
-    }
-
     /** \returns the functor representing the nullary operation */
     EIGEN_DEVICE_FUNC
     const NullaryOp& functor() const { return m_functor; }
@@ -227,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
   * and yields faster code than the random access version.
   *
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -396,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
   * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
diff --git a/Eigen/src/Core/CwiseTernaryOp.h b/Eigen/src/Core/CwiseTernaryOp.h
new file mode 100644
index 000000000..9f3576fec
--- /dev/null
+++ b/Eigen/src/Core/CwiseTernaryOp.h
@@ -0,0 +1,197 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_TERNARY_OP_H
+#define EIGEN_CWISE_TERNARY_OP_H
+
+namespace Eigen {
+
+namespace internal {
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct traits<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
+  // we must not inherit from traits<Arg1> since it has
+  // the potential to cause problems with MSVC
+  typedef typename remove_all<Arg1>::type Ancestor;
+  typedef typename traits<Ancestor>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
+  };
+
+  // even though we require Arg1, Arg2, and Arg3 to have the same scalar type
+  // (see CwiseTernaryOp constructor),
+  // we still want to handle the case when the result type is different.
+  typedef typename result_of<TernaryOp(
+      const typename Arg1::Scalar&, const typename Arg2::Scalar&,
+      const typename Arg3::Scalar&)>::type Scalar;
+
+  typedef typename internal::traits<Arg1>::StorageKind StorageKind;
+  typedef typename internal::traits<Arg1>::StorageIndex StorageIndex;
+
+  typedef typename Arg1::Nested Arg1Nested;
+  typedef typename Arg2::Nested Arg2Nested;
+  typedef typename Arg3::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  enum { Flags = _Arg1Nested::Flags & RowMajorBit };
+};
+}  // end namespace internal
+
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl;
+
+/** \class CwiseTernaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise ternary operator is
+ * applied to three expressions
+  *
+  * \tparam TernaryOp template functor implementing the operator
+  * \tparam Arg1Type the type of the first argument
+  * \tparam Arg2Type the type of the second argument
+  * \tparam Arg3Type the type of the third argument
+  *
+  * This class represents an expression where a coefficient-wise ternary
+ * operator is applied to three expressions.
+  * It is the return type of ternary operators, by which we mean only those
+ * ternary operators where
+  * all three arguments are Eigen expressions.
+  * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
+ * CwiseTernaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically
+ * don't have to name
+  * CwiseTernaryOp types explicitly.
+  *
+  * \sa MatrixBase::ternaryExpr(const MatrixBase<Argument2> &, const
+ * MatrixBase<Argument3> &, const CustomTernaryOp &) const, class CwiseBinaryOp,
+ * class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template <typename TernaryOp, typename Arg1Type, typename Arg2Type,
+          typename Arg3Type>
+class CwiseTernaryOp : public CwiseTernaryOpImpl<
+                           TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+                           typename internal::traits<Arg1Type>::StorageKind>,
+                       internal::no_assignment_operator
+{
+ public:
+  typedef typename internal::remove_all<Arg1Type>::type Arg1;
+  typedef typename internal::remove_all<Arg2Type>::type Arg2;
+  typedef typename internal::remove_all<Arg3Type>::type Arg3;
+
+  typedef typename CwiseTernaryOpImpl<
+      TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+      typename internal::traits<Arg1Type>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseTernaryOp)
+
+  typedef typename internal::ref_selector<Arg1Type>::type Arg1Nested;
+  typedef typename internal::ref_selector<Arg2Type>::type Arg2Nested;
+  typedef typename internal::ref_selector<Arg3Type>::type Arg3Nested;
+  typedef typename internal::remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename internal::remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename internal::remove_reference<Arg3Nested>::type _Arg3Nested;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2,
+                                     const Arg3& a3,
+                                     const TernaryOp& func = TernaryOp())
+      : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) {
+    // require the sizes to match
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2)
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3)
+
+    // The storage kinds should match
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+
+    eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() &&
+                 a1.rows() == a3.rows() && a1.cols() == a3.cols());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index rows() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                RowsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                RowsAtCompileTime == Dynamic)
+      return m_arg3.rows();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     RowsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     RowsAtCompileTime == Dynamic)
+      return m_arg2.rows();
+    else
+      return m_arg1.rows();
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index cols() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                ColsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                ColsAtCompileTime == Dynamic)
+      return m_arg3.cols();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     ColsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     ColsAtCompileTime == Dynamic)
+      return m_arg2.cols();
+    else
+      return m_arg1.cols();
+  }
+
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg1Nested& arg1() const { return m_arg1; }
+  /** \returns the second argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg2Nested& arg2() const { return m_arg2; }
+  /** \returns the third argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg3Nested& arg3() const { return m_arg3; }
+  /** \returns the functor representing the ternary operation */
+  EIGEN_DEVICE_FUNC
+  const TernaryOp& functor() const { return m_functor; }
+
+ protected:
+  Arg1Nested m_arg1;
+  Arg2Nested m_arg2;
+  Arg3Nested m_arg3;
+  const TernaryOp m_functor;
+};
+
+// Generic API dispatcher
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl
+    : public internal::generic_xpr_base<
+          CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type {
+ public:
+  typedef typename internal::generic_xpr_base<
+      CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type Base;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CWISE_TERNARY_OP_H
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 5a38e5f22..c110bbf11 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -34,17 +34,15 @@ static inline void check_DenseIndex_is_signed() {
   * \tparam Derived is the derived type, e.g., a matrix type or an expression.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
   *
   * \sa \blank \ref TopicClassHierarchy
   */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived, typename internal::traits<Derived>::Scalar,
-                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
-                                            DenseCoeffsBase<Derived> >
-#else
   : public DenseCoeffsBase<Derived>
+#else
+  : public DenseCoeffsBase<Derived,DirectWriteAccessors>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 {
   public:
@@ -73,10 +71,8 @@ template<typename Derived> class DenseBase
     typedef Scalar value_type;
     
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef internal::special_scalar_op_base<Derived,Scalar,RealScalar, DenseCoeffsBase<Derived> > Base;
+    typedef DenseCoeffsBase<Derived> Base;
 
-    using Base::operator*;
-    using Base::operator/;
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -562,12 +558,15 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC void reverseInPlace();
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #   include "../plugins/BlockMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     // disable the use of evalTo for dense objects with a nice compilation error
     template<typename Dest>
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 340484610..82201d96a 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -67,13 +67,13 @@ struct plain_array
   template<typename PtrType>
   EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #else
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
+    eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
@@ -362,9 +362,9 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
       }
       return *this;
     }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
       : m_data(std::move(other.m_data))
       , m_rows(std::move(other.m_rows))
       , m_cols(std::move(other.m_cols))
@@ -374,7 +374,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
       other.m_cols = 0;
     }
     EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
       using std::swap;
       swap(m_data, other.m_data);
@@ -441,9 +441,9 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
       }
       return *this;
     }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
       : m_data(std::move(other.m_data))
       , m_cols(std::move(other.m_cols))
     {
@@ -451,7 +451,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
       other.m_cols = 0;
     }
     EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
       using std::swap;
       swap(m_data, other.m_data);
@@ -514,9 +514,9 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
       }
       return *this;
     }    
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    DenseStorage(DenseStorage&& other)
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
       : m_data(std::move(other.m_data))
       , m_rows(std::move(other.m_rows))
     {
@@ -524,7 +524,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
       other.m_rows = 0;
     }
     EIGEN_DEVICE_FUNC
-    DenseStorage& operator=(DenseStorage&& other)
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
       using std::swap;
       swap(m_data, other.m_data);
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index 5a9e3abd4..92b2eee71 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -71,18 +71,17 @@ class DiagonalBase : public EigenBase<Derived>
       return InverseReturnType(diagonal().cwiseInverse());
     }
     
-    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> > ScalarMultipleReturnType;
     EIGEN_DEVICE_FUNC
-    inline const ScalarMultipleReturnType
+    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
     operator*(const Scalar& scalar) const
     {
-      return ScalarMultipleReturnType(diagonal() * scalar);
+      return DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >(diagonal() * scalar);
     }
     EIGEN_DEVICE_FUNC
-    friend inline const ScalarMultipleReturnType
+    friend inline const DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >
     operator*(const Scalar& scalar, const DiagonalBase& other)
     {
-      return ScalarMultipleReturnType(other.diagonal() * scalar);
+      return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
     }
 };
 
@@ -317,19 +316,19 @@ struct Diagonal2Dense {};
 template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
 
 // Diagonal matrix to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     dst.setZero();
     dst.diagonal() = src.diagonal();
   }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   { dst.diagonal() += src.diagonal(); }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   { dst.diagonal() -= src.diagonal(); }
 };
 
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 82d58fc0b..1d7f2262e 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -28,22 +28,24 @@ template<typename T, typename U,
 >
 struct dot_nocheck
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
   EIGEN_DEVICE_FUNC
   static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.template binaryExpr<conj_prod>(b).sum();
   }
 };
 
 template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
   EIGEN_DEVICE_FUNC
   static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+    return a.transpose().template binaryExpr<conj_prod>(b).sum();
   }
 };
 
@@ -62,7 +64,7 @@ struct dot_nocheck<T, U, true>
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -227,9 +229,12 @@ struct lpNorm_selector<Derived, 2>
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
   EIGEN_DEVICE_FUNC
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  static inline RealScalar run(const MatrixBase<Derived>& m)
   {
+    if(Derived::SizeAtCompileTime==0 || (Derived::SizeAtCompileTime==Dynamic && m.size()==0))
+      return RealScalar(0);
     return m.cwiseAbs().maxCoeff();
   }
 };
@@ -240,6 +245,8 @@ struct lpNorm_selector<Derived, Infinity>
   *          of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
   *          norm, that is the maximum of the absolute values of the coefficients of \c *this.
   *
+  * In all cases, if \c *this is empty, then the value 0 is returned.
+  *
   * \note For matrices, this function does not compute the <a href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
   *
   * \sa norm()
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index ba8e09674..f76995af9 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -138,7 +138,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -146,7 +146,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
-  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index f7c5f4276..a8c83f168 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize      = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
   
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
     ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 679b22f53..07fe0f005 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -62,6 +62,7 @@ struct default_packet_traits
     HasRsqrt  = 0,
     HasExp    = 0,
     HasLog    = 0,
+    HasLog1p  = 0,
     HasLog10  = 0,
     HasPow    = 0,
 
@@ -82,6 +83,7 @@ struct default_packet_traits
     HasErfc = 0,
     HasIGamma = 0,
     HasIGammac = 0,
+    HasBetaInc = 0,
 
     HasRound  = 0,
     HasFloor  = 0,
@@ -304,7 +306,7 @@ template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a
   // 32-bit pointer operand constraint for inlined asm
   asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
 #endif
-#elif !EIGEN_COMP_MSVC
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
   __builtin_prefetch(addr);
 #endif
 }
@@ -346,22 +348,6 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
 
-template<size_t offset, typename Packet>
-struct protate_impl
-{
-  // Empty so attempts to use this unimplemented path will fail to compile.
-  // Only specializations of this template should be used.
-};
-
-/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
-  * by the given offset, e.g. for offset == 1:
-  *     (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
-  */
-template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
-{
-  return offset ? protate_impl<offset, Packet>::run(a) : a;
-}
-
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
@@ -419,6 +405,10 @@ Packet pexp(const Packet& a) { using std::exp; return exp(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog(const Packet& a) { using std::log; return log(a); }
 
+/** \internal \returns the log1p of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog1p(const Packet& a) { return numext::log1p(a); }
+
 /** \internal \returns the log10 of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog10(const Packet& a) { using std::log10; return log10(a); }
@@ -445,38 +435,6 @@ Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
 
-/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
-
-/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
-    
-/** \internal \returns the zeta function of two arguments (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
-
-/** \internal \returns the polygamma function (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
-
-/** \internal \returns the erf(\a a) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet perf(const Packet& a) { using numext::erf; return erf(a); }
-
-/** \internal \returns the erfc(\a a) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
-
-/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
-
-/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
-
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 05ba6ddb4..769dc255c 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,13 +11,30 @@
 #ifndef EIGEN_GLOBAL_FUNCTIONS_H
 #define EIGEN_GLOBAL_FUNCTIONS_H
 
-#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
+  /** \returns an expression of the coefficient-wise DOC_OP of \a x
+
+    DOC_DETAILS
+
+    \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_##NAME">Math functions</a>, class CwiseUnaryOp
+    */ \
+  template<typename Derived> \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
+  NAME(const Eigen::ArrayBase<Derived>& x);
+
+#else
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
   template<typename Derived> \
   inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
   (NAME)(const Eigen::ArrayBase<Derived>& x) { \
     return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
   }
 
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
 #define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
   \
   template<typename Derived> \
@@ -36,47 +53,68 @@
 
 namespace Eigen
 {
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(zeta,scalar_zeta_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(polygamma,scalar_polygamma_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op,real part,\sa ArrayBase::real)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op,imaginary part,\sa ArrayBase::imag)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op,complex conjugate,\sa ArrayBase::conjugate)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op,inverse,\sa ArrayBase::inverse)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op,sine,\sa ArrayBase::sin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op,cosine,\sa ArrayBase::cos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op,tangent,\sa ArrayBase::tan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op,arc-tangent,\sa ArrayBase::atan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op,arc-sine,\sa ArrayBase::asin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op,arc-cosine,\sa ArrayBase::acos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complementary error function,\sa ArrayBase::erfc)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the given value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the given value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op,not-a-number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
   
+  /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
+    *
+    * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Derived,typename ScalarExponent>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
+#else
+  template<typename Derived,typename ScalarExponent>
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
+    return x.derived().pow(exponent);
+  }
+
   template<typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
+  inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
   pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
     return x.derived().pow(exponent);
   }
+#endif
 
   /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
     *
@@ -86,12 +124,14 @@ namespace Eigen
     * Output: \verbinclude Cwise_array_power_array.out
     * 
     * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
     */
   template<typename Derived,typename ExponentDerived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
   pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
   {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
       x.derived(),
       exponents.derived()
     );
@@ -100,66 +140,39 @@ namespace Eigen
   /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
     *
     * This function computes the coefficient-wise power between a scalar and an array of exponents.
-    * Beaware that the scalar type of the input scalar \a x and the exponents \a exponents must be the same.
+    *
+    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
     *
     * Example: \include Cwise_scalar_power_array.cpp
     * Output: \verbinclude Cwise_scalar_power_array.out
     * 
     * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
     */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Scalar,typename Derived>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
+  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& exponents);
+#else
+  template<typename Scalar, typename Derived>
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
+          const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
+  {
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
+            typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
+  }
+
   template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>
-  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents) 
+  inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
+  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
   {
-    typename Derived::ConstantReturnType constant_x(exponents.rows(), exponents.cols(), x);
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>(
-      constant_x,
-      exponents.derived()
-    );
-  }
-  
-  /**
-  * \brief Component-wise division of a scalar by array elements.
-  **/
-  template <typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>
-    operator/(const typename Derived::Scalar& s, const Eigen::ArrayBase<Derived>& a)
-  {
-    return Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>(
-      a.derived(),
-      Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>(s)  
-    );
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
+      typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
   }
+#endif
 
-  /** \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
-    *
-    * This function computes the coefficient-wise incomplete gamma function.
-    *
-    */
-  template<typename Derived,typename ExponentDerived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
-  igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
-  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
-      a.derived(),
-      x.derived()
-    );
-  }
-
-  /** \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
-    *
-    * This function computes the coefficient-wise complementary incomplete gamma function.
-    *
-    */
-  template<typename Derived,typename ExponentDerived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
-  igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
-  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
-      a.derived(),
-      x.derived()
-    );
-  }
 
   namespace internal
   {
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index dfd9097cc..94e00f58b 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -125,31 +125,17 @@ DenseBase<Derived>::format(const IOFormat& fmt) const
 
 namespace internal {
 
-template<typename Scalar, bool IsInteger>
-struct significant_decimals_default_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline int run()
-  {
-    using std::ceil;
-    using std::log;
-    return cast<RealScalar,int>(ceil(-log(NumTraits<RealScalar>::epsilon())/log(RealScalar(10))));
-  }
-};
-
-template<typename Scalar>
-struct significant_decimals_default_impl<Scalar, true>
-{
-  static inline int run()
-  {
-    return 0;
-  }
-};
-
+// NOTE: This helper is kept for backward compatibility with previous code specializing
+//       this internal::significant_decimals_impl structure. In the future we should directly
+//       call digits10() which has been introduced in July 2016 in 3.3.
 template<typename Scalar>
 struct significant_decimals_impl
-  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
-{};
+{
+  static inline int run()
+  {
+    return NumTraits<Scalar>::digits10();
+  }
+};
 
 /** \internal
   * print the matrix \a _m to the output stream \a s using the output format \a fmt */
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index f3ec84990..f303aebf9 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -50,7 +50,7 @@ public:
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
   
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}
 
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 12c464a5a..020f939ad 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -17,10 +17,20 @@
 
 namespace Eigen { 
 
-/** \class MapBase
-  * \ingroup Core_Module
+/** \ingroup Core_Module
   *
-  * \brief Base class for Map and Block expression with direct access
+  * \brief Base class for dense Map and Block expression with direct access
+  *
+  * This base class provides the const low-level accessors (e.g. coeff, coeffRef) of dense
+  * Map and Block objects with direct access.
+  * Typical users do not have to directly deal with this class.
+  *
+  * This class can be extended through the macro plugin \c EIGEN_MAPBASE_PLUGIN.
+  * See \link TopicCustomizing_Plugins customizing Eigen \endlink for details.
+  *
+  * The \c Derived class has to provide the following two methods describing the memory layout:
+  *  \code Index innerStride() const; \endcode
+  *  \code Index outerStride() const; \endcode
   *
   * \sa class Map, class Block
   */
@@ -75,7 +85,9 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
 
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
+    /** \copydoc DenseBase::rows() */
     EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    /** \copydoc DenseBase::cols() */
     EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
 
     /** Returns a pointer to the first coefficient of the matrix or vector.
@@ -86,12 +98,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       */
     EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
 
+    /** \copydoc PlainObjectBase::coeff(Index,Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index rowId, Index colId) const
     {
       return m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeff(Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index index) const
     {
@@ -99,12 +113,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       return m_data[index * innerStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return this->m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    /** \copydoc PlainObjectBase::coeffRef(Index) const */
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
@@ -112,6 +128,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       return this->m_data[index * innerStride()];
     }
 
+    /** \internal */
     template<int LoadMode>
     inline PacketScalar packet(Index rowId, Index colId) const
     {
@@ -119,6 +136,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
                (m_data + (colId * colStride() + rowId * rowStride()));
     }
 
+    /** \internal */
     template<int LoadMode>
     inline PacketScalar packet(Index index) const
     {
@@ -126,6 +144,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
     }
 
+    /** \internal Constructor for fixed size matrices or vectors */
     EIGEN_DEVICE_FUNC
     explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
     {
@@ -133,6 +152,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       checkSanity<Derived>();
     }
 
+    /** \internal Constructor for dynamically sized vectors */
     EIGEN_DEVICE_FUNC
     inline MapBase(PointerType dataPtr, Index vecSize)
             : m_data(dataPtr),
@@ -145,6 +165,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       checkSanity<Derived>();
     }
 
+    /** \internal Constructor for dynamically sized matrices */
     EIGEN_DEVICE_FUNC
     inline MapBase(PointerType dataPtr, Index rows, Index cols)
             : m_data(dataPtr), m_rows(rows), m_cols(cols)
@@ -166,7 +187,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
     void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
     {
 #if EIGEN_MAX_ALIGN_BYTES>0
-      eigen_assert((   ((size_t(m_data) % internal::traits<Derived>::Alignment) == 0)
+      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
                     || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
 #endif
     }
@@ -181,6 +202,16 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
     const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
 };
 
+/** \ingroup Core_Module
+  *
+  * \brief Base class for non-const dense Map and Block expression with direct access
+  *
+  * This base class provides the non-const low-level accessors (e.g. coeff and coeffRef) of
+  * dense Map and Block objects with direct access.
+  * It inherits MapBase<Derived, ReadOnlyAccessors> which defines the const variant for reading specific entries.
+  *
+  * \sa class Map, class Block
+  */
 template<typename Derived> class MapBase<Derived, WriteAccessors>
   : public MapBase<Derived, ReadOnlyAccessors>
 {
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 5771abf7d..8d47fb8a4 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -11,7 +11,9 @@
 #define EIGEN_MATHFUNCTIONS_H
 
 // source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
-#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406
+// TODO this should better be moved to NumTraits
+#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
+
 
 namespace Eigen {
 
@@ -95,6 +97,19 @@ struct real_default_impl<Scalar,true>
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct real_retval
 {
@@ -130,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct imag_retval
 {
@@ -457,30 +485,33 @@ struct arg_retval
 /****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
-template<typename Scalar, bool isComplex = NumTraits<Scalar>::IsComplex >
-struct log1p_impl
-{
-  static inline Scalar run(const Scalar& x)
-  {
+
+namespace std_fallback {
+  // fallback log1p implementation used when there is no log1p(Scalar) function in the namespace of Scalar,
+  // and no suitable std::log1p function is available
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     typedef typename NumTraits<Scalar>::Real RealScalar;
     EIGEN_USING_STD_MATH(log);
     Scalar x1p = RealScalar(1) + x;
     return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
   }
-};
+}
 
-#if EIGEN_HAS_CXX11_MATH
 template<typename Scalar>
-struct log1p_impl<Scalar, false> {
+struct log1p_impl {
   static inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    #if EIGEN_HAS_CXX11_MATH
     using std::log1p;
+    #endif
+    using std_fallback::log1p;
     return log1p(x);
   }
 };
-#endif
+
 
 template<typename Scalar>
 struct log1p_retval
@@ -492,24 +523,26 @@ struct log1p_retval
 * Implementation of pow                                                  *
 ****************************************************************************/
 
-template<typename Scalar, bool IsInteger>
-struct pow_default_impl
+template<typename ScalarX,typename ScalarY, bool IsInteger = NumTraits<ScalarX>::IsInteger&&NumTraits<ScalarY>::IsInteger>
+struct pow_impl
 {
-  typedef Scalar retval;
-  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y)
+  //typedef Scalar retval;
+  typedef typename ScalarBinaryOpTraits<ScalarX,ScalarY,internal::scalar_pow_op<ScalarX,ScalarY> >::ReturnType result_type;
+  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y)
   {
     EIGEN_USING_STD_MATH(pow);
     return pow(x, y);
   }
 };
 
-template<typename Scalar>
-struct pow_default_impl<Scalar, true>
+template<typename ScalarX,typename ScalarY>
+struct pow_impl<ScalarX,ScalarY, true>
 {
-  static EIGEN_DEVICE_FUNC inline Scalar run(Scalar x, Scalar y)
+  typedef ScalarX result_type;
+  static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y)
   {
-    Scalar res(1);
-    eigen_assert(!NumTraits<Scalar>::IsSigned || y >= 0);
+    ScalarX res(1);
+    eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
     if(y & 1) res *= x;
     y >>= 1;
     while(y)
@@ -522,15 +555,6 @@ struct pow_default_impl<Scalar, true>
   }
 };
 
-template<typename Scalar>
-struct pow_impl : pow_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
-
-template<typename Scalar>
-struct pow_retval
-{
-  typedef Scalar type;
-};
-
 /****************************************************************************
 * Implementation of random                                               *
 ****************************************************************************/
@@ -620,16 +644,18 @@ struct random_default_impl<Scalar, false, true>
     typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
     if(y<x)
       return x;
+    // the following difference might overflow on a 32-bit system,
+    // but since y>=x the result converted to std::size_t is still correct.
     std::size_t range = ScalarX(y)-ScalarX(x);
     std::size_t offset = 0;
     // rejection sampling
-    std::size_t divisor    = (range+RAND_MAX-1)/(range+1);
-    std::size_t multiplier = (range+RAND_MAX-1)/std::size_t(RAND_MAX);
-
+    std::size_t divisor = 1;
+    std::size_t multiplier = 1;
+    if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
+    else               multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
     do {
-      offset = ( (std::size_t(std::rand()) * multiplier) / divisor );
+      offset = (std::size_t(std::rand()) * multiplier) / divisor;
     } while (offset > range);
-
     return Scalar(ScalarX(x) + offset);
   }
 
@@ -790,6 +816,8 @@ template<typename T> EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>&
 template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
 template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
 
+template<typename T> T generic_fast_tanh_float(const T& a_x);
+
 } // end namespace internal
 
 /****************************************************************************
@@ -825,7 +853,7 @@ template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
-  return fmin(x, y);
+  return fminf(x, y);
 }
 template<typename T>
 EIGEN_DEVICE_FUNC
@@ -837,7 +865,7 @@ template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
-  return fmax(x, y);
+  return fmaxf(x, y);
 }
 #endif
 
@@ -847,7 +875,7 @@ EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
-}  
+}
 
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
@@ -926,11 +954,19 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
   return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }
 
-template<typename Scalar>
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float log1p(const float &x) { return ::log1pf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double log1p(const double &x) { return ::log1p(x); }
+#endif
+
+template<typename ScalarX,typename ScalarY>
 EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
+inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const ScalarX& x, const ScalarY& y)
 {
-  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
+  return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
 }
 
 template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
@@ -1036,6 +1072,16 @@ float abs(const float &x) { return ::fabsf(x); }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const double &x) { return ::fabs(x); }
+
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const std::complex<float>& x) {
+  return ::hypotf(x.real(), x.imag());
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const std::complex<double>& x) {
+  return ::hypot(x.real(), x.imag());
+}
 #endif
 
 template<typename T>
@@ -1181,6 +1227,11 @@ T tanh(const T &x) {
   return tanh(x);
 }
 
+#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(float x) { return internal::generic_fast_tanh_float(x); }
+#endif
+
 #ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }
@@ -1192,7 +1243,7 @@ double tanh(const double &x) { return ::tanh(x); }
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T fmod(const T& a, const T& b) {
-  EIGEN_USING_STD_MATH(floor);
+  EIGEN_USING_STD_MATH(fmod);
   return fmod(a, b);
 }
 
@@ -1287,11 +1338,12 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
new file mode 100644
index 000000000..3c9ef22fa
--- /dev/null
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONSIMPL_H
+#define EIGEN_MATHFUNCTIONSIMPL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the hyperbolic tan of \a a_x (coeff-wise)
+    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
+    is accurate up to a couple of ulp in the range [-9, 9], outside of which
+    tanh(x) = +/-1 in single precision.
+
+    This implementation works on both scalars and packets.
+*/
+template<typename T>
+T generic_fast_tanh_float(const T& a_x)
+{
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  const T plus_9 = pset1<T>(9.f);
+  const T minus_9 = pset1<T>(-9.f);
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  //      step such that if a_x is nan, x will be either 9 or -9,
+  //      and tanh will return 1 or -1 instead of nan.
+  //      This is supposed to be fixed in gcc6.3,
+  //      see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
+  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
+  const T alpha_5 = pset1<T>(1.48572235717979e-05f);
+  const T alpha_7 = pset1<T>(5.12229709037114e-08f);
+  const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
+  const T alpha_11 = pset1<T>(2.00018790482477e-13f);
+  const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(4.89352518554385e-03f);
+  const T beta_2 = pset1<T>(2.26843463243900e-03f);
+  const T beta_4 = pset1<T>(1.18534705686654e-04f);
+  const T beta_6 = pset1<T>(1.19825839466702e-06f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q.
+  T q = pmadd(x2, beta_6, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATHFUNCTIONSIMPL_H
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index bcbbbf9ae..90c336d8c 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -27,7 +27,7 @@ private:
       default_alignment = compute_default_alignment<_Scalar,max_size>::value,
       actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
       required_alignment = unpacket_traits<PacketScalar>::alignment,
-      packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
+      packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
     };
     
 public:
@@ -106,7 +106,7 @@ public:
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
   *
   * <i><b>Some notes:</b></i>
   *
@@ -268,9 +268,9 @@ class Matrix
       : Base(internal::constructor_without_unaligned_array_assert())
     { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    Matrix(Matrix&& other)
+    Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
       : Base(std::move(other))
     {
       Base::_check_template_params();
@@ -278,7 +278,7 @@ class Matrix
         Base::_set_noalias(other);
     }
     EIGEN_DEVICE_FUNC
-    Matrix& operator=(Matrix&& other)
+    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
       other.swap(*this);
       return *this;
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 1e66b4e1b..d56df8249 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -41,7 +41,7 @@ namespace Eigen {
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
   *
   * \sa \blank \ref TopicClassHierarchy
   */
@@ -80,8 +80,6 @@ template<typename Derived> class MatrixBase
     using Base::operator-=;
     using Base::operator*=;
     using Base::operator/=;
-    using Base::operator*;
-    using Base::operator/;
 
     typedef typename Base::CoeffReturnType CoeffReturnType;
     typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -100,7 +98,7 @@ template<typename Derived> class MatrixBase
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
     EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
 
     typedef typename Base::PlainObject PlainObject;
 
@@ -123,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -131,6 +130,7 @@ template<typename Derived> class MatrixBase
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -195,7 +195,7 @@ template<typename Derived> class MatrixBase
 
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
     dot(const MatrixBase<OtherDerived>& other) const;
 
     EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
@@ -330,15 +330,11 @@ template<typename Derived> class MatrixBase
 
 /////////// LU module ///////////
 
-    EIGEN_DEVICE_FUNC
     inline const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> partialPivLu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> lu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
@@ -383,7 +379,7 @@ template<typename Derived> class MatrixBase
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /// \internal helper struct to form the return type of the cross product
     template<typename OtherDerived> struct cross_product_return_type {
-      typedef typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+      typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
       typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
     };
     #endif // EIGEN_PARSED_BY_DOXYGEN
@@ -405,7 +401,6 @@ template<typename Derived> class MatrixBase
 
     inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
 
-    inline ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
     // put this as separate enum value to work around possible GCC 4.3 bug (?)
     enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
                                           : ColsAtCompileTime==1 ? Vertical : Horizontal };
@@ -418,8 +413,7 @@ template<typename Derived> class MatrixBase
     typedef Block<const Derived,
                   internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
                   internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
-    typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
-                const ConstStartMinusOne > HNormalizedReturnType;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType;
 
     inline const HNormalizedReturnType hnormalized() const;
 
diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index ffb673cee..33908010b 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -39,7 +39,7 @@ class NoAlias
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
     {
-      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
       return m_expression;
     }
     
@@ -47,7 +47,7 @@ class NoAlias
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
     {
-      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
       return m_expression;
     }
     
@@ -55,7 +55,7 @@ class NoAlias
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
     {
-      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar>());
+      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
       return m_expression;
     }
 
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index e065fa714..dd61195bc 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -12,6 +12,37 @@
 
 namespace Eigen {
 
+namespace internal {
+
+// default implementation of digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and ceil(-log10(epsilon())) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits10_impl
+{
+  static int run() { return std::numeric_limits<T>::digits10; }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,false> // Floating point
+{
+  static int run() {
+    using std::log10;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log10(NumTraits<Real>::epsilon())));
+  }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,true> // Integer
+{
+  static int run() { return 0; }
+};
+
+} // end namespace internal
+
 /** \class NumTraits
   * \ingroup Core_Module
   *
@@ -22,14 +53,16 @@ namespace Eigen {
   * This class stores enums, typedefs and static methods giving information about a numeric type.
   *
   * The provided data consists of:
-  * \li A typedef \a Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \a Real is just a typedef to \a T. If \a T is \c std::complex<U> then \a Real
+  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
+  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
   *     is a typedef to \a U.
-  * \li A typedef \a NonInteger, giving the type that should be used for operations producing non-integral values,
+  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
   *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
   *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
   *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
   *     only intended as a helper for code that needs to explicitly promote types.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  *     Of course, this type must be fully compatible with \a T. If in doubt, just use \a T here.
   * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
   *     this means, just use \a T here.
   * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
@@ -42,10 +75,14 @@ namespace Eigen {
   * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
   * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
   *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
+  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  *     returns a \a Real instead of a \a T.
   * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
   *     value by the fuzzy comparison operators.
   * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  *     which is used as the default implementation if specialized.
   */
 
 template<typename T> struct GenericNumTraits
@@ -60,23 +97,6 @@ template<typename T> struct GenericNumTraits
     MulCost = 1
   };
 
-  // Division is messy but important, because it is expensive and throughput
-  // varies significantly. The following numbers are based on min division
-  // throughput on Haswell.
-  template<bool Vectorized>
-  struct Div {
-    enum {
-#ifdef EIGEN_VECTORIZE_AVX
-      AVX = true,
-#else
-      AVX = false,
-#endif
-      Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
-          Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
-    };
-  };
-
-
   typedef T Real;
   typedef typename internal::conditional<
                      IsInteger,
@@ -84,12 +104,20 @@ template<typename T> struct GenericNumTraits
                      T
                    >::type NonInteger;
   typedef T Nested;
+  typedef T Literal;
 
   EIGEN_DEVICE_FUNC
   static inline Real epsilon()
   {
     return numext::numeric_limits<T>::epsilon();
   }
+
+  EIGEN_DEVICE_FUNC
+  static inline int digits10()
+  {
+    return internal::default_digits10_impl<T>::run();
+  }
+
   EIGEN_DEVICE_FUNC
   static inline Real dummy_precision()
   {
@@ -145,6 +173,7 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
   : GenericNumTraits<std::complex<_Real> >
 {
   typedef _Real Real;
+  typedef typename NumTraits<_Real>::Literal Literal;
   enum {
     IsComplex = 1,
     RequireInitialization = NumTraits<_Real>::RequireInitialization,
@@ -157,6 +186,8 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
   static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
   EIGEN_DEVICE_FUNC
   static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC
+  static inline int digits10() { return NumTraits<Real>::digits10(); }
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -168,6 +199,7 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
   typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
   typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
   typedef ArrayType & Nested;
+  typedef typename NumTraits<Scalar>::Literal Literal;
 
   enum {
     IsComplex = NumTraits<Scalar>::IsComplex,
@@ -185,6 +217,30 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
   static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 };
 
+template<> struct NumTraits<std::string>
+  : GenericNumTraits<std::string>
+{
+  enum {
+    RequireInitialization = 1,
+    ReadCost = HugeCost,
+    AddCost  = HugeCost,
+    MulCost  = HugeCost
+  };
+
+  static inline int digits10() { return 0; }
+
+private:
+  static inline std::string epsilon();
+  static inline std::string dummy_precision();
+  static inline std::string lowest();
+  static inline std::string highest();
+  static inline std::string infinity();
+  static inline std::string quiet_NaN();
+};
+
+// Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
+template<> struct NumTraits<void> {};
+
 } // end namespace Eigen
 
 #endif // EIGEN_NUMTRAITS_H
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index b7a4fcea8..55b4ac057 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -59,33 +59,34 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
 } // end namespace internal
 
 /** \class PlainObjectBase
+  * \ingroup Core_Module
   * \brief %Dense storage base class for matrices and arrays.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
   *
   * \sa \ref TopicClassHierarchy
   */
 #ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace internal {
+namespace doxygen {
 
 // this is a workaround to doxygen not being able to understand the inheritance logic
 // when it is hidden by the dense_xpr_base helper struct.
 /** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
+template<typename Derived> struct dense_xpr_base_dispatcher;
 /** This class is just a workaround for Doxygen and it does not not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
     : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
 /** This class is just a workaround for Doxygen and it does not not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
     : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
 
-} // namespace internal
+} // namespace doxygen
 
 template<typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base_dispatcher_for_doxygen<Derived>
+class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
 #else
 template<typename Derived>
 class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
@@ -145,6 +146,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
     {
@@ -154,12 +159,20 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
     {
       return m_storage.data()[index];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index)
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
     {
@@ -169,12 +182,18 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index)
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) for details. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
     {
       return m_storage.data()[index];
     }
 
+    /** This is the const version of coeffRef(Index,Index), which is thus a synonym for coeff(Index,Index).
+      * It is provided for convenience. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
     {
@@ -184,6 +203,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    /** This is the const version of coeffRef(Index), which is thus a synonym for coeff(Index).
+      * It is provided for convenience. */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
     {
@@ -471,15 +492,15 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     }
 #endif
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    PlainObjectBase(PlainObjectBase&& other)
+    PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT
       : m_storage( std::move(other.m_storage) )
     {
     }
 
     EIGEN_DEVICE_FUNC
-    PlainObjectBase& operator=(PlainObjectBase&& other)
+    PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
     {
       using std::swap;
       swap(m_storage, other.m_storage);
@@ -697,7 +718,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       //_resize_to_match(other);
       // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
       // it wouldn't allow to copy a row-vector into a column-vector.
-      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar>());
+      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
       return this->derived();
     }
 
@@ -713,11 +734,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     
     template<typename T0, typename T1>
     EIGEN_DEVICE_FUNC 
-    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
-      m_storage.data()[0] = val0;
-      m_storage.data()[1] = val1;
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
     }
     
     template<typename T0, typename T1>
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 8aa1de081..ae0c94b38 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -16,39 +16,6 @@ template<typename Lhs, typename Rhs, int Option, typename StorageKind> class Pro
 
 namespace internal {
 
-// Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times
-// Rhs::Scalar, but product with permutation matrices inherit the scalar of the other factor.
-template<typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape, 
-         typename RhsShape = typename evaluator_traits<Rhs>::Shape >
-struct product_result_scalar
-{
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename RhsShape>
-struct product_result_scalar<Lhs, Rhs, PermutationShape, RhsShape>
-{
-  typedef typename Rhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename LhsShape>
-  struct product_result_scalar<Lhs, Rhs, LhsShape, PermutationShape>
-{
-  typedef typename Lhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename RhsShape>
-struct product_result_scalar<Lhs, Rhs, TranspositionsShape, RhsShape>
-{
-  typedef typename Rhs::Scalar Scalar;
-};
-
-template<typename Lhs, typename Rhs, typename LhsShape>
-  struct product_result_scalar<Lhs, Rhs, LhsShape, TranspositionsShape>
-{
-  typedef typename Lhs::Scalar Scalar;
-};
-
 template<typename Lhs, typename Rhs, int Option>
 struct traits<Product<Lhs, Rhs, Option> >
 {
@@ -59,7 +26,7 @@ struct traits<Product<Lhs, Rhs, Option> >
   
   typedef MatrixXpr XprKind;
   
-  typedef typename product_result_scalar<LhsCleaned,RhsCleaned>::Scalar Scalar;
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
   typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
                                                 typename RhsTraits::StorageKind,
                                                 internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index d9fd888cf..63faca822 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -35,22 +35,28 @@ struct evaluator<Product<Lhs, Rhs, Options> >
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
  
-// Catch scalar * ( A * B ) and transform it to (A*scalar) * B
+// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
 // TODO we should apply that rule only if that's really helpful
-template<typename Lhs, typename Rhs, typename Scalar>
-struct evaluator_assume_aliasing<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > >
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                                               const Product<Lhs, Rhs, DefaultProduct> > >
 {
   static const bool value = true;
 };
-template<typename Lhs, typename Rhs, typename Scalar>
-struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > > 
- : public evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> >
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > >
+ : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> >
 {
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > XprType;
-  typedef evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> > Base;
-  
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
+  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
+
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
-    : Base(xpr.functor().m_other * xpr.nestedExpression().lhs() * xpr.nestedExpression().rhs())
+    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
   {}
 };
 
@@ -122,13 +128,17 @@ protected:
   PlainObject m_result;
 };
 
+// The following three shortcuts are enabled only if the scalar types match exactly.
+// TODO: we could enable them for different scalar types when the product is not vectorized.
+
 // Dense = Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
@@ -137,11 +147,12 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
 
 // Dense += Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
   {
     // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
@@ -150,11 +161,12 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
 
 // Dense -= Product
 template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar>, Dense2Dense,
-  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
   {
     // FIXME shall we handle nested_eval here?
     generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
@@ -165,55 +177,57 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
 // Dense ?= scalar * Product
 // TODO we should apply that rule if that's really helpful
 // for instance, this is not good for inner products
-template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis>
-struct Assignment<DstXprType, CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense, Scalar>
+template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
 {
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
-                       const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
   {
-    call_assignment_no_alias(dst, (src.functor().m_other * src.nestedExpression().lhs())*src.nestedExpression().rhs(), func);
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
   }
 };
 
 //----------------------------------------
 // Catch "Dense ?= xpr + Product<>" expression to save one temporary
 // FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
-// TODO enable it for "Dense ?= xpr - Product<>" as well.
 
 template<typename OtherXpr, typename Lhs, typename Rhs>
-struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar>, const OtherXpr,
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
                                                const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
   static const bool value = true;
 };
 
-template<typename DstXprType, typename OtherXpr, typename ProductType, typename Scalar, typename Func1, typename Func2>
-struct assignment_from_xpr_plus_product
+template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
+struct assignment_from_xpr_op_product
 {
-  typedef CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr, const ProductType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const Func1& func)
+  template<typename SrcXprType, typename InitialFunc>
+  static EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
   {
-    call_assignment_no_alias(dst, src.lhs(), func);
+    call_assignment_no_alias(dst, src.lhs(), Func1());
     call_assignment_no_alias(dst, src.rhs(), Func2());
   }
 };
 
-template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::assign_op<Scalar>, Dense2Dense>
-  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::assign_op<Scalar>, internal::add_assign_op<Scalar> >
-{};
-template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::add_assign_op<Scalar>, Dense2Dense>
-  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::add_assign_op<Scalar>, internal::add_assign_op<Scalar> >
-{};
-template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
-                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::sub_assign_op<Scalar>, Dense2Dense>
-  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::sub_assign_op<Scalar>, internal::sub_assign_op<Scalar> >
-{};
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \
+  template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<OtherScalar,ProdScalar>, const OtherXpr, \
+                                            const Product<Lhs,Rhs,DefaultProduct> >, internal::ASSIGN_OP<DstScalar,SrcScalar>, Dense2Dense> \
+    : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, internal::ASSIGN_OP<DstScalar,OtherScalar>, internal::ASSIGN_OP2<DstScalar,ProdScalar> > \
+  {}
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op);
+
 //----------------------------------------
 
 template<typename Lhs, typename Rhs>
@@ -243,7 +257,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 
 // Column major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
   evaluator<Rhs> rhsEval(rhs);
   typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
@@ -251,12 +265,12 @@ EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, cons
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
   for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }
 
 // Row major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
   evaluator<Lhs> lhsEval(lhs);
   typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
@@ -264,7 +278,7 @@ EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, cons
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
   for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }
 
 template<typename Lhs, typename Rhs>
@@ -319,19 +333,19 @@ struct generic_product_impl_base
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
   
   template<typename Dst>
-  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
 
   template<typename Dst>
-  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
 
   template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
   
   template<typename Dst>
-  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
 
 };
@@ -345,7 +359,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
   typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
 
   template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     internal::gemv_dense_selector<Side,
                             (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
@@ -360,25 +374,25 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
   
   template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // Same as: dst.noalias() = lhs.lazyProduct(rhs);
     // but easier on the compiler side
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
   }
   
   template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // dst.noalias() += lhs.lazyProduct(rhs);
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
   }
   
   template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // dst.noalias() -= lhs.lazyProduct(rhs);
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<Scalar>());
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
   
 //   template<typename Dst>
@@ -423,6 +437,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment=            " << Alignment << "\n";
+    std::cerr << "Flags=                " << Flags << "\n";
+#endif
   }
 
   // Everything below here is taken from CoeffBasedProduct.h
@@ -473,15 +499,12 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
       
     SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
 
-    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                    && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ),
-
-    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                    && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ),
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),
 
     EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                     : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                    : (RhsRowMajor && !CanVectorizeLhs),
+                    : (bool(RhsRowMajor) && !CanVectorizeLhs),
 
     Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
           | (EvalToRowMajor ? RowMajorBit : 0)
@@ -492,8 +515,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
     RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
 
-    Alignment = CanVectorizeLhs ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : CanVectorizeRhs ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
               : 0,
 
     /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
@@ -519,8 +542,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    */
   EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
   {
-    const Index row = RowsAtCompileTime == 1 ? 0 : index;
-    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
     return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
   }
 
@@ -538,8 +561,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   template<int LoadMode, typename PacketType>
   const PacketType packet(Index index) const
   {
-    const Index row = RowsAtCompileTime == 1 ? 0 : index;
-    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
     return packet<LoadMode,PacketType>(row,col);
   }
 
@@ -579,7 +602,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
   }
 };
 
@@ -589,7 +612,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
   }
 };
 
@@ -598,7 +621,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
   }
 };
 
@@ -607,7 +630,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
 
@@ -616,7 +639,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -625,7 +648,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -634,7 +657,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
   }
@@ -645,7 +668,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }
@@ -730,7 +753,7 @@ template<typename MatrixType, typename DiagonalType, typename Derived, int Produ
 struct diagonal_product_evaluator_base
   : evaluator_base<Derived>
 {
-   typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
 public:
   enum {
     CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index 02038e9e3..6faf789c7 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -16,8 +16,7 @@ namespace internal {
 
 template<typename Scalar> struct scalar_random_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
-  template<typename Index>
-  inline const Scalar operator() (Index, Index = 0) const { return random<Scalar>(); }
+  inline const Scalar operator() () const { return random<Scalar>(); }
 };
 
 template<typename Scalar>
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 98b2fd868..b6e8f8887 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -38,8 +38,8 @@ public:
   enum {
     MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
-    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize
+    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
   };
 
 public:
@@ -425,7 +425,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_min_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
 }
 
 /** \returns the maximum of all coefficients of \c *this.
@@ -435,10 +435,12 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_max_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
 }
 
-/** \returns the sum of all coefficients of *this
+/** \returns the sum of all coefficients of \c *this
+  *
+  * If \c *this is empty, then the value 0 is returned.
   *
   * \sa trace(), prod(), mean()
   */
@@ -448,7 +450,7 @@ DenseBase<Derived>::sum() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
     return Scalar(0);
-  return derived().redux(Eigen::internal::scalar_sum_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
 }
 
 /** \returns the mean of all coefficients of *this
@@ -459,7 +461,14 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
-  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
 }
 
 /** \returns the product of all coefficients of *this
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 6e94181f3..bdf24f52a 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -35,7 +35,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
                       || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
       OuterStrideMatch = Derived::IsVectorAtCompileTime
                       || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (int(evaluator<Derived>::Alignment) >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
+      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
+      // to work around a very strange bug in MSVC related to the instantiation
+      // of has_*ary_operator in evaluator<CwiseNullaryOp>.
+      // This line is surprisingly very sensitive. For instance, simply adding parentheses
+      // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
+      DerivedAlignment = int(evaluator<Derived>::Alignment),
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (DerivedAlignment >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
       ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
       MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
     };
@@ -262,7 +268,7 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
     template<typename Expression>
     EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
     {
-      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar>());
+      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar,Scalar>());
       Base::construct(m_object);
     }
 
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 9fda02691..62d4180da 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -55,6 +55,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     typedef TriangularBase<SelfAdjointView> Base;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+    typedef MatrixTypeNestedCleaned NestedExpression;
 
     /** \brief The type of coefficients in this matrix */
     typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
@@ -128,7 +129,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     }
     
     friend EIGEN_DEVICE_FUNC
-    const SelfAdjointView<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,MatrixType>,UpLo>
+    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
     operator*(const Scalar& s, const SelfAdjointView& mat)
     {
       return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
@@ -162,6 +163,41 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     EIGEN_DEVICE_FUNC
     SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
 
+    /** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
+      *
+      * The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+      * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+      *
+      * If \c TriMode references the same triangular part as \c *this, then this method simply returns a \c TriangularView of the nested expression,
+      * otherwise, the nested expression is first transposed and conjugated (adjoint), thus returning a \c TriangularView of \c MatrixType::AdjointReturnType.
+      *
+      * \sa MatrixBase::triangularView(), class TriangularView
+      */
+    template<unsigned int TriMode>
+    EIGEN_DEVICE_FUNC
+    typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type
+    triangularView() const
+    {
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix);
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1);
+      return typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
+    }
+
+    /** \returns a const expression of the main diagonal of the matrix \c *this
+      *
+      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
+      *
+      * \sa MatrixBase::diagonal(), class Diagonal */
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::ConstDiagonalReturnType diagonal() const
+    {
+      return typename MatrixType::ConstDiagonalReturnType(m_matrix);
+    }
+
 /////////// Cholesky module ///////////
 
     const LLT<PlainObject, UpLo> llt() const;
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index 78fff1549..719ed72a5 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -12,11 +12,13 @@
 
 namespace Eigen { 
 
+// TODO generalize the scalar type of 'other'
+
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
   return derived();
 }
 
@@ -24,7 +26,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
   return derived();
 }
 
@@ -32,7 +34,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
   return derived();
 }
 
@@ -40,7 +42,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
-  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar>());
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
   return derived();
 }
 
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index ba2ee53b8..8fc69c4b8 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -134,10 +134,10 @@ protected:
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
   typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     // FIXME shall we resize dst here?
     src.dec()._solve_impl(src.rhs(), dst);
@@ -146,10 +146,10 @@ struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar
 
 // Specialization for "dst = dec.transpose().solve(rhs)"
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
   typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
   }
@@ -157,10 +157,11 @@ struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal:
 
 // Specialization for "dst = dec.adjoint().solve(rhs)"
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
+                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
   typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
   }
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index a33356423..96d3dde50 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -169,7 +169,7 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
   eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
-  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
+  enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
   typedef typename internal::conditional<copy,
     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
   OtherCopy otherCopy(other);
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 5c5e5028e..e9606ec33 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -367,14 +367,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     template<typename Other>
     EIGEN_DEVICE_FUNC
     TriangularViewType&  operator+=(const DenseBase<Other>& other) {
-      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar>());
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename Other::Scalar>());
       return derived();
     }
     /** \sa MatrixBase::operator-=() */
     template<typename Other>
     EIGEN_DEVICE_FUNC
     TriangularViewType&  operator-=(const DenseBase<Other>& other) {
-      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
       return derived();
     }
     
@@ -552,7 +552,7 @@ template<typename OtherDerived>
 inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
-  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar>());
+  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -794,7 +794,7 @@ void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& sr
   enum {
       unroll = DstXprType::SizeAtCompileTime != Dynamic
             && SrcEvaluatorType::CoeffReadCost < HugeCost
-            && DstXprType::SizeAtCompileTime * SrcEvaluatorType::CoeffReadCost / 2 <= EIGEN_UNROLLING_LIMIT
+            && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
     };
   
   triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
@@ -804,7 +804,7 @@ template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src)
 {
-  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
 }
 
 template<> struct AssignmentKind<TriangularShape,TriangularShape> { typedef Triangular2Triangular Kind; };
@@ -812,8 +812,8 @@ template<> struct AssignmentKind<DenseShape,TriangularShape>      { typedef Tria
 template<> struct AssignmentKind<TriangularShape,DenseShape>      { typedef Dense2Triangular      Kind; };
 
 
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
 {
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
@@ -823,8 +823,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar
   }
 };
 
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
 {
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
@@ -832,8 +832,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
   }
 };
 
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
 {
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
@@ -933,10 +933,10 @@ namespace internal {
   
 // Triangular = Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
   {
     dst.setZero();
     dst._assignProduct(src, 1);
@@ -945,10 +945,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
 
 // Triangular += Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
   {
     dst._assignProduct(src, 1);
   }
@@ -956,10 +956,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
 
 // Triangular -= Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
-struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar>, Dense2Triangular, Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
 {
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
   {
     dst._assignProduct(src, -1);
   }
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 193891189..dd382e990 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -284,6 +284,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
     typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
     typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
+    typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
     typedef Reverse<ExpressionType, Direction> ReverseReturnType;
 
     template<int p> struct LpNormReturnType {
@@ -456,7 +457,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       *
       * \sa DenseBase::reverse() */
     EIGEN_DEVICE_FUNC
-    const ReverseReturnType reverse() const
+    const ConstReverseReturnType reverse() const
+    { return ConstReverseReturnType( _expression() ); }
+
+    /** \returns a writable matrix expression
+      * where each column (or row) is reversed.
+      *
+      * \sa reverse() const */
+    EIGEN_DEVICE_FUNC
+    ReverseReturnType reverse()
     { return ReverseReturnType( _expression() ); }
 
     typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
@@ -540,7 +549,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
     template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-    CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+    CwiseBinaryOp<internal::scalar_sum_op<Scalar,typename OtherDerived::Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
     operator+(const DenseBase<OtherDerived>& other) const
@@ -553,7 +562,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
+    CwiseBinaryOp<internal::scalar_difference_op<Scalar,typename OtherDerived::Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
     operator-(const DenseBase<OtherDerived>& other) const
diff --git a/Eigen/src/Core/arch/AVX/CMakeLists.txt b/Eigen/src/Core/arch/AVX/CMakeLists.txt
deleted file mode 100644
index bdb71ab99..000000000
--- a/Eigen/src/Core/arch/AVX/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_AVX_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 98d8e029f..d21ec39dd 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -266,52 +266,10 @@ pexp<Packet8f>(const Packet8f& _x) {
 }
 
 // Hyperbolic Tangent function.
-// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
-// fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-ptanh<Packet8f>(const Packet8f& _x) {
-  // Clamp the inputs to the range [-9, 9] since anything outside
-  // this range is +/-1.0f in single-precision.
-  _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f);
-  const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x));
-
-  // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f);
-  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f);
-
-  // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f);
-
-  // Since the polynomials are odd/even, we need x^2.
-  const Packet8f x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial p.
-  Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11);
-  p = pmadd(x2, p, p8f_alpha_9);
-  p = pmadd(x2, p, p8f_alpha_7);
-  p = pmadd(x2, p, p8f_alpha_5);
-  p = pmadd(x2, p, p8f_alpha_3);
-  p = pmadd(x2, p, p8f_alpha_1);
-  p = pmul(x, p);
-
-  // Evaluate the denominator polynomial p.
-  Packet8f q = pmadd(x2, p8f_beta_6, p8f_beta_4);
-  q = pmadd(x2, q, p8f_beta_2);
-  q = pmadd(x2, q, p8f_beta_0);
-
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
+ptanh<Packet8f>(const Packet8f& x) {
+  return internal::generic_fast_tanh_float(x);
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index ba2a6c1e1..beb3e577d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -97,6 +97,9 @@ template<> struct packet_traits<double> : default_packet_traits
 };
 #endif
 
+template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+
 /* Proper support for integers is only provided by AVX2. In the meantime, we'll
    use SSE instructions and packets to deal with integers.
 template<> struct packet_traits<int>    : default_packet_traits
@@ -156,7 +159,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
 
 #ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
   // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
   // and gcc stupidly generates a vfmadd132ps instruction,
   // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
@@ -169,7 +172,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
   // see above
   Packet4d res = c;
   __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
diff --git a/Eigen/src/Core/arch/AltiVec/CMakeLists.txt b/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
deleted file mode 100644
index 9f8d2e9c4..000000000
--- a/Eigen/src/Core/arch/AltiVec/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_AltiVec_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_AltiVec_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AltiVec COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 58c296171..45213f791 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,18 +16,20 @@ namespace Eigen {
 namespace internal {
 
 static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef _BIG_ENDIAN
+#ifdef __VSX__
+#if defined(_BIG_ENDIAN)
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #else
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 #endif
+#endif
 
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
   Packet4f  v;
 };
@@ -39,6 +42,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -49,6 +53,9 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
+#ifdef __VSX__
+    HasBlend  = 1,
+#endif
     HasSetLinear = 0
   };
 };
@@ -58,7 +65,6 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
-  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
   if((ptrdiff_t(&from) % 16) == 0)
     res.v = pload<Packet4f>((const float *)&from);
   else
@@ -67,26 +73,32 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
   std::complex<float> EIGEN_ALIGN16 af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
-  return Packet2cf(vec_ld(0, (const float*)af));
+  return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
   std::complex<float> EIGEN_ALIGN16 af[2];
-  vec_st(from.v, 0, (float*)af);
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
 }
 
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@@ -100,30 +112,19 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   v1 = vec_madd(v1, b.v, p4f_ZERO);
   // multiply a_im * b and get the conjugate result
   v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
+  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
   // permute back to a proper order
   v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
   
-  return Packet2cf(vec_add(v1, v2));
+  return Packet2cf(padd<Packet4f>(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from)
-{
-  return pset1<Packet2cf>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr)    { EIGEN_PPC_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -143,23 +144,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
   Packet4f b;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  b = padd(a.v, b);
-  return pfirst(Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
   Packet4f b1, b2;
 #ifdef _BIG_ENDIAN  
-  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
+  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
 #else
-  b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+  b1 = vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
 #endif
-  b2 = (Packet4f) vec_sld(b2, b2, 8);
-  b2 = padd(b1, b2);
+  b2 = vec_sld(b2, b2, 8);
+  b2 = padd<Packet4f>(b1, b2);
 
   return Packet2cf(b2);
 }
@@ -168,10 +169,10 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
 {
   Packet4f b;
   Packet2cf prod;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  prod = pmul(a, Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
 
-  return pfirst(prod);
+  return pfirst<Packet2cf>(prod);
 }
 
 template<int Offset>
@@ -223,12 +224,30 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
   }
 };
 
+template<> struct conj_helper<Packet4f, Packet2cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
+  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cf, Packet4f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
+  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
+};
+
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
-  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
-  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
@@ -243,6 +262,14 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
   kernel.packet[0].v = tmp;
 }
 
+#ifdef __VSX__
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
 //---------- double ----------
 #ifdef __VSX__
 struct Packet1cd
@@ -277,10 +304,10 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 
 template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
 
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
@@ -300,10 +327,10 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c
   to[1*stride] = af[1];
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
@@ -317,23 +344,20 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   v1 = vec_madd(a_re, b.v, p2d_ZERO);
   // multiply a_im * b and get the conjugate result
   v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
-  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
 
-  return Packet1cd(vec_add(v1, v2));
+  return Packet1cd(padd<Packet2d>(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)
-{
-  return pset1<Packet1cd>(*from);
-}
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)  { return pset1<Packet1cd>(*from); }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)    { EIGEN_PPC_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
@@ -345,20 +369,10 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac
 
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
-  return pfirst(a);
-}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
-  return vecs[0];
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
-  return pfirst(a);
-}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
 template<int Offset>
 struct palign_impl<Offset,Packet1cd>
@@ -402,13 +416,30 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
     return pconj(internal::pmul(a, b));
   }
 };
+template<> struct conj_helper<Packet2d, Packet1cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
+  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet1cd, Packet2d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
+  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
+};
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for AltiVec
   Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
-  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
-  return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 9e37e93f8..5511245dd 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -3,6 +3,7 @@
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -19,38 +20,79 @@ namespace Eigen {
 
 namespace internal {
 
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+/* natural logarithm computed for 4 simultaneous float
+  return NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+#ifdef __VSX__
+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+static Packet2l p2l_1023 = { 1023, 1023 };
+static Packet2ul p2ul_52 = { 52, 52 };
+
+#endif
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog<Packet4f>(const Packet4f& _x)
 {
   Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  /* the smallest non denormalized float number */
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
-  
-  /* natural logarithm computed for 4 simultaneous float
-    return NaN for x <= 0
-  */
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
 
   Packet4i emm0;
 
@@ -112,36 +154,17 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
   Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
 
   Packet4f tmp, fx;
   Packet4i emm0;
 
   // clamp x
-  x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
 
-  /* express exp(x) as exp(g + n*log(2)) */
+  // express exp(x) as exp(g + n*log(2))
   fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
 
-  fx = vec_floor(fx);
+  fx = pfloor(fx);
 
   tmp = pmul(fx, p4f_cephes_exp_C1);
   Packet4f z = pmul(fx, p4f_cephes_exp_C2);
@@ -171,14 +194,44 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
                  isnumber_mask);
 }
 
+#if !EIGEN_COMP_CLANG  // value macro (0 unless clang): test its value, #ifndef would never be true
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
 #ifdef __VSX__
+#if !EIGEN_COMP_CLANG  // value macro (0 unless clang): test its value, #ifndef would never be true
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_sqrt(x);
+}
+
 // VSX support varies between different compilers and even different
 // versions of the same compiler.  For gcc version >= 4.9.3, we can use
 // vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
 // a slow version that works with older compilers. 
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
 static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 0) || \
-    (EIGEN_GNUC_AT(4, 9) && __GNUC_PATCHLEVEL__ >= 3)
+#if (EIGEN_GNUC_AT_LEAST(5, 4) && !EIGEN_GNUC_AT_LEAST(6, 0)) || EIGEN_GNUC_AT_LEAST(6, 2) || \
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
   return vec_cts(x, 0);    // TODO: check clang version.
 #else
   double tmp[2];
@@ -194,36 +247,16 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
   Packet2d x = _x;
 
-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
   Packet2d tmp, fx;
   Packet2l emm0;
 
   // clamp x
   x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
 
-  fx = vec_floor(fx);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
+
+  fx = pfloor(fx);
 
   tmp = pmul(fx, p2d_cephes_exp_C1);
   Packet2d z = pmul(fx, p2d_cephes_exp_C2);
@@ -249,9 +282,6 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
   emm0 = ConvertToPacket2l(fx);
 
 #ifdef __POWER8_VECTOR__ 
-  static const Packet2l p2l_1023 = { 1023, 1023 };
-  static const Packet2ul p2ul_52 = { 52, 52 };
-
   emm0 = vec_add(emm0, p2l_1023);
   emm0 = vec_sl(emm0, p2ul_52);
 #else
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 0dbbc2e42..cbfef3503 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2014 Konstantinos Margaritis <markos@freevec.org>
+// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -42,7 +42,7 @@ typedef __vector unsigned char  Packet16uc;
 // and it doesn't really work to declare them global, so we define macros instead
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
   Packet4i p4i_##NAME = vec_splat_s32(X)
@@ -69,13 +69,13 @@ typedef __vector unsigned char  Packet16uc;
 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
-#ifndef __VSX__
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
-#endif
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+#ifndef __VSX__
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+#endif
 
 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
@@ -95,8 +95,10 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
 // Handle endianness properly while loading constants
 // Define global static constants:
 #ifdef _BIG_ENDIAN
-static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
+#ifdef __VSX__
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif
 static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
@@ -110,8 +112,8 @@ static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i
 
 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
 
 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
 
@@ -121,6 +123,12 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8
 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 #endif // _BIG_ENDIAN
 
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
+
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
@@ -129,15 +137,35 @@ template<> struct packet_traits<float>  : default_packet_traits
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
-    HasHalfPacket=0,
+    HasHalfPacket = 1,
 
-    // FIXME check the Has*
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
     HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
     HasSin  = 0,
     HasCos  = 0,
-    HasLog  = 1,
+    HasLog  = 0,
     HasExp  = 1,
-    HasSqrt = 0
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
   };
 };
 template<> struct packet_traits<int>    : default_packet_traits
@@ -145,10 +173,16 @@ template<> struct packet_traits<int>    : default_packet_traits
   typedef Packet4i type;
   typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
   };
 };
 
@@ -200,41 +234,56 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
   return s;
 }
-/*
-inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
-{
-  union {
-    Packet4bi v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
-}*/
-
 
 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from;
-  Packet4f vc = pload<Packet4f>(af);
-  vc = vec_splat(vc, 0);
-  return vc;
+  Packet4f v = {from, from, from, from};
+  return v;
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from;
-  Packet4i vc = pload<Packet4i>(ai);
-  vc = vec_splat(vc, 0);
-  return vc;
+  Packet4i v = {from, from, from, from};
+  return v;
 }
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4f>(const float *a,
@@ -294,58 +343,24 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
   to[3*stride] = ai[3];
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
 
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
-/* Commented out: it's actually slower than processing it scalar
- *
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
-  //Set up constants, variables
-  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
 
-  // Get the absolute values
-  a1  = vec_abs(a);
-  b1  = vec_abs(b);
-
-  // Get the signs using xor
-  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
-
-  // Do the multiplication for the asbolute values.
-  bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
-  low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
-  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
-  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
-  prod = vec_add( low_prod, high_prod );
-
-  // NOR the product and select only the negative elements according to the sign mask
-  prod_ = vec_nor(prod, prod);
-  prod_ = vec_sel(p4i_ZERO, prod_, sgn);
-
-  // Add 1 to the result to get the negative numbers
-  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
-  prod_ = vec_add(prod_, v1sel);
-
-  // Merge the results back to the final vector.
-  prod = vec_sel(prod, prod_, sgn);
-
-  return prod;
-}
-*/
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
 #ifndef __VSX__  // VSX actually provides a div instruction
@@ -370,8 +385,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
+// for some weird reasons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
@@ -391,6 +406,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
 
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
+
 #ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
@@ -418,12 +437,12 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_DEBUG_UNALIGNED_LOAD
   return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
 }
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_DEBUG_UNALIGNED_LOAD
   return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
 }
 #endif
@@ -494,16 +513,19 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f&
 }
 #endif
 
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
@@ -511,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f b, sum;
-  b   = (Packet4f) vec_sld(a, a, 8);
-  sum = vec_add(a, b);
-  b   = (Packet4f) vec_sld(sum, sum, 4);
-  sum = vec_add(sum, b);
+  b   = vec_sld(a, a, 8);
+  sum = a + b;
+  b   = vec_sld(sum, sum, 4);
+  sum += b;
   return pfirst(sum);
 }
 
@@ -537,11 +559,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 
   // Now do the summation:
   // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
   // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
   // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
 
   return sum[0];
 }
@@ -577,11 +599,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 
   // Now do the summation:
   // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
   // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
+  sum[1] = sum[2] + sum[3];
   // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+  sum[0] = sum[0] + sum[1];
 
   return sum[0];
 }
@@ -591,8 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   Packet4f prod;
-  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
-  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
 }
 
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -716,33 +738,52 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
 
 //---------- double ----------
 #ifdef __VSX__
 typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;
-
-static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO;
-static Packet2d p2d_ONE = { 1.0, 1.0 }; 
-static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO;
-static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
-
-#ifdef _BIG_ENDIAN
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
+#if EIGEN_COMP_CLANG
+typedef Packet2ul                    Packet2bl;
 #else
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
+typedef __vector __bool long         Packet2bl;
 #endif
 
-static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
+static Packet2l  p2l_ONE  = { 1, 1 };
+static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d  p2d_ZERO_ = { -0.0, -0.0 };
+
+#ifdef _BIG_ENDIAN
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+#else
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+#endif
+
+template<int index> Packet2d vec_splat_dbl(Packet2d& a);
+
+template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
 {
-  switch (index) {
-  case 0:
-    return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
-  case 1:
-    return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
-  }
-  return a;
+  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
 }
 
 template<> struct packet_traits<double> : default_packet_traits
@@ -753,16 +794,41 @@ template<> struct packet_traits<double> : default_packet_traits
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
-    HasHalfPacket = 0,
+    HasHalfPacket = 1,
 
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
     HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
     HasExp  = 1,
-    HasSqrt = 0
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
   };
 };
 
 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
 
+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  union {
+    Packet2l   v;
+    int64_t n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
 
 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 {
@@ -776,28 +842,43 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 }
 
 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
-  double EIGEN_ALIGN16 af[2];
-  af[0] = from;
-  Packet2d vc = pload<Packet2d>(af);
-  vc = vec_splat_dbl(vc, 0);
-  return vc;
+  Packet2d v = {from, from};
+  return v;
 }
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                       Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
   a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl(a1, 0);
-  a1 = vec_splat_dbl(a1, 1);
+  a0 = vec_splat_dbl<0>(a1);
+  a1 = vec_splat_dbl<1>(a1);
   a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl(a3, 0);
-  a3 = vec_splat_dbl(a3, 1);
+  a2 = vec_splat_dbl<0>(a3);
+  a3 = vec_splat_dbl<1>(a3);
 }
+
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
   double EIGEN_ALIGN16 af[2];
@@ -812,13 +893,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
   to[0*stride] = af[0];
   to[1*stride] = af[1];
 }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }
 
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
 
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
 
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
 
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
 
@@ -840,17 +922,22 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const
 
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
 
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
 }
+
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
   Packet2d p;
   if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
   else                             p = ploadu<Packet2d>(from);
-  return vec_perm(p, p, p16uc_PSET64_HI);
+  return vec_splat_dbl<0>(p);
 }
 
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
@@ -859,32 +946,34 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d&
   vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
 
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
 
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
   Packet2d b, sum;
-  b   = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
-  sum = vec_add(a, b);
-  return pfirst(sum);
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+  sum = a + b;
+  return pfirst<Packet2d>(sum);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
   Packet2d v[2], sum;
-  v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
-  v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));
+  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
+  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
  
 #ifdef _BIG_ENDIAN
- sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8);
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
-  sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8);
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
 #endif
 
   return sum;
@@ -893,19 +982,19 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 // mul
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
 // min
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
 // max
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
 template<int Offset>
@@ -915,9 +1004,9 @@ struct palign_impl<Offset,Packet2d>
   {
     if (Offset == 1)
 #ifdef _BIG_ENDIAN
-      first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
 #else
-      first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
 #endif
   }
 };
@@ -931,6 +1020,11 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
   kernel.packet[1] = t1;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
 #endif // __VSX__
 } // end namespace internal
 
diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt
deleted file mode 100644
index da9793eca..000000000
--- a/Eigen/src/Core/arch/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-ADD_SUBDIRECTORY(AltiVec)
-ADD_SUBDIRECTORY(AVX)
-ADD_SUBDIRECTORY(AVX512)
-ADD_SUBDIRECTORY(CUDA)
-ADD_SUBDIRECTORY(Default)
-ADD_SUBDIRECTORY(NEON)
-ADD_SUBDIRECTORY(SSE)
-
-
-
diff --git a/Eigen/src/Core/arch/CUDA/CMakeLists.txt b/Eigen/src/Core/arch/CUDA/CMakeLists.txt
deleted file mode 100644
index 7ba28da7c..000000000
--- a/Eigen/src/Core/arch/CUDA/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_CUDA_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_CUDA_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/CUDA COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
new file mode 100644
index 000000000..9c2536509
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+
+// Sum
+template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Difference
+template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Product
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T> >::HasMul  // "> >" (not ">>") so C++03 parsers accept it, like the rest of this file
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,   // real part of a*b
+                           a_real * b_imag + a_imag * b_real);  // imaginary part of a*b
+  }
+};
+
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Quotient
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T> >::HasDiv  // "> >" (not ">>") so C++03 parsers accept it, like the rest of this file
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);  // reciprocal of |b|^2, shared by both components
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,   // real part of a * conj(b) / |b|^2
+                           (a_imag * b_real - a_real * b_imag) * norm);  // imaginary part of a * conj(b) / |b|^2
+  }
+};
+
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 060c2c805..52892db38 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -1,11 +1,3 @@
-// Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// class Eigen::half (inheriting from CUDA's __half struct) with
-// operator overloads such that it behaves basically as an arithmetic
-// type. It will be quite slow on CPUs (so it is recommended to stay
-// in fp32 for CPUs, except for simple parameter conversions, I/O
-// to disk and the likes), but fast on GPUs.
-//
-//
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
@@ -32,6 +24,15 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting from CUDA's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the like), but fast on GPUs.
+
+
 #ifndef EIGEN_HALF_CUDA_H
 #define EIGEN_HALF_CUDA_H
 
@@ -42,92 +43,93 @@
 #endif
 
 
+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
 #if !defined(EIGEN_HAS_CUDA_FP16)
 
 // Make our own __half definition that is similar to CUDA's.
 struct __half {
-  __half() {}
-  explicit __half(unsigned short raw) : x(raw) {}
+  EIGEN_DEVICE_FUNC __half() {}
+  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
   unsigned short x;
 };
 
 #endif
 
-namespace Eigen {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
 
-namespace internal {
+struct half_base : public __half {
+  EIGEN_DEVICE_FUNC half_base() {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+};
 
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
-
-} // end namespace internal
+} // namespace half_impl
 
 // Class definition.
-struct half : public __half {
+struct half : public half_impl::half_base {
+  #if !defined(EIGEN_HAS_CUDA_FP16)
+    typedef half_impl::__half __half;
+  #endif
+
   EIGEN_DEVICE_FUNC half() {}
 
-  EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
 
   explicit EIGEN_DEVICE_FUNC half(bool b)
-      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
-  explicit EIGEN_DEVICE_FUNC half(unsigned int ui)
-      : __half(internal::float_to_half_rtne(static_cast<float>(ui))) {}
-  explicit EIGEN_DEVICE_FUNC half(int i)
-      : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
-  explicit EIGEN_DEVICE_FUNC half(unsigned long ul)
-      : __half(internal::float_to_half_rtne(static_cast<float>(ul))) {}
-  explicit EIGEN_DEVICE_FUNC half(long l)
-      : __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
-  explicit EIGEN_DEVICE_FUNC half(long long ll)
-      : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
-  explicit EIGEN_DEVICE_FUNC half(unsigned long long ull)
-      : __half(internal::float_to_half_rtne(static_cast<float>(ull))) {}
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template<class T>
+  explicit EIGEN_DEVICE_FUNC half(const T& val)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
   explicit EIGEN_DEVICE_FUNC half(float f)
-      : __half(internal::float_to_half_rtne(f)) {}
-  explicit EIGEN_DEVICE_FUNC half(double d)
-      : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
 
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
     // +0.0 and -0.0 become false, everything else becomes true.
     return (x & 0x7fff) != 0;
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
-    return static_cast<signed char>(internal::half_to_float(*this));
+    return static_cast<signed char>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
-    return static_cast<unsigned char>(internal::half_to_float(*this));
+    return static_cast<unsigned char>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
-    return static_cast<short>(internal::half_to_float(*this));
+    return static_cast<short>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
-    return static_cast<unsigned short>(internal::half_to_float(*this));
+    return static_cast<unsigned short>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
-    return static_cast<int>(internal::half_to_float(*this));
+    return static_cast<int>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
-    return static_cast<unsigned int>(internal::half_to_float(*this));
+    return static_cast<unsigned int>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
-    return static_cast<long>(internal::half_to_float(*this));
+    return static_cast<long>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
-    return static_cast<unsigned long>(internal::half_to_float(*this));
+    return static_cast<unsigned long>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
-    return static_cast<long long>(internal::half_to_float(*this));
+    return static_cast<long long>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
-    return static_cast<unsigned long long>(internal::half_to_float(*this));
+    return static_cast<unsigned long long>(half_impl::half_to_float(*this));
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
-    return internal::half_to_float(*this);
+    return half_impl::half_to_float(*this);
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return static_cast<double>(internal::half_to_float(*this));
+    return static_cast<double>(half_impl::half_to_float(*this));
   }
 
   EIGEN_DEVICE_FUNC half& operator=(const half& other) {
@@ -136,6 +138,8 @@ struct half : public __half {
   }
 };
 
+namespace half_impl {
+
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 
 // Intrinsics for native fp16 support. Note that on current hardware,
@@ -200,55 +204,55 @@ __device__ bool operator >= (const half& a, const half& b) {
 // Definitions for CPUs and older CUDA, mostly working through conversion
 // to/from fp32.
 
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
   return half(float(a) + float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
   return half(float(a) * float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
   return half(float(a) - float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
   return half(float(a) / float(b));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
   half result;
   result.x = a.x ^ 0x8000;
   return result;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
   a = half(float(a) + float(b));
   return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
   a = half(float(a) * float(b));
   return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
   a = half(float(a) - float(b));
   return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
   a = half(float(a) / float(b));
   return a;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
   return float(a) == float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
   return float(a) != float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
   return float(a) < float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
   return float(a) <= float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
   return float(a) > float(b);
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
   return float(a) >= float(b);
 }
 
@@ -256,8 +260,8 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, co
 
 // Division by an index. Do it in full float precision to avoid accuracy
 // issues in converting the denominator to half.
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
-  return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return half(static_cast<float>(a) / static_cast<float>(b));
 }
 
 // Conversion routines, including fallbacks for the host or older CUDA.
@@ -265,9 +269,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Ind
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.
 
-namespace internal {
-
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
   __half h;
   h.x = x;
   return h;
@@ -278,7 +280,7 @@ union FP32 {
   float f;
 };
 
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __float2half(ff);
 
@@ -333,7 +335,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff)
 #endif
 }
 
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __half2float(h);
 
@@ -362,92 +364,69 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
 #endif
 }
 
-} // end namespace internal
+// --- standard functions ---
 
-// Traits.
-
-namespace internal {
-
-template<> struct is_arithmetic<half> { enum { value = true }; };
-
-} // end namespace internal
-
-template<> struct NumTraits<Eigen::half>
-    : GenericNumTraits<Eigen::half>
-{
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
-    return internal::raw_uint16_to_half(0x0800);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-3f); }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
-    return internal::raw_uint16_to_half(0x7bff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
-    return internal::raw_uint16_to_half(0xfbff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
-    return internal::raw_uint16_to_half(0x7c00);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
-    return internal::raw_uint16_to_half(0x7c01);
-  }
-};
-
-// Infinity/NaN checks.
-
-namespace numext {
-
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
   return (a.x & 0x7fff) == 0x7c00;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hisnan(a);
 #else
   return (a.x & 0x7fff) > 0x7c00;
 #endif
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) {
-  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
 }
 
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
-  Eigen::half result;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+  half result;
   result.x = a.x & 0x7FFF;
   return result;
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) {
-  return Eigen::half(::expf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+  return half(::expf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
-  return Eigen::half(::logf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return Eigen::half(::hlog(a));
+#else
+  return half(::logf(float(a)));
+#endif
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) {
-  return Eigen::half(::sqrtf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
+  return half(numext::log1p(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) {
-  return Eigen::half(::powf(float(a), float(b)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
+  return half(::log10f(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) {
-  return Eigen::half(::sinf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+  return half(::sqrtf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) {
-  return Eigen::half(::cosf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+  return half(::powf(float(a), float(b)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) {
-  return Eigen::half(::tanf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) {
-  return Eigen::half(::tanhf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
-  return Eigen::half(::floorf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) {
-  return Eigen::half(::ceilf(float(a)));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+  return half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+  return half(::ceilf(float(a)));
 }
 
-template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(b, a) ? b : a;
 #else
@@ -456,7 +435,7 @@ template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::
   return f2 < f1 ? b : a;
 #endif
 }
-template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b) ? b : a;
 #else
@@ -466,78 +445,89 @@ template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::
 #endif
 }
 
-#ifdef EIGEN_HAS_C99_MATH
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
-  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
-}
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
-}
-#endif
-} // end namespace numext
+
+} // end namespace half_impl
+
+// Note: Eigen::half is declared directly in namespace Eigen (not in half_impl),
+// so no using-declaration is needed here to make it visible as Eigen::half.
+
+namespace internal {
+
+template<>
+struct random_default_impl<half, false, false>
+{
+  static inline half run(const half& x, const half& y)
+  {
+    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run()
+  {
+    return run(half(-1.f), half(1.f));
+  }
+};
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);  // one below the infinity pattern 0x7c00
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);  // highest() with the sign bit (0x8000) set
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);  // the pattern half_impl::isinf tests for
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7e00);  // top mantissa bit set => quiet NaN (0x7c01 is a signaling encoding)
+  }
+};
 
 } // end namespace Eigen
 
-// Standard mathematical functions and trancendentals.
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+// C-like standard mathematical functions and transcendentals.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
   Eigen::half result;
   result.x = a.x & 0x7FFF;
   return result;
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
   return Eigen::half(::expf(float(a)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return Eigen::half(::hlog(a));
+#else
   return Eigen::half(::logf(float(a)));
+#endif
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
   return Eigen::half(::sqrtf(float(a)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
   return Eigen::half(::powf(float(a), float(b)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
   return Eigen::half(::floorf(float(a)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
   return Eigen::half(::ceilf(float(a)));
 }
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) {
-  return (Eigen::numext::isnan)(a);
-}
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) {
-  return (Eigen::numext::isinf)(a);
-}
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a) {
-  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
-}
-
 
 namespace std {
 
-EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) {
-  os << static_cast<float>(v);
-  return os;
-}
-
 #if __cplusplus > 199711L
 template <>
 struct hash<Eigen::half> {
@@ -551,19 +541,45 @@ struct hash<Eigen::half> {
 
 
 // Add the missing shfl_xor intrinsic
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
 }
 #endif
 
 // ldg() has an overload for __half, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
-static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
-  return Eigen::internal::raw_uint16_to_half(
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::half_impl::raw_uint16_to_half(
       __ldg(reinterpret_cast<const unsigned short*>(ptr)));
 }
 #endif
 
 
+#if defined(__CUDA_ARCH__)
+namespace Eigen {
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::half& h) {
+  return (half_impl::isnan)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::half& h) {
+  return (half_impl::isinf)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::half& h) {
+  return (half_impl::isfinite)(h);
+}
+
+} // namespace numext
+}  // namespace Eigen
+#endif
+
 #endif // EIGEN_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
index 317499b29..0348b41db 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -27,9 +27,22 @@ float4 plog<float4>(const float4& a)
 template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 plog<double2>(const double2& a)
 {
+  using ::log;
   return make_double2(log(a.x), log(a.y));
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog1p<float4>(const float4& a)
+{
+  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog1p<double2>(const double2& a)
+{
+  return make_double2(log1p(a.x), log1p(a.y));
+}
+
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pexp<float4>(const float4& a)
 {
@@ -39,6 +52,7 @@ float4 pexp<float4>(const float4& a)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pexp<double2>(const double2& a)
 {
+  using ::exp;
   return make_double2(exp(a.x), exp(a.y));
 }
 
@@ -51,6 +65,7 @@ float4 psqrt<float4>(const float4& a)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 psqrt<double2>(const double2& a)
 {
+  using ::sqrt;
   return make_double2(sqrt(a.x), sqrt(a.y));
 }
 
@@ -66,120 +81,6 @@ double2 prsqrt<double2>(const double2& a)
   return make_double2(rsqrt(a.x), rsqrt(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plgamma<float4>(const float4& a)
-{
-  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plgamma<double2>(const double2& a)
-{
-  return make_double2(lgamma(a.x), lgamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pdigamma<float4>(const float4& a)
-{
-  using numext::digamma;
-  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pdigamma<double2>(const double2& a)
-{
-  using numext::digamma;
-  return make_double2(digamma(a.x), digamma(a.y));
-}
-    
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pzeta<float4>(const float4& x, const float4& q)
-{
-    using numext::zeta;
-    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pzeta<double2>(const double2& x, const double2& q)
-{
-    using numext::zeta;
-    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 ppolygamma<float4>(const float4& n, const float4& x)
-{
-    using numext::polygamma;
-    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 ppolygamma<double2>(const double2& n, const double2& x)
-{
-    using numext::polygamma;
-    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perf<float4>(const float4& a)
-{
-  return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perf<double2>(const double2& a)
-{
-  return make_double2(erf(a.x), erf(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perfc<float4>(const float4& a)
-{
-  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perfc<double2>(const double2& a)
-{
-  return make_double2(erfc(a.x), erfc(a.y));
-}
-
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigamma<float4>(const float4& a, const float4& x)
-{
-  using numext::igamma;
-  return make_float4(
-      igamma(a.x, x.x),
-      igamma(a.y, x.y),
-      igamma(a.z, x.z),
-      igamma(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigamma<double2>(const double2& a, const double2& x)
-{
-  using numext::igamma;
-  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigammac<float4>(const float4& a, const float4& x)
-{
-  using numext::igammac;
-  return make_float4(
-      igammac(a.x, x.x),
-      igammac(a.y, x.y),
-      igammac(a.z, x.z),
-      igammac(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigammac<double2>(const double2& a, const double2& x)
-{
-  using numext::igammac;
-  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
-}
 
 #endif
 
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 932df1092..ad66399e0 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -44,8 +44,9 @@ template<> struct packet_traits<float> : default_packet_traits
     HasPolygamma = 1,
     HasErf = 1,
     HasErfc = 1,
-    HasIgamma = 1,
+    HasIGamma = 1,
     HasIGammac = 1,
+    HasBetaInc = 1,
 
     HasBlend = 0,
   };
@@ -68,10 +69,13 @@ template<> struct packet_traits<double> : default_packet_traits
     HasRsqrt = 1,
     HasLGamma = 1,
     HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
     HasErf = 1,
     HasErfc = 1,
     HasIGamma = 1,
     HasIGammac = 1,
+    HasBetaInc = 1,
 
     HasBlend = 0,
   };
@@ -278,35 +282,6 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a)
   return a.x * a.y;
 }
 
-template<size_t offset>
-struct protate_impl<offset, float4>
-{
-  static float4 run(const float4& a) {
-    if (offset == 0) {
-      return make_float4(a.x, a.y, a.z, a.w);
-    }
-    if (offset == 1) {
-      return make_float4(a.w, a.x, a.y, a.z);
-    }
-    if (offset == 2) {
-      return make_float4(a.z, a.w, a.x, a.y);
-    }
-    return make_float4(a.y, a.z, a.w, a.x);
-  }
-};
-
-template<size_t offset>
-struct protate_impl<offset, double2>
-{
-  static double2 run(const double2& a) {
-    if (offset == 0) {
-      return make_double2(a.x, a.y);
-    }
-    return make_double2(a.y, a.x);
-  }
-};
-
-
 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 61d532e4d..82dfc12c9 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -10,22 +10,16 @@
 #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
 #define EIGEN_PACKET_MATH_HALF_CUDA_H
 
-#if defined(EIGEN_HAS_CUDA_FP16)
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-
-// Most of the following operations require arch >= 5.3
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 
 namespace Eigen {
 namespace internal {
 
+// Most of the following operations require arch >= 3.0
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+
 template<> struct is_arithmetic<half2> { enum { value = true }; };
 
-template<> struct packet_traits<half> : default_packet_traits
+template<> struct packet_traits<Eigen::half> : default_packet_traits
 {
   typedef half2 type;
   typedef half2 half;
@@ -34,105 +28,172 @@ template<> struct packet_traits<half> : default_packet_traits
     AlignedOnScalar = 1,
     size=2,
     HasHalfPacket = 0,
-    HasDiv  = 1
+    HasAdd    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasLog    = 1,
+    HasLog1p  = 1
   };
 };
 
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
 
-template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
   return *reinterpret_cast<const half2*>(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[1]);
 }
 
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half*  from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {  // both lanes receive from[0]
   return __halves2half2(from[0], from[0]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
-  return __ldg((const half2*)from);
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+#if __CUDA_ARCH__ >= 350
+   return __ldg((const half2*)from);
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
-  return __halves2half2(__ldg(from+0), __ldg(from+1));
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+#if __CUDA_ARCH__ >= 350
+   return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
   return __halves2half2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
   to[stride*0] = __low2half(from);
   to[stride*1] = __high2half(from);
 }
 
-template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
   return __low2half(a);
 }
 
-template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
   result.x = a.x & 0x7FFF7FFF;
   return result;
 }
 
 
-EIGEN_DEVICE_FUNC inline void
+__device__ EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
-  half a1 = __low2half(kernel.packet[0]);
-  half a2 = __high2half(kernel.packet[0]);
-  half b1 = __low2half(kernel.packet[1]);
-  half b2 = __high2half(kernel.packet[1]);
+  __half a1 = __low2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
+  __half b1 = __low2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
   kernel.packet[0] = __halves2half2(a1, b1);
   kernel.packet[1] = __halves2half2(a2, b2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+#if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return __halves2half2(a, __float2half(f));
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
 }
 
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if __CUDA_ARCH__ >= 530
    return __hfma2(a, b, c);
- }
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c1;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -142,51 +203,529 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
   float b2 = __high2float(b);
-  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
   float b2 = __high2float(b);
-  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
-  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
-  half first = __low2half(a);
-  half second = __high2half(a);
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  __half first = __low2half(a);
+  __half second = __high2half(a);
   return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 > a2 ? __low2half(a) : __high2half(a);
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
-  half first = __low2half(a);
-  half second = __high2half(a);
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  __half first = __low2half(a);
+  __half second = __high2half(a);
   return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 < a2 ? __low2half(a) : __high2half(a);
+#endif
 }
 
-template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
+#endif
 }
 
-} // end namespace internal
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = log1pf(a1);
+  float r2 = log1pf(a2);
+  return __floats2half2_rn(r1, r2);
+}
 
-} // end namespace Eigen
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+// Native path: plog/pexp/psqrt/prsqrt map directly onto the h2log/h2exp/h2sqrt/h2rsqrt half2 intrinsics.
+template<>  __device__ EIGEN_STRONG_INLINE
+half2 plog<half2>(const half2& a) {
+  return h2log(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 pexp<half2>(const half2& a) {
+  return h2exp(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 psqrt<half2>(const half2& a) {
+  return h2sqrt(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 prsqrt<half2>(const half2& a) {
+  return h2rsqrt(a);
+}
+
+#else
+// Fallback path: convert each lane to float, apply the float routine, and repack with __floats2half2_rn.
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = logf(a1);
+  float r2 = logf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expf(a1);
+  float r2 = expf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = sqrtf(a1);
+  float r2 = sqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = rsqrtf(a1);
+  float r2 = rsqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
 
 #endif
+
+#elif defined EIGEN_VECTORIZE_AVX
+// Packet8h wraps a single __m128i; packet_traits<Eigen::half> below declares it as a packet of size 8.
+typedef struct {
+  __m128i x;
+} Packet8h;
+
+
+template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+// Every Has* flag is 0 although padd/pmul are specialized below. NOTE(review): presumably intentional; confirm.
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8h type;
+  // There is no half-size packet for Packet8h.
+  typedef Packet8h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
+// Broadcasts the raw 16-bit pattern from.x into every 16-bit lane of the packet.
+template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  Packet8h result;
+  result.x = _mm_set1_epi16(from.x);
+  return result;
+}
+// Extracts 16-bit lane 0 and rebuilds an Eigen::half from that raw bit pattern.
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
+}
+// Loads 128 bits (eight halves) with _mm_load_si128, the aligned load; see ploadu for the unaligned form.
+template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  Packet8h result;
+  result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+  return result;
+}
+// Loads 128 bits (eight halves) with _mm_loadu_si128, the unaligned load.
+template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  Packet8h result;
+  result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+  return result;
+}
+// Stores the whole packet to `to` with _mm_store_si128, the aligned store.
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+// Stores the whole packet to `to` with _mm_storeu_si128, the unaligned store.
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+// Reads from[0] and from[1]; the low four lanes get from[0] and the high four lanes get from[1].
+template<> EIGEN_STRONG_INLINE Packet8h
+ploadquad<Packet8h>(const Eigen::half* from) {
+  Packet8h result;
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
+  return result;
+}
+// Widens eight halves to floats: _mm256_cvtph_ps if EIGEN_HAS_FP16_C is defined, else per-lane scalar conversion.
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtph_ps(a.x);
+#else
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+
+  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
 #endif
+}
+// Narrows eight floats to halves: _mm256_cvtps_ph if EIGEN_HAS_FP16_C is defined, else Eigen::half(float) per lane.
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  Packet8h result;
+  result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+  return result;
+#else
+  EIGEN_ALIGN32 float aux[8];
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+  Eigen::half h4(aux[4]);
+  Eigen::half h5(aux[5]);
+  Eigen::half h6(aux[6]);
+  Eigen::half h7(aux[7]);
+
+  Packet8h result;
+  result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+  return result;
 #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+// Adds in single precision: widen both operands with half2float, add as Packet8f, narrow with float2half.
+template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = padd(af, bf);
+  return float2half(rf);
+}
+// Multiplies in single precision: widen both operands with half2float, multiply as Packet8f, narrow with float2half.
+template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pmul(af, bf);
+  return float2half(rf);
+}
+// Gathers eight halves spaced `stride` elements apart; from[0] lands in lane 0 and from[7*stride] in lane 7.
+template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
+{
+  Packet8h result;
+  result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
+// Spills the packet to an aligned temporary, then copies the raw bits of lane i to to[i*stride].
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
+{
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride*0].x = aux[0].x;
+  to[stride*1].x = aux[1].x;
+  to[stride*2].x = aux[2].x;
+  to[stride*3].x = aux[3].x;
+  to[stride*4].x = aux[4].x;
+  to[stride*5].x = aux[5].x;
+  to[stride*6].x = aux[6].x;
+  to[stride*7].x = aux[7].x;
+}
+// In-place 8x8 transpose of 16-bit lanes via three rounds of unpack (16-, 32-, then 64-bit granularity).
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,8>& kernel) {
+  __m128i a = kernel.packet[0].x;
+  __m128i b = kernel.packet[1].x;
+  __m128i c = kernel.packet[2].x;
+  __m128i d = kernel.packet[3].x;
+  __m128i e = kernel.packet[4].x;
+  __m128i f = kernel.packet[5].x;
+  __m128i g = kernel.packet[6].x;
+  __m128i h = kernel.packet[7].x;
+  // Round 1: interleave 16-bit lanes of adjacent rows (names list which lanes of which rows are held).
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+  // Round 2: interleave 32-bit pairs so each register holds two lane indices across four rows.
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+  // Round 3: combine 64-bit halves; each result holds one lane index taken from all eight rows.
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+  // Write the transposed rows back into the block.
+  kernel.packet[0].x = a0b0c0d0e0f0g0h0;
+  kernel.packet[1].x = a1b1c1d1e1f1g1h1;
+  kernel.packet[2].x = a2b2c2d2e2f2g2h2;
+  kernel.packet[3].x = a3b3c3d3e3f3g3h3;
+  kernel.packet[4].x = a4b4c4d4e4f4g4h4;
+  kernel.packet[5].x = a5b5c5d5e5f5g5h5;
+  kernel.packet[6].x = a6b6c6d6e6f6g6h6;
+  kernel.packet[7].x = a7b7c7d7e7f7g7h7;
+}
+// Rearranges four packets through scratch arrays: output i gets lane 2i of each input, then lane 2i+1 of each input.
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+  // out[i][0..3] = in[0..3][2*i]; out[i][4..7] = in[0..3][2*i+1].
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+typedef struct {
+  __m64 x;
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h type;
+  // There is no half-size packet for Packet4h.
+  typedef Packet4h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
+  Packet4h result;
+  result.x = _mm_set1_pi16(from.x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha + hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha * hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {
+  return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
+{
+  Packet4h result;
+  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
+{
+  __int64_t a = _mm_cvtm64_si64(from.x);
+  to[stride*0].x = static_cast<unsigned short>(a);
+  to[stride*1].x = static_cast<unsigned short>(a >> 16);
+  to[stride*2].x = static_cast<unsigned short>(a >> 32);
+  to[stride*3].x = static_cast<unsigned short>(a >> 48);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {
+  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
+  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
+
+  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
+  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
+  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
+  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
+}
+
+#endif
+
+} // end namespace internal
+} // end namespace Eigen
+
 #endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index 396b38eaf..31f1c523a 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -14,50 +14,48 @@ namespace Eigen {
 
 namespace internal {
 
-#if defined(EIGEN_HAS_CUDA_FP16)
-
 template<>
-struct scalar_cast_op<float, half> {
+struct scalar_cast_op<float, Eigen::half> {  // float -> Eigen::half conversion functor; no longer wrapped in an EIGEN_HAS_CUDA_FP16 guard
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const {
-    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
       return __float2half(a);
     #else
-      return half(a);
+      return Eigen::half(a);  // taken whenever the CUDA FP16 condition above is false
     #endif
   }
 };
 
 template<>
-struct functor_traits<scalar_cast_op<float, half> >
+struct functor_traits<scalar_cast_op<float, Eigen::half> >  // cost set to one float addition; packet access disabled
 { enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
 
 
 template<>
-struct scalar_cast_op<int, half> {
+struct scalar_cast_op<int, Eigen::half> {  // int -> Eigen::half conversion functor; both paths go through float first
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const {
-    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
       return __float2half(static_cast<float>(a));
     #else
-      return half(static_cast<float>(a));
+      return Eigen::half(static_cast<float>(a));  // taken whenever the CUDA FP16 condition above is false
     #endif
   }
 };
 
 template<>
-struct functor_traits<scalar_cast_op<int, half> >
+struct functor_traits<scalar_cast_op<int, Eigen::half> >  // cost set to one float addition; packet access disabled
 { enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
 
 
 template<>
-struct scalar_cast_op<half, float> {
+struct scalar_cast_op<Eigen::half, float> {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
   typedef float result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const {
-    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
       return __half2float(a);
     #else
       return static_cast<float>(a);
@@ -66,15 +64,15 @@ struct scalar_cast_op<half, float> {
 };
 
 template<>
-struct functor_traits<scalar_cast_op<half, float> >
+struct functor_traits<scalar_cast_op<Eigen::half, float> >
 { enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
 
 
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 
 template <>
-struct type_casting_traits<half, float> {
+struct type_casting_traits<Eigen::half, float> {
   enum {
     VectorizedCast = 1,
     SrcCoeffRatio = 2,
@@ -89,7 +87,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(con
 }
 
 template <>
-struct type_casting_traits<float, half> {
+struct type_casting_traits<float, Eigen::half> {
   enum {
     VectorizedCast = 1,
     SrcCoeffRatio = 1,
@@ -97,12 +95,87 @@ struct type_casting_traits<float, half> {
   };
 };
 
-template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
   // Simply discard the second half of the input
-  return __float22half2_rn(make_float2(a.x, a.y));
+  return __floats2half2_rn(a.x, a.y);  // only a.x and a.y are converted; a.z and a.w are not read
+}
+
+#elif defined EIGEN_VECTORIZE_AVX
+
+template <>
+struct type_casting_traits<Eigen::half, float> {  // AVX branch: half -> float cast traits
+  enum {
+    VectorizedCast = 1,  // a packet-level pcast is defined right after this struct
+    SrcCoeffRatio = 1,  // NOTE(review): presumably one source packet per cast - confirm against the primary template
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);  // the conversion itself is done by the half2float helper
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {  // AVX branch: float -> half cast traits
+  enum {
+    VectorizedCast = 1,  // a packet-level pcast is defined right after this struct
+    SrcCoeffRatio = 1,  // NOTE(review): presumably one source packet per cast - confirm against the primary template
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);  // the conversion itself is done by the float2half helper
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {  // inside the `#elif 0` branch, so never compiled
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);  // four 16-bit lanes packed in one 64-bit integer
+  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));  // lane 0: bits 0..15
+  float f1 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));  // lane 1: bits 16..31
+  float f2 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));  // lane 2: bits 32..47
+  float f3 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));  // lane 3: bits 48..63
+  float f4 = static_cast<float>(h);
+  return _mm_set_ps(f4, f3, f2, f1);  // _mm_set_ps takes the highest element first, so f1 is element 0
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {  // inside the `#elif 0` branch, so never compiled
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
+  EIGEN_ALIGN16 float aux[4];  // scratch buffer for the four floats
+  pstore(aux, a);  // spill the packet so each float can be converted individually
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+
+  Packet4h result;
+  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);  // highest lane first, so h0 ends up in the lowest 16 bits
+  return result;
 }
 
-#endif
 #endif
 
 } // end namespace internal
diff --git a/Eigen/src/Core/arch/Default/CMakeLists.txt b/Eigen/src/Core/arch/Default/CMakeLists.txt
deleted file mode 100644
index 339c091d1..000000000
--- a/Eigen/src/Core/arch/Default/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_Default_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_Default_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/Default COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/NEON/CMakeLists.txt b/Eigen/src/Core/arch/NEON/CMakeLists.txt
deleted file mode 100644
index fd4d4af50..000000000
--- a/Eigen/src/Core/arch/NEON/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_NEON_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index d2d467936..3e121dce5 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,8 +15,15 @@ namespace Eigen {
 
 namespace internal {
 
-static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
-static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
+inline uint32x4_t p4ui_CONJ_XOR() {  // returns a mask with only bit 31 set in lanes 1 and 3; XORed in by pconj and pmul
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return vld1q_u32( conj_XOR_DATA );  // load the four constants into a 128-bit register
+}
+
+inline uint32x2_t p2ui_CONJ_XOR() {  // two-lane variant: only bit 31 of lane 1 is set; XORed in by predux_mul
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
+  return vld1_u32( conj_XOR_DATA );  // load the two constants into a 64-bit register
+}
 
 //---------- float ----------
 struct Packet2cf
@@ -64,7 +72,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
   Packet4ui b = vreinterpretq_u32_f32(a.v);
-  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -80,7 +88,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   // Multiply the imag a with b
   v2 = vmulq_f32(v2, b.v);
   // Conjugate v2 
-  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64q_f32(v2);
   // Add and return the result
@@ -195,7 +203,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
   // Multiply the imag a with b
   v2 = vmul_f32(v2, a2);
   // Conjugate v2 
-  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64_f32(v2);
   // Add v1, v2
@@ -274,7 +282,8 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
-static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
+const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };  // lane 0 is zero, lane 1 has only bit 63 set
+static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );  // NOTE(review): still a namespace-scope static, unlike p4ui_CONJ_XOR()/p2ui_CONJ_XOR() which are functions - confirm this is intended
 
 struct Packet1cd
 {
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 3224c36bd..2a8f58d74 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 // Heavily based on Gael's SSE version.
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -49,17 +49,6 @@ typedef uint32x4_t  Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
-  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
-#else
-  //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
-#endif
-
-
 // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
 // which available on LLVM and GCC (at least)
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -122,12 +111,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
 
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const float32_t f[] = {0, 1, 2, 3};  // per-lane offsets
+  Packet4f countdown = vld1q_f32(f);  // loaded from memory instead of the removed EIGEN_INIT_NEON_PACKET4 initializer
   return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
 {
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const int32_t i[] = {0, 1, 2, 3};  // per-lane offsets
+  Packet4i countdown = vld1q_s32(i);  // loaded from memory instead of the removed EIGEN_INIT_NEON_PACKET4 initializer
   return vaddq_s32(pset1<Packet4i>(a), countdown);
 }
 
@@ -334,22 +325,6 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
   return vcombine_s32(a_hi, a_lo);
 }
 
-template<size_t offset>
-struct protate_impl<offset, Packet4f>
-{
-  static Packet4f run(const Packet4f& a) {
-    return vextq_f32(a, a, offset);
-  }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet4i>
-{
-  static Packet4i run(const Packet4i& a) {
-    return vextq_s32(a, a, offset);
-  }
-};
-
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
 
@@ -601,7 +576,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { r
 
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
-  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
+  const double countdown_raw[] = {0.0,1.0};  // per-lane offsets
+  const Packet2d countdown = vld1q_f64(countdown_raw);  // loaded from memory instead of the removed EIGEN_INIT_NEON_PACKET2 initializer
   return vaddq_f64(pset1<Packet2d>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
@@ -679,14 +655,6 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu
 
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
 
-template<size_t offset>
-struct protate_impl<offset, Packet2d>
-{
-  static Packet2d run(const Packet2d& a) {
-    return vextq_f64(a, a, offset);
-  }
-};
-
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
diff --git a/Eigen/src/Core/arch/SSE/CMakeLists.txt b/Eigen/src/Core/arch/SSE/CMakeLists.txt
deleted file mode 100644
index 46ea7cc62..000000000
--- a/Eigen/src/Core/arch/SSE/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_SSE_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 28f103eeb..ac2fd8103 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -517,52 +517,10 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
 }
 
 // Hyperbolic Tangent function.
-// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
-// fl(tanh(x)) = +/-1.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& _x) {
-  // Clamp the inputs to the range [-9, 9] since anything outside
-  // this range is +/-1.0f in single-precision.
-  _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f);
-  const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x));
-
-  // The monomial coefficients of the numerator polynomial (odd).
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
-  _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
-
-  // The monomial coefficients of the denominator polynomial (even).
-  _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f);
-  _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f);
-
-  // Since the polynomials are odd/even, we need x^2.
-  const Packet4f x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial p.
-  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
-  p = pmadd(x2, p, p4f_alpha_9);
-  p = pmadd(x2, p, p4f_alpha_7);
-  p = pmadd(x2, p, p4f_alpha_5);
-  p = pmadd(x2, p, p4f_alpha_3);
-  p = pmadd(x2, p, p4f_alpha_1);
-  p = pmul(x, p);
-
-  // Evaluate the denominator polynomial p.
-  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
-  q = pmadd(x2, q, p4f_beta_2);
-  q = pmadd(x2, q, p4f_beta_0);
-
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);  // delegates to generic_fast_tanh_float; the SSE-local rational approximation is removed here
 }
 
 } // end namespace internal
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 451034560..baad692e3 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -162,6 +162,11 @@ template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4,
 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
+#ifndef EIGEN_VECTORIZE_AVX  // these SSE values apply only when AVX vectorization is not enabled
+template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };  // NOTE(review): second argument presumably selects the vectorized case - confirm against the primary template
+template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };  // double division is costed one unit higher than float
+#endif
+
 #if EIGEN_COMP_MSVC==1500
 // Workaround MSVC 9 internal compiler error.
 // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
@@ -434,30 +439,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 { return _mm_shuffle_epi32(a,0x1B); }
 
-template<size_t offset>
-struct protate_impl<offset, Packet4f>
-{
-  static Packet4f run(const Packet4f& a) {
-    return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
-  }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet4i>
-{
-  static Packet4i run(const Packet4i& a) {
-    return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
-  }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet2d>
-{
-  static Packet2d run(const Packet2d& a) {
-    return vec2d_swizzle1(a, offset, (offset + 1) % 2);
-  }
-};
-
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
@@ -837,6 +818,16 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
 #endif
 }
 
+// Scalar pmadd specializations, compiled only when __FMA__ is defined, to ensure consistency with the vectorized path.
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(a,b,c);  // a*b+c through the C library's single-precision fused multiply-add
+}
+template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return ::fma(a,b,c);  // a*b+c through the C library's double-precision fused multiply-add
+}
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/ZVector/CMakeLists.txt b/Eigen/src/Core/arch/ZVector/CMakeLists.txt
deleted file mode 100644
index 5eb0957eb..000000000
--- a/Eigen/src/Core/arch/ZVector/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_arch_ZVector_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_arch_ZVector_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/ZVector COMPONENT Devel
-)
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index 9a8735ac1..e9d83eca6 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -57,21 +57,6 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
-{
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet1cd>(af);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
-{
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<double> >(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index d55ae6096..9b373c783 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -18,20 +18,24 @@ namespace internal {
   * \brief Template functor for scalar/packet assignment
   *
   */
-template<typename Scalar> struct assign_op {
+template<typename DstScalar,typename SrcScalar> struct assign_op {  // destination and source scalar types are now separate parameters
 
   EIGEN_EMPTY_STRUCT_CTOR(assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a = b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }  // requires SrcScalar to be assignable to DstScalar
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,b); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }  // stores packet b at a using the given Alignment
 };
-template<typename Scalar>
-struct functor_traits<assign_op<Scalar> > {
+
+// Empty partial specialization for a void source type (used by PermutationMatrix)
+template<typename DstScalar> struct assign_op<DstScalar,void> {};
+
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost,
-    PacketAccess = packet_traits<Scalar>::Vectorizable
+    Cost = NumTraits<DstScalar>::ReadCost,  // cost is taken from the destination type only
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::Vectorizable && packet_traits<SrcScalar>::Vectorizable  // packet path only when both types are identical and vectorizable
   };
 };
 
@@ -39,20 +43,20 @@ struct functor_traits<assign_op<Scalar> > {
   * \brief Template functor for scalar/packet assignment with addition
   *
   */
-template<typename Scalar> struct add_assign_op {
+template<typename DstScalar,typename SrcScalar> struct add_assign_op {  // destination and source scalar types are now separate parameters
 
   EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a += b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }  // requires DstScalar += SrcScalar to be valid
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }  // load from a, add b, store back to a
 };
-template<typename Scalar>
-struct functor_traits<add_assign_op<Scalar> > {
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<add_assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,  // cost is taken from the destination type only
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasAdd  // packet path only when both scalar types are identical
   };
 };
 
@@ -60,20 +64,20 @@ struct functor_traits<add_assign_op<Scalar> > {
   * \brief Template functor for scalar/packet assignment with subtraction
   *
   */
-template<typename Scalar> struct sub_assign_op {
+template<typename DstScalar,typename SrcScalar> struct sub_assign_op {  // destination and source scalar types are now separate parameters
 
   EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a -= b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }  // requires DstScalar -= SrcScalar to be valid
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }  // load from a, subtract b, store back to a
 };
-template<typename Scalar>
-struct functor_traits<sub_assign_op<Scalar> > {
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<sub_assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,  // subtraction is costed with AddCost of the destination type
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasSub  // packet path only when both scalar types are identical
   };
 };
 
@@ -98,30 +102,28 @@ struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
     PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul
   };
 };
-template<typename DstScalar,typename SrcScalar> struct functor_is_product_like<mul_assign_op<DstScalar,SrcScalar> > { enum { ret = 1 }; };
 
 /** \internal
   * \brief Template functor for scalar/packet assignment with diviving
   *
   */
-template<typename Scalar> struct div_assign_op {
+template<typename DstScalar, typename SrcScalar=DstScalar> struct div_assign_op {  // SrcScalar defaults to DstScalar, so div_assign_op<T> still names a valid type
 
   EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a /= b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }  // requires DstScalar /= SrcScalar to be valid
   
   template<int Alignment, typename Packet>
-  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
-  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }  // load from a, divide by b, store back to a
 };
-template<typename Scalar>
-struct functor_traits<div_assign_op<Scalar> > {
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<div_assign_op<DstScalar,SrcScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasDiv
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,  // NOTE(review): a division is costed with MulCost, as before this patch - confirm this is intended
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasDiv  // packet path only when both scalar types are identical
   };
 };
 
-
 /** \internal
   * \brief Template functor for scalar/packet assignment with swapping
   *
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 5cd8ca950..d82ffed02 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -16,27 +16,43 @@ namespace internal {
 
 //---------- associative binary functors ----------
 
+template<typename Arg1, typename Arg2>
+struct binary_op_base  // base class that exposes the two argument types of a binary functor as typedefs
+{
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+};
+
 /** \internal
   * \brief Template functor to compute the sum of two scalars
   *
   * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
   */
-template<typename Scalar> struct scalar_sum_op {
-//   typedef Scalar result_type;
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;  // result type is looked up from the operand types via ScalarBinaryOpTraits
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+#else
+  scalar_sum_op() {  // when the plugin macro is defined, its expansion becomes the default constructor body
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::padd(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
   { return internal::predux(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_sum_op<Scalar> > {
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2, // rough estimate: average of the two operand types' AddCost
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
+    // TODO vectorize mixed sum (PacketAccess is currently off whenever LhsScalar and RhsScalar differ)
   };
 };
 
@@ -45,7 +61,7 @@ struct functor_traits<scalar_sum_op<Scalar> > {
   * This is required to solve Bug 426.
   * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
   */
-template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {
+template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {  // bool/bool specialization inherits the int/int functor; its constructor is marked EIGEN_DEPRECATED
   EIGEN_DEPRECATED
   scalar_sum_op() {}
 };
@@ -56,13 +72,17 @@ template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {
   *
   * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
   */
-template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+#else
+  scalar_product_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
@@ -75,7 +95,8 @@ template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
   enum {
     Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
-    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+    // TODO vectorize mixed product
   };
 };
 
@@ -84,13 +105,15 @@ struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
   *
   * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
   */
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_conj_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
 
   enum {
     Conj = NumTraits<LhsScalar>::IsComplex
   };
   
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
   
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
@@ -113,21 +136,24 @@ struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
   *
   * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
   */
-template<typename Scalar> struct scalar_min_op {
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::mini(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmin(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
   { return internal::predux_min(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_min_op<Scalar> > {
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMin
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
   };
 };
 
@@ -136,21 +162,24 @@ struct functor_traits<scalar_min_op<Scalar> > {
   *
   * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
   */
-template<typename Scalar> struct scalar_max_op {
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_max_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::maxi(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmax(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
   { return internal::predux_max(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_max_op<Scalar> > {
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_max_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMax
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
   };
 };
 
@@ -158,56 +187,70 @@ struct functor_traits<scalar_max_op<Scalar> > {
   * \brief Template functors for comparison of two scalars
   * \todo Implement packet-comparisons
   */
-template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;
 
-template<typename Scalar, ComparisonName cmp>
-struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
+struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
     PacketAccess = false
   };
 };
 
-template<ComparisonName Cmp, typename Scalar>
-struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
+template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
+struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
   typedef bool type;
 };
 
 
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GT> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GE> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);}
 };
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
+{
   typedef bool result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
 };
 
 
@@ -216,7 +259,9 @@ template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
   *
   * \sa MatrixBase::stableNorm(), class Redux
   */
-template<typename Scalar> struct scalar_hypot_op {
+template<typename Scalar>
+struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
+{
   EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
 //   typedef typename NumTraits<Scalar>::Real result_type;
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
@@ -237,12 +282,12 @@ template<typename Scalar> struct scalar_hypot_op {
   }
 };
 template<typename Scalar>
-struct functor_traits<scalar_hypot_op<Scalar> > {
+struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
   enum
   {
     Cost = 3 * NumTraits<Scalar>::AddCost +
            2 * NumTraits<Scalar>::MulCost +
-           2 * NumTraits<Scalar>::template Div<false>::Cost,
+           2 * scalar_div_cost<Scalar,false>::value,
     PacketAccess = false
   };
 };
@@ -250,13 +295,24 @@ struct functor_traits<scalar_hypot_op<Scalar> > {
 /** \internal
   * \brief Template functor to compute the pow of two scalars
   */
-template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
+template<typename Scalar, typename Exponent>
+struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
+{
+  typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
+#else
+  scalar_pow_op() {
+    typedef Scalar LhsScalar;
+    typedef Exponent RhsScalar;
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
   EIGEN_DEVICE_FUNC
-  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
+  inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
 };
-template<typename Scalar, typename OtherScalar>
-struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
+template<typename Scalar, typename Exponent>
+struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
   enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
 };
 
@@ -269,18 +325,27 @@ struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
   *
   * \sa class CwiseBinaryOp, MatrixBase::operator-
   */
-template<typename Scalar> struct scalar_difference_op {
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+#else
+  scalar_difference_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::psub(a,b); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_difference_op<Scalar> > {
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
   };
 };
 
@@ -289,13 +354,17 @@ struct functor_traits<scalar_difference_op<Scalar> > {
   *
   * \sa class CwiseBinaryOp, Cwise::operator/()
   */
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_quotient_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+#else
+  scalar_quotient_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
@@ -305,8 +374,8 @@ template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
   typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
   enum {
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable,
-    Cost = NumTraits<result_type>::template Div<PacketAccess>::Cost
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
+    Cost = scalar_div_cost<result_type,PacketAccess>::value
   };
 };
 
@@ -360,236 +429,50 @@ template<> struct functor_traits<scalar_boolean_xor_op> {
   };
 };
 
-/** \internal
-  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
-  *
-  * \sa class CwiseBinaryOp, Cwise::igamma
-  */
-template<typename Scalar> struct scalar_igamma_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
-    using numext::igamma; return igamma(a, x);
-  }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
-    return internal::pigammac(a, x);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_igamma_op<Scalar> > {
-  enum {
-    // Guesstimate
-    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasIGamma
-  };
-};
-
-
-/** \internal
-  * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
-  *
-  * \sa class CwiseBinaryOp, Cwise::igammac
-  */
-template<typename Scalar> struct scalar_igammac_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
-    using numext::igammac; return igammac(a, x);
-  }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
-  {
-    return internal::pigammac(a, x);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_igammac_op<Scalar> > {
-  enum {
-    // Guesstimate
-    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasIGammac
-  };
-};
 
 
 //---------- binary functors bound to a constant, thus appearing as a unary functor ----------
 
-/** \internal
-  * \brief Template functor to multiply a scalar by a fixed other one
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
-  */
-/* NOTE why doing the pset1() in packetOp *is* an optimization ?
- * indeed it seems better to declare m_other as a Packet and do the pset1() once
- * in the constructor. However, in practice:
- *  - GCC does not like m_other as a Packet and generate a load every time it needs it
- *  - on the other hand GCC is able to moves the pset1() outside the loop :)
- *  - simpler code ;)
- * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
- */
-template<typename Scalar>
-struct scalar_multiple_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+// The following two classes permit turning any binary functor into a unary one with one argument bound to a constant value.
+// They are analogous to std::binder1st/binder2nd, but with the following differences:
+//  - they are compatible with packetOp
+//  - they are portable across C++ versions (the std::binder* are deprecated in C++11)
+template<typename BinaryOp> struct bind1st_op : BinaryOp {
 
-template<typename Scalar1, typename Scalar2>
-struct scalar_multiple2_op {
-  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
-  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
-};
-template<typename Scalar1,typename Scalar2>
-struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
-{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
 
-/** \internal
-  * \brief Template functor to divide a scalar by a fixed other one
-  *
-  * This functor is used to implement the quotient of a matrix by
-  * a scalar where the scalar type is not necessarily a floating point type.
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator/
-  */
-template<typename Scalar>
-struct scalar_quotient1_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_quotient1_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+  EIGEN_DEVICE_FUNC bind1st_op(const first_argument_type &val) : m_value(val) {} // stores the bound first argument; device-callable like operator() and packetOp
 
-template<typename Scalar1, typename Scalar2>
-struct scalar_quotient2_op {
-  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const scalar_quotient2_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const Scalar2& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a / m_other; }
-  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
-};
-template<typename Scalar1,typename Scalar2>
-struct functor_traits<scalar_quotient2_op<Scalar1,Scalar2> >
-{ enum { Cost = 2 * NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
 
-// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
-// where the mixing of different types is handled by scalar_product_traits
-// In particular, real * complex<real> is allowed.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-
-
-/** \internal
-  * \brief Template functor to add a scalar to a fixed other one
-  * \sa class CwiseUnaryOp, Array::operator+
-  */
-/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
-template<typename Scalar>
-struct scalar_add_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::padd(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_add_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to subtract a fixed scalar to another one
-  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_rsub_op
-  */
-template<typename Scalar>
-struct scalar_sub_op {
-  EIGEN_DEVICE_FUNC inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC inline scalar_sub_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a - m_other; }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::psub(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_sub_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to subtract a scalar to fixed another one
-  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_sub_op
-  */
-template<typename Scalar>
-struct scalar_rsub_op {
-  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { }
-  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const Scalar& other) : m_other(other) { }
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other - a; }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::psub(pset1<Packet>(m_other), a); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_rsub_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to raise a scalar to a power
-  * \sa class CwiseUnaryOp, Cwise::pow
-  */
-template<typename Scalar>
-struct scalar_pow_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_DEVICE_FUNC inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
-  EIGEN_DEVICE_FUNC inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
-  EIGEN_DEVICE_FUNC
-  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
-  const Scalar m_exponent;
-};
-template<typename Scalar>
-struct functor_traits<scalar_pow_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to compute the quotient between a scalar and array entries.
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_mult_op {
-  EIGEN_DEVICE_FUNC scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other / a; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(m_other),a); }
-  Scalar m_other;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
+  { return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }
+
+  first_argument_type m_value;
 };
-template<typename Scalar>
-struct functor_traits<scalar_inverse_mult_op<Scalar> >
-{ enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
+template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};
+
+
+template<typename BinaryOp> struct bind2nd_op : BinaryOp {
+  // Unary adaptor: forwards to BinaryOp with its second argument fixed to the value stored in m_value.
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
+
+  EIGEN_DEVICE_FUNC bind2nd_op(const second_argument_type &val) : m_value(val) {} // device-callable like operator() and packetOp
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return BinaryOp::packetOp(a,internal::pset1<Packet>(m_value)); } // m_value is broadcast with pset1 on every call
+
+  second_argument_type m_value;
+};
+template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};
 
 
 } // end namespace internal
diff --git a/Eigen/src/Core/functors/CMakeLists.txt b/Eigen/src/Core/functors/CMakeLists.txt
deleted file mode 100644
index f4b99a9c3..000000000
--- a/Eigen/src/Core/functors/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_Functor_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_Functor_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/functors COMPONENT Devel
-  )
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index c5836d048..a2154d3b5 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -18,20 +18,20 @@ template<typename Scalar>
 struct scalar_constant_op {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
-  template<typename Index, typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp(Index, Index = 0) const { return internal::pset1<PacketType>(m_other); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const { return m_other; }
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { return internal::pset1<PacketType>(m_other); }
   const Scalar m_other;
 };
 template<typename Scalar>
 struct functor_traits<scalar_constant_op<Scalar> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
+{ enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
+         PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
 
 template<typename Scalar> struct scalar_identity_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_identity_op<Scalar> >
@@ -55,15 +55,15 @@ struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/false,/*IsInteger*/false>
     m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*m_step)),
     m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(m_step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}
 
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const
   { 
     m_base = padd(m_base, pset1<Packet>(m_step));
     return m_low+Scalar(i)*m_step; 
   }
 
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType) const { return m_base = padd(m_base,m_packetStep); }
 
   const Scalar m_low;
   const Scalar m_step;
@@ -81,11 +81,11 @@ struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/false>
     m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
     m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}
 
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return m_low+i*m_step; }
 
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
   { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
 
   const Scalar m_low;
@@ -99,24 +99,24 @@ template <typename Scalar, typename Packet>
 struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/true>
 {
   linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
-    m_low(low), m_length(high-low), m_divisor(num_steps==1?1:num_steps-1), m_interPacket(plset<Packet>(0))
+    m_low(low), m_length(high-low), m_divisor(convert_index<Scalar>(num_steps==1?1:num_steps-1)), m_interPacket(plset<Packet>(0))
   {}
 
-  template<typename Index>
+  template<typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Scalar operator() (Index i) const {
+  const Scalar operator() (IndexType i) const {
     return m_low + (m_length*Scalar(i))/m_divisor;
   }
 
-  template<typename Index>
+  template<typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Packet packetOp(Index i) const {
+  const Packet packetOp(IndexType i) const {
     return internal::padd(pset1<Packet>(m_low), pdiv(pmul(pset1<Packet>(m_length), padd(pset1<Packet>(Scalar(i)),m_interPacket)),
                                                      pset1<Packet>(m_divisor))); }
 
   const Scalar m_low;
   const Scalar m_length;
-  const Index  m_divisor;
+  const Scalar  m_divisor;
   const Packet m_interPacket;
 };
 
@@ -142,29 +142,11 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa
     : impl((num_steps==1 ? high : low),high,num_steps)
   {}
 
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
 
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
-  {
-    eigen_assert(col==0 || row==0);
-    return impl(col + row);
-  }
-
-  template<typename Index, typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index, typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
-  {
-    eigen_assert(col==0 || row==0);
-    return impl.packetOp(col + row);
-  }
+  template<typename Packet,typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }
 
   // This proxy object handles the actual required temporaries, the different
   // implementations (random vs. sequential access) as well as the
@@ -174,11 +156,11 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa
   const linspaced_op_impl<Scalar,PacketType,(NumTraits<Scalar>::IsInteger?true:RandomAccess),NumTraits<Scalar>::IsInteger> impl;
 };
 
-// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
-// to indicate whether a functor allows linear access, just always answering 'yes' except for
-// scalar_identity_op.
-template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
-template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
+// Linear access is automatically determined from the operator() prototypes available for the given functor.
+// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
+// and linear access is not possible. In all other cases, linear access is enabled.
+// Users should not have to deal with this structure.
+template<typename Functor> struct functor_has_linear_access { enum { ret = !has_binary_operator<Functor>::value }; };
 
 } // end namespace internal
 
diff --git a/Eigen/src/Core/functors/TernaryFunctors.h b/Eigen/src/Core/functors/TernaryFunctors.h
new file mode 100644
index 000000000..b254e96c6
--- /dev/null
+++ b/Eigen/src/Core/functors/TernaryFunctors.h
@@ -0,0 +1,25 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TERNARY_FUNCTORS_H
+#define EIGEN_TERNARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative ternary functors ----------
+
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TERNARY_FUNCTORS_H
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 5baba1494..2009f8e57 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -248,7 +248,7 @@ struct functor_traits<scalar_exp_op<Scalar> > {
      // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
      : (14 * NumTraits<Scalar>::AddCost +
         6 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
 #else
     Cost =
     (sizeof(Scalar) == 4
@@ -257,7 +257,7 @@ struct functor_traits<scalar_exp_op<Scalar> > {
      // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
      : (23 * NumTraits<Scalar>::AddCost +
         12 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
 #endif
   };
 };
@@ -266,7 +266,7 @@ struct functor_traits<scalar_exp_op<Scalar> > {
   *
   * \brief Template functor to compute the logarithm of a scalar
   *
-  * \sa class CwiseUnaryOp, Cwise::log()
+  * \sa class CwiseUnaryOp, ArrayBase::log()
   */
 template<typename Scalar> struct scalar_log_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
@@ -293,6 +293,26 @@ struct functor_traits<scalar_log_op<Scalar> > {
   };
 };
 
+/** \internal
+  *
+  * \brief Template functor to compute the logarithm of 1 plus a scalar value
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::log1p()
+  */
+template<typename Scalar> struct scalar_log1p_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log1p_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog1p,
+    Cost = functor_traits<scalar_log_op<Scalar> >::Cost // TODO measure cost of log1p
+  };
+};
+
 /** \internal
   *
   * \brief Template functor to compute the base-10 logarithm of a scalar
@@ -452,142 +472,6 @@ struct functor_traits<scalar_asin_op<Scalar> >
 };
 
 
-/** \internal
- * \brief Template functor to compute the natural log of the absolute
- * value of Gamma of a scalar
- * \sa class CwiseUnaryOp, Cwise::lgamma()
- */
-template<typename Scalar> struct scalar_lgamma_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::lgamma; return lgamma(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_lgamma_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasLGamma
-  };
-};
-
-/** \internal
- * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
- * \sa class CwiseUnaryOp, Cwise::digamma()
- */
-template<typename Scalar> struct scalar_digamma_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::digamma; return digamma(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_digamma_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasDiGamma
-  };
-};
-    
-/** \internal
- * \brief Template functor to compute the Riemann Zeta function of two arguments.
- * \sa class CwiseUnaryOp, Cwise::zeta()
- */
-template<typename Scalar> struct scalar_zeta_op {
-    EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
-        using numext::zeta; return zeta(x, q);
-    }
-    typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_zeta_op<Scalar> >
-{
-    enum {
-        // Guesstimate
-        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-        PacketAccess = packet_traits<Scalar>::HasZeta
-    };
-};
-
-/** \internal
- * \brief Template functor to compute the polygamma function.
- * \sa class CwiseUnaryOp, Cwise::polygamma()
- */
-template<typename Scalar> struct scalar_polygamma_op {
-    EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
-        using numext::polygamma; return polygamma(n, x);
-    }
-    typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_polygamma_op<Scalar> >
-{
-    enum {
-        // Guesstimate
-        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-        PacketAccess = packet_traits<Scalar>::HasPolygamma
-    };
-};
-
-/** \internal
- * \brief Template functor to compute the Gauss error function of a
- * scalar
- * \sa class CwiseUnaryOp, Cwise::erf()
- */
-template<typename Scalar> struct scalar_erf_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::erf; return erf(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_erf_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasErf
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the Complementary Error Function
- * of a scalar
- * \sa class CwiseUnaryOp, Cwise::erfc()
- */
-template<typename Scalar> struct scalar_erfc_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::erfc; return erfc(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_erfc_op<Scalar> >
-{
-  enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasErfc
-  };
-};
-
-
 /** \internal
   * \brief Template functor to compute the atan of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::atan()
@@ -607,39 +491,40 @@ struct functor_traits<scalar_atan_op<Scalar> >
   };
 };
 
-
 /** \internal
   * \brief Template functor to compute the tanh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::tanh()
   */
-template<typename Scalar> struct scalar_tanh_op {
+template <typename Scalar>
+struct scalar_tanh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); }
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); }
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_tanh_op<Scalar> >
-{
+
+template <typename Scalar>
+struct functor_traits<scalar_tanh_op<Scalar> > {
   enum {
     PacketAccess = packet_traits<Scalar>::HasTanh,
-    Cost =
-    (PacketAccess
-     // The following numbers are based on the AVX implementation,
+    Cost = ( (EIGEN_FAST_MATH && is_same<Scalar,float>::value)
+// The following numbers are based on the AVX implementation,
 #ifdef EIGEN_VECTORIZE_FMA
-     // Haswell can issue 2 add/mul/madd per cycle.
-     // 9 pmadd, 2 pmul, 1 div, 2 other
-     ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
-     NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+                // Haswell can issue 2 add/mul/madd per cycle.
+                // 9 pmadd, 2 pmul, 1 div, 2 other
+                ? (2 * NumTraits<Scalar>::AddCost +
+                   6 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
 #else
-     ? (11 * NumTraits<Scalar>::AddCost +
-        11 * NumTraits<Scalar>::MulCost +
-        NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+                ? (11 * NumTraits<Scalar>::AddCost +
+                   11 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
 #endif
-     // This number assumes a naive implementation of tanh
-     : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost +
-        2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost +
-        functor_traits<scalar_exp_op<Scalar> >::Cost))
+                // This number assumes a naive implementation of tanh
+                : (6 * NumTraits<Scalar>::AddCost +
+                   3 * NumTraits<Scalar>::MulCost +
+                   2 * scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value +
+                   functor_traits<scalar_exp_op<Scalar> >::Cost))
   };
 };
 
@@ -880,9 +765,9 @@ struct scalar_sign_op<Scalar,true> {
   {
     typedef typename NumTraits<Scalar>::Real real_type;
     real_type aa = numext::abs(a);
-    if (aa==0)
+    if (aa==real_type(0))
       return Scalar(0);
-    aa = 1./aa;
+    aa = real_type(1)/aa;
     return Scalar(real(a)*aa, imag(a)*aa );
   }
   //TODO
diff --git a/Eigen/src/Core/products/CMakeLists.txt b/Eigen/src/Core/products/CMakeLists.txt
deleted file mode 100644
index 21fc94ae3..000000000
--- a/Eigen/src/Core/products/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_Product_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Core_Product_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel
-  )
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index a96c7bfd4..10d132957 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -299,16 +299,6 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   if (!useSpecificBlockingSizes(k, m, n)) {
     evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
   }
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kr = 8,
-    mr = Traits::mr,
-    nr = Traits::nr
-  };
-  if (k > kr) k -= k % kr;
-  if (m > mr) m -= m % mr;
-  if (n > nr) n -= n % nr;
 }
 
 template<typename LhsScalar, typename RhsScalar, typename Index>
@@ -363,7 +353,7 @@ class gebp_traits
 public:
   typedef _LhsScalar LhsScalar;
   typedef _RhsScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
   enum {
     ConjLhs = _ConjLhs,
@@ -444,15 +434,16 @@ public:
   template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
     // let gcc allocate the register in which to store the result of the pmul
     // (in the case where there is no FMA) gcc fails to figure out how to avoid
     // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
   }
 
@@ -467,9 +458,6 @@ public:
     r = pmadd(c,alpha,r);
   }
 
-protected:
-//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 
 template<typename RealScalar, bool _ConjLhs>
@@ -478,7 +466,7 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
 public:
   typedef std::complex<RealScalar> LhsScalar;
   typedef RealScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
   enum {
     ConjLhs = _ConjLhs,
@@ -860,80 +848,6 @@ protected:
   conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
 };
 
-// helper for the rotating kernel below
-template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
-struct PossiblyRotatingKernelHelper
-{
-  // default implementation, not rotating
-
-  typedef typename GebpKernel::Traits Traits;
-  typedef typename Traits::RhsScalar RhsScalar;
-  typedef typename Traits::RhsPacket RhsPacket;
-  typedef typename Traits::AccPacket AccPacket;
-
-  const Traits& traits;
-  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
-
-
-  template <size_t K, size_t Index>
-  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
-  {
-    traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
-  }
-
-  void unrotateResult(AccPacket&,
-                      AccPacket&,
-                      AccPacket&,
-                      AccPacket&)
-  {
-  }
-};
-
-// rotating implementation
-template <typename GebpKernel>
-struct PossiblyRotatingKernelHelper<GebpKernel, true>
-{
-  typedef typename GebpKernel::Traits Traits;
-  typedef typename Traits::RhsScalar RhsScalar;
-  typedef typename Traits::RhsPacket RhsPacket;
-  typedef typename Traits::AccPacket AccPacket;
-
-  const Traits& traits;
-  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
-
-  template <size_t K, size_t Index>
-  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
-  {
-    if (Index == 0) {
-      to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
-    } else {
-      EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
-      to = protate<1>(to);
-    }
-  }
-
-  void unrotateResult(AccPacket& res0,
-                      AccPacket& res1,
-                      AccPacket& res2,
-                      AccPacket& res3)
-  {
-    PacketBlock<AccPacket> resblock;
-    resblock.packet[0] = res0;
-    resblock.packet[1] = res1;
-    resblock.packet[2] = res2;
-    resblock.packet[3] = res3;
-    ptranspose(resblock);
-    resblock.packet[3] = protate<1>(resblock.packet[3]);
-    resblock.packet[2] = protate<2>(resblock.packet[2]);
-    resblock.packet[1] = protate<3>(resblock.packet[1]);
-    ptranspose(resblock);
-    res0 = resblock.packet[0];
-    res1 = resblock.packet[1];
-    res2 = resblock.packet[2];
-    res3 = resblock.packet[3];
-  }
-};
-
 /* optimized GEneral packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
@@ -967,16 +881,6 @@ struct gebp_kernel
     ResPacketSize = Traits::ResPacketSize
   };
 
-
-  static const bool UseRotatingKernel =
-    EIGEN_ARCH_ARM &&
-    internal::is_same<LhsScalar, float>::value &&
-    internal::is_same<RhsScalar, float>::value &&
-    internal::is_same<ResScalar, float>::value &&
-    Traits::LhsPacketSize == 4 &&
-    Traits::RhsPacketSize == 4 &&
-    Traits::ResPacketSize == 4;
-
   EIGEN_DONT_INLINE
   void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                   Index rows, Index depth, Index cols, ResScalar alpha,
@@ -1009,9 +913,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     // This corresponds to 3*LhsProgress x nr register blocks.
     // Usually, make sense only with FMA
     if(mr>=3*Traits::LhsProgress)
-    {      
-      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
-      
+    {
       // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
       // and on each largest micro vertical panel of the rhs (depth * nr).
       // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
@@ -1074,19 +976,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
               traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
               traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
+              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
               traits.madd(A0, B_0, C0, T0); \
               traits.madd(A1, B_0, C4, T0); \
               traits.madd(A2, B_0, C8, B_0); \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
+              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
               traits.madd(A0, B_0, C1, T0); \
               traits.madd(A1, B_0, C5, T0); \
               traits.madd(A2, B_0, C9, B_0); \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
+              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
               traits.madd(A0, B_0, C2,  T0); \
               traits.madd(A1, B_0, C6,  T0); \
               traits.madd(A2, B_0, C10, B_0); \
-              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
+              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
               traits.madd(A0, B_0, C3 , T0); \
               traits.madd(A1, B_0, C7,  T0); \
               traits.madd(A2, B_0, C11, B_0); \
@@ -1120,10 +1022,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
 #undef EIGEN_GEBP_ONESTEP
 
-          possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
-          possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
-          possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
-
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
@@ -1625,9 +1523,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           prefetch(&blA[0]);
           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
 
-          // NOTE The following piece of code doesn't work for 512 bit registers,
-          // so we don't call it for registers that contain more than 8 values.
-          if( ((SwappedTraits::LhsProgress % 4)==0) && (SwappedTraits::LhsProgress <= 8))
+          // The following piece of code won't work for 512-bit registers.
+          // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
+          // as nr (which is currently 4) for the return type.
+          typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
+          if ((SwappedTraits::LhsProgress % 4) == 0 &&
+              (SwappedTraits::LhsProgress <= 8) &&
+              (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
           {
             SAccPacket C0, C1, C2, C3;
             straits.initAcc(C0);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index a39c7808c..b1465c3b5 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -25,7 +25,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
 {
   typedef gebp_traits<RhsScalar,LhsScalar> Traits;
   
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols, Index depth,
     const LhsScalar* lhs, Index lhsStride,
@@ -55,7 +55,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
 
 typedef gebp_traits<LhsScalar,RhsScalar> Traits;
   
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static void run(Index rows, Index cols, Index depth,
   const LhsScalar* _lhs, Index lhsStride,
   const RhsScalar* _rhs, Index rhsStride,
@@ -309,8 +309,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
 #else
-      this->m_blockA = reinterpret_cast<LhsScalar*>((std::size_t(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
-      this->m_blockB = reinterpret_cast<RhsScalar*>((std::size_t(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
 #endif
     }
     
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 80ba89465..29d6dc721 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -40,7 +40,7 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                           typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
 struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
                                       const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
                                       const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
@@ -57,7 +57,7 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
                           typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
 struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
                                       const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
                                       const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 8b7dca45f..3c1a7fc40 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -58,7 +58,7 @@ namespace internal {
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
 enum {
   Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@@ -140,7 +140,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
   // find how many columns do we have to skip to be aligned with the result (if possible)
   Index skipColumns = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
+  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
   {
     alignedSize = 0;
     alignedStart = 0;
@@ -183,8 +183,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
     alignmentPattern = AllAligned;
   }
 
-  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
+  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
 
   Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
@@ -334,7 +334,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
 enum {
   Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@@ -457,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,R
     alignmentPattern = AllAligned;
   }
 
-  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
+  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
 
   Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
index c3e37b1e0..a45238d69 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -122,7 +122,7 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
       Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
       a_tmp = lhs.conjugate(); \
       a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
     } else a = _lhs; \
     if (LhsStorageOrder==RowMajor) uplo='U'; \
 \
@@ -256,7 +256,7 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
         b_tmp = lhs.transpose(); \
       } \
       b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } \
 \
     BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index d8d30267e..d97f8caa7 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
     
     eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
 
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index f79840aa7..4b292e74d 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -20,7 +20,7 @@ struct triangular_matrix_vector_product;
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower = ((Mode&Lower)==Lower),
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@@ -91,7 +91,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
 struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower = ((Mode&Lower)==Lower),
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
     
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 498db3a70..6e6ee119b 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -44,16 +44,29 @@ template<bool Conjugate> struct conj_if;
 
 template<> struct conj_if<true> {
   template<typename T>
-  inline T operator()(const T& x) { return numext::conj(x); }
+  inline T operator()(const T& x) const { return numext::conj(x); }
   template<typename T>
-  inline T pconj(const T& x) { return internal::pconj(x); }
+  inline T pconj(const T& x) const { return internal::pconj(x); }
 };
 
 template<> struct conj_if<false> {
   template<typename T>
-  inline const T& operator()(const T& x) { return x; }
+  inline const T& operator()(const T& x) const { return x; }
   template<typename T>
-  inline const T& pconj(const T& x) { return x; }
+  inline const T& pconj(const T& x) const { return x; }
+};
+
+// Generic implementation for custom complex types.
+template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs>
+struct conj_helper
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType Scalar;
+
+  EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const
+  { return conj_if<ConjLhs>()(x) *  conj_if<ConjRhs>()(y); }
 };
 
 template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
@@ -111,7 +124,7 @@ template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::comp
 };
 
 template<typename From,typename To> struct get_factor {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };
 
 template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
@@ -135,7 +148,7 @@ class BlasVectorMapper {
 
   template <typename Packet>
   EIGEN_DEVICE_FUNC bool aligned(Index i) const {
-    return (size_t(m_data+i)%sizeof(Packet))==0;
+    return (UIntPtr(m_data+i)%sizeof(Packet))==0;
   }
 
   protected:
@@ -227,7 +240,7 @@ class blas_data_mapper {
   EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
 
   EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
-    if (size_t(m_data)%sizeof(Scalar)) {
+    if (UIntPtr(m_data)%sizeof(Scalar)) {
       return -1;
     }
     return internal::first_default_aligned(m_data, size);
@@ -293,17 +306,33 @@ struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> >
 };
 
 // pop scalar multiple
-template<typename Scalar, typename NestedXpr>
-struct blas_traits<CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> >
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
   typedef blas_traits<NestedXpr> Base;
-  typedef CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> XprType;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
   static inline Scalar extractScalarFactor(const XprType& x)
-  { return x.functor().m_other * Base::extractScalarFactor(x.nestedExpression()); }
+  { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
+ : blas_traits<NestedXpr>
+{
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
+  { return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; }
+};
+template<typename Scalar, typename Plain1, typename Plain2>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1>,
+                                                            const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain2> > >
+ : blas_traits<CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1> >
+{};
 
 // pop opposite
 template<typename Scalar, typename NestedXpr>
diff --git a/Eigen/src/Core/util/CMakeLists.txt b/Eigen/src/Core/util/CMakeLists.txt
deleted file mode 100644
index a1e2e521f..000000000
--- a/Eigen/src/Core/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Core_util_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_Core_util_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/util COMPONENT Devel
-  )
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 5f71ba3df..7587d6842 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -199,7 +199,7 @@ const unsigned int HereditaryBits = RowMajorBit
 /** \ingroup enums
   * Enum containing possible values for the \c Mode or \c UpLo parameter of
   * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */
-enum {
+enum UpLoType {
   /** View matrix as a lower triangular matrix. */
   Lower=0x1,                      
   /** View matrix as an upper triangular matrix. */
@@ -224,7 +224,7 @@ enum {
 
 /** \ingroup enums
   * Enum for indicating whether a buffer is aligned or not. */
-enum {
+enum AlignmentType {
   Unaligned=0,        /**< Data pointer has no specific alignment. */
   Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
   Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
@@ -273,7 +273,7 @@ enum DirectionType {
 
 /** \internal \ingroup enums
   * Enum to specify how to traverse the entries of a matrix. */
-enum {
+enum TraversalType {
   /** \internal Default traversal, no vectorization, no index-based access */
   DefaultTraversal,
   /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */
@@ -295,7 +295,7 @@ enum {
 
 /** \internal \ingroup enums
   * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
-enum {
+enum UnrollingType {
   /** \internal Do not unroll loops. */
   NoUnrolling,
   /** \internal Unroll only the inner loop, but not the outer loop. */
@@ -307,7 +307,7 @@ enum {
 
 /** \internal \ingroup enums
   * Enum to specify whether to use the default (built-in) implementation or the specialization. */
-enum {
+enum SpecializedType {
   Specialized,
   BuiltIn
 };
@@ -315,7 +315,7 @@ enum {
 /** \ingroup enums
   * Enum containing possible values for the \p _Options template parameter of
   * Matrix, Array and BandMatrix. */
-enum {
+enum StorageOptions {
   /** Storage order is column major (see \ref TopicStorageOrders). */
   ColMajor = 0,
   /** Storage order is row major (see \ref TopicStorageOrders). */
@@ -328,7 +328,7 @@ enum {
 
 /** \ingroup enums
   * Enum for specifying whether to apply or solve on the left or right. */
-enum {
+enum SideType {
   /** Apply transformation on the left. */
   OnTheLeft = 1,  
   /** Apply transformation on the right. */
@@ -353,7 +353,7 @@ enum Default_t    { Default };
 
 /** \internal \ingroup enums
   * Used in AmbiVector. */
-enum {
+enum AmbiVectorMode {
   IsDense         = 0,
   IsSparse
 };
@@ -479,8 +479,9 @@ namespace Architecture
 }
 
 /** \internal \ingroup enums
-  * Enum used as template parameter in Product and product evalautors. */
-enum { DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+  * Enum used as template parameter in Product and product evaluators. */
+enum ProductImplType
+{ DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
 
 /** \internal \ingroup enums
   * Enum used in experimental parallel implementation. */
@@ -492,7 +493,7 @@ struct Dense {};
 /** The type used to identify a general sparse storage. */
 struct Sparse {};
 
-/** The type used to identify a general solver (foctored) storage. */
+/** The type used to identify a general solver (factored) storage. */
 struct SolverStorage {};
 
 /** The type used to identify a permutation storage. */
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index cb27acff7..7559e129c 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -14,12 +14,13 @@
   // 4512 - assignment operator could not be generated
   // 4522 - 'class' : multiple assignment operators specified
   // 4700 - uninitialized local variable 'xyz' used
+  // 4714 - function marked as __forceinline not inlined
   // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
   // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -41,6 +42,14 @@
     #pragma clang diagnostic push
   #endif
   #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+
+#elif defined __GNUC__ && __GNUC__>=6
+
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma GCC diagnostic push
+  #endif
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+
 #endif
 
 #if defined __NVCC__
@@ -48,11 +57,19 @@
   #pragma diag_suppress code_is_unreachable
   // Disable the "dynamic initialization in unreachable code" message
   #pragma diag_suppress initialization_not_reachable
-  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are 4 of them)
+  // Disable the "invalid error number" message that we get with older versions of nvcc
+  #pragma diag_suppress 1222
+  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
+  #pragma diag_suppress 2527
+  #pragma diag_suppress 2529
   #pragma diag_suppress 2651
   #pragma diag_suppress 2653
   #pragma diag_suppress 2668
+  #pragma diag_suppress 2669
   #pragma diag_suppress 2670
+  #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
 #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index a102e5457..ea107393a 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -91,6 +91,7 @@ template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
 template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
 template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>  class CwiseTernaryOp;
 template<typename Decomposition, typename Rhstype>        class Solve;
 template<typename XprType>                                class Inverse;
 
@@ -174,9 +175,11 @@ namespace internal {
 // with optional conjugation of the arguments.
 template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct conj_helper;
 
-template<typename Scalar> struct scalar_sum_op;
-template<typename Scalar> struct scalar_difference_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_sum_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_difference_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_min_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_max_op;
 template<typename Scalar> struct scalar_opposite_op;
 template<typename Scalar> struct scalar_conjugate_op;
 template<typename Scalar> struct scalar_real_op;
@@ -192,27 +195,28 @@ template<typename Scalar> struct scalar_sin_op;
 template<typename Scalar> struct scalar_acos_op;
 template<typename Scalar> struct scalar_asin_op;
 template<typename Scalar> struct scalar_tan_op;
-template<typename Scalar> struct scalar_pow_op;
 template<typename Scalar> struct scalar_inverse_op;
 template<typename Scalar> struct scalar_square_op;
 template<typename Scalar> struct scalar_cube_op;
 template<typename Scalar, typename NewType> struct scalar_cast_op;
-template<typename Scalar> struct scalar_multiple_op;
-template<typename Scalar> struct scalar_quotient1_op;
-template<typename Scalar> struct scalar_min_op;
-template<typename Scalar> struct scalar_max_op;
 template<typename Scalar> struct scalar_random_op;
-template<typename Scalar> struct scalar_add_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
 template<typename Scalar,bool iscpx> struct scalar_sign_op;
+template<typename Scalar,typename ScalarExponent> struct scalar_pow_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+
+// SpecialFunctions module
+template<typename Scalar> struct scalar_lgamma_op;
+template<typename Scalar> struct scalar_digamma_op;
+template<typename Scalar> struct scalar_erf_op;
+template<typename Scalar> struct scalar_erfc_op;
 template<typename Scalar> struct scalar_igamma_op;
 template<typename Scalar> struct scalar_igammac_op;
-
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient2_op;
+template<typename Scalar> struct scalar_zeta_op;
+template<typename Scalar> struct scalar_betainc_op;
 
 } // end namespace internal
 
diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
old mode 100644
new mode 100755
index 8c9239b1d..26b59669e
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@@ -49,7 +49,7 @@
   #define EIGEN_USE_LAPACKE
 #endif
 
-#if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML)
   #define EIGEN_USE_MKL
 #endif
 
@@ -72,7 +72,7 @@
 #endif
 
 #if defined EIGEN_USE_MKL
-#include <mkl_lapacke.h>
+
 #define EIGEN_MKL_VML_THRESHOLD 128
 
 /* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index acb936ebe..9069d8e6b 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -13,7 +13,7 @@
 
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 92
+#define EIGEN_MINOR_VERSION 94
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -28,9 +28,9 @@
   #define EIGEN_COMP_GNUC 0
 #endif
 
-/// \internal EIGEN_COMP_CLANG set to 1 if the compiler is clang (alias for __clang__)
+/// \internal EIGEN_COMP_CLANG set to major+minor version (e.g., 307 for clang 3.7) if the compiler is clang
 #if defined(__clang__)
-  #define EIGEN_COMP_CLANG 1
+  #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__)
 #else
   #define EIGEN_COMP_CLANG 0
 #endif
@@ -71,6 +71,15 @@
   #define EIGEN_COMP_MSVC 0
 #endif
 
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
+//  name  ver   MSC_VER
+//  2008    9      1500
+//  2010   10      1600
+//  2012   11      1700
+//  2013   12      1800
+//  2015   14      1900
+//  "15"   15      1900
+
 /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC
 #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
   #define EIGEN_COMP_MSVC_STRICT _MSC_VER
@@ -340,50 +349,82 @@
 # define __has_feature(x) 0
 #endif
 
+// Upper bound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
 // Do we support r-value references?
-#if (__has_feature(cxx_rvalue_references) || \
+#ifndef EIGEN_HAS_RVALUE_REFERENCES
+#if EIGEN_MAX_CPP_VER>=11 && \
+    (__has_feature(cxx_rvalue_references) || \
     (defined(__cplusplus) && __cplusplus >= 201103L) || \
     (EIGEN_COMP_MSVC >= 1600))
-  #define EIGEN_HAVE_RVALUE_REFERENCES
+  #define EIGEN_HAS_RVALUE_REFERENCES 1
+#else
+  #define EIGEN_HAS_RVALUE_REFERENCES 0
+#endif
 #endif
 
 // Does the compiler support C99?
-#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
+#ifndef EIGEN_HAS_C99_MATH
+#if EIGEN_MAX_CPP_VER>=11 && \
+    ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
   || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
-  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-#define EIGEN_HAS_C99_MATH 1
+  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)))
+  #define EIGEN_HAS_C99_MATH 1
+#else
+  #define EIGEN_HAS_C99_MATH 0
+#endif
 #endif
 
 // Does the compiler support result_of?
-#if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))
+#ifndef EIGEN_HAS_STD_RESULT_OF
+#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
 #define EIGEN_HAS_STD_RESULT_OF 1
+#else
+#define EIGEN_HAS_STD_RESULT_OF 0
+#endif
 #endif
 
 // Does the compiler support variadic templates?
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
-// Disable the use of variadic templates when compiling with nvcc on ARM devices:
-// this prevents nvcc from crashing when compiling Eigen on Tegra X1
-#if !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64
+#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
+    && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 )
+    // ^^ Disable the use of variadic templates when compiling with nvcc on ARM devices:
+    //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#else
+#define EIGEN_HAS_VARIADIC_TEMPLATES 0
 #endif
 #endif
 
-// Does the compiler support const expressions?
+// Does the compiler fully support const expressions? (as in c++14)
+#ifndef EIGEN_HAS_CONSTEXPR
+
 #ifdef __CUDACC__
 // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)
+#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
   #define EIGEN_HAS_CONSTEXPR 1
 #endif
-#elif __has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
-  (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L))
+#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
+  (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)))
 #define EIGEN_HAS_CONSTEXPR 1
 #endif
 
+#ifndef EIGEN_HAS_CONSTEXPR
+#define EIGEN_HAS_CONSTEXPR 0
+#endif
+
+#endif
+
 // Does the compiler support C++11 math?
 // Let's be conservative and enable the default C++11 implementation only if we are sure it exists
 #ifndef EIGEN_HAS_CXX11_MATH
-  #if (__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
-      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
+  #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC))
     #define EIGEN_HAS_CXX11_MATH 1
   #else
     #define EIGEN_HAS_CXX11_MATH 0
@@ -392,9 +433,10 @@
 
 // Does the compiler support proper C++11 containers?
 #ifndef EIGEN_HAS_CXX11_CONTAINERS
-  #if    (__cplusplus > 201103L) \
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         ((__cplusplus > 201103L) \
       || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
-      || EIGEN_COMP_MSVC >= 1900
+      || EIGEN_COMP_MSVC >= 1900)
     #define EIGEN_HAS_CXX11_CONTAINERS 1
   #else
     #define EIGEN_HAS_CXX11_CONTAINERS 0
@@ -403,9 +445,11 @@
 
 // Does the compiler support C++11 noexcept?
 #ifndef EIGEN_HAS_CXX11_NOEXCEPT
-  #if    (__cplusplus > 201103L) \
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         (__has_feature(cxx_noexcept) \
+      || (__cplusplus > 201103L) \
       || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
-      || EIGEN_COMP_MSVC >= 1900
+      || EIGEN_COMP_MSVC >= 1900)
     #define EIGEN_HAS_CXX11_NOEXCEPT 1
   #else
     #define EIGEN_HAS_CXX11_NOEXCEPT 0
@@ -427,6 +471,8 @@
 #define EIGEN_CAT2(a,b) a ## b
 #define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
 
+#define EIGEN_COMMA ,
+
 // convert a token to a string
 #define EIGEN_MAKESTRING2(a) #a
 #define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
@@ -725,6 +771,11 @@ namespace Eigen {
 #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
 #endif
 
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
 //----------------------------------------------------------------------
 
 
@@ -839,18 +890,10 @@ namespace Eigen {
 
 #define EIGEN_IMPLIES(a,b) (!(a) || (b))
 
-#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR) \
-  template<typename OtherDerived> \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
-  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
-  { \
-    return CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived>(derived(), other.derived()); \
-  }
-
-// the expression type of a cwise product
-#define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \
+// the expression type of a standard coefficient wise binary operation
+#define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \
     CwiseBinaryOp< \
-      internal::scalar_product_op< \
+      EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)< \
           typename internal::traits<LHS>::Scalar, \
           typename internal::traits<RHS>::Scalar \
       >, \
@@ -858,6 +901,55 @@ namespace Eigen {
       const RHS \
     >
 
+#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,OPNAME) \
+  template<typename OtherDerived> \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME) \
+  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+  { \
+    return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME)(derived(), other.derived()); \
+  }
+
+#define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,TYPEA,TYPEB) \
+  (Eigen::internal::has_ReturnType<Eigen::ScalarBinaryOpTraits<TYPEA,TYPEB,EIGEN_CAT(EIGEN_CAT(Eigen::internal::scalar_,OPNAME),_op)<TYPEA,TYPEB>  > >::value)
+
+#define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR,SCALAR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<typename internal::traits<EXPR>::Scalar,SCALAR>, const EXPR, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type>
+
+#define EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(SCALAR,EXPR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<SCALAR,typename internal::traits<EXPR>::Scalar>, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
+
+// Workaround for MSVC 2010 only (see ML thread "patch with compile for for MSVC 2010"); EIGEN_COMP_MSVC_STRICT is 0 on other compilers, hence the explicit non-zero test.
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
+#else
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
+#endif
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC inline \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
+  (METHOD)(const T& scalar) const { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(derived().rows(), derived().cols(), internal::scalar_constant_op<PromotedT>(scalar))); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC inline friend \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
+  (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)), matrix.derived()); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
+
+
 #ifdef EIGEN_EXCEPTIONS
 #  define EIGEN_THROW_X(X) throw X
 #  define EIGEN_THROW throw
@@ -865,8 +957,8 @@ namespace Eigen {
 #  define EIGEN_CATCH(X) catch (X)
 #else
 #  ifdef __CUDA_ARCH__
-#    define EIGEN_THROW_X(X) asm("trap;") return {}
-#    define EIGEN_THROW asm("trap;"); return {}
+#    define EIGEN_THROW_X(X) asm("trap;")
+#    define EIGEN_THROW asm("trap;")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
@@ -875,10 +967,16 @@ namespace Eigen {
 #  define EIGEN_CATCH(X) else
 #endif
 
+
 #if EIGEN_HAS_CXX11_NOEXCEPT
+#   define EIGEN_INCLUDE_TYPE_TRAITS
+#   define EIGEN_NOEXCEPT noexcept
+#   define EIGEN_NOEXCEPT_IF(x) noexcept(x)
 #   define EIGEN_NO_THROW noexcept(true)
 #   define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
 #else
+#   define EIGEN_NOEXCEPT
+#   define EIGEN_NOEXCEPT_IF(x)
 #   define EIGEN_NO_THROW throw()
 #   define EIGEN_EXCEPTION_SPEC(X) throw(X)
 #endif
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 5f8bf15b2..0439655ca 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -275,6 +275,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
     destruct_elements_of_array(ptr, i);
     EIGEN_THROW;
   }
+  return NULL;
 }
 
 /*****************************************************************************
@@ -305,6 +306,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
     aligned_free(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
@@ -320,6 +322,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 /** \internal Deletes objects constructed with aligned_new
@@ -445,7 +448,7 @@ EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
     // so that all elements of the array have the same alignment.
     return 0;
   }
-  else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
+  else if( (UIntPtr(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
   {
     // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
     // Consequently, no element of the array is well aligned.
@@ -453,7 +456,7 @@ EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
   }
   else
   {
-    Index first = (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
+    Index first = (AlignmentSize - (Index((UIntPtr(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
     return (first < size) ? first : size;
   }
 }
@@ -487,7 +490,7 @@ template<typename T> EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T*
 template<typename T> struct smart_copy_helper<T,true> {
   EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
   {
-    std::ptrdiff_t size = std::ptrdiff_t(end)-std::ptrdiff_t(start);
+    IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
     memcpy(target, start, size);
@@ -510,7 +513,7 @@ template<typename T> void smart_memmove(const T* start, const T* end, T* target)
 template<typename T> struct smart_memmove_helper<T,true> {
   static inline void run(const T* start, const T* end, T* target)
   {
-    std::ptrdiff_t size = std::ptrdiff_t(end)-std::ptrdiff_t(start);
+    IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
     std::memmove(target, start, size);
@@ -623,7 +626,7 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
   #if EIGEN_DEFAULT_ALIGN_BYTES>0
     // We always manually re-align the result of EIGEN_ALLOCA.
     // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<std::size_t>(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)))
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((internal::UIntPtr(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(internal::UIntPtr(EIGEN_DEFAULT_ALIGN_BYTES-1))) // the mask uses the same integer type as the address it is applied to
   #else
     #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
   #endif
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
old mode 100644
new mode 100755
index 24e8a6d8a..d4460bb77
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -16,8 +16,22 @@
 #include <math_constants.h>
 #endif
 
+#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+#include <cstdint>
+#endif
+
 namespace Eigen {
 
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
+
+/**
+ * \brief The Index type as used for the API.
+ * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+ * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
+ */
+
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
+
 namespace internal {
 
 /** \internal
@@ -27,6 +41,16 @@ namespace internal {
   * we however don't want to add a dependency to Boost.
   */
 
+// Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
+// and older versions do not provide *intptr_t types.
+#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+typedef std::intptr_t  IntPtr;
+typedef std::uintptr_t UIntPtr;
+#else
+typedef std::ptrdiff_t IntPtr;
+typedef std::size_t UIntPtr;
+#endif
+
 struct true_type {  enum { value = 1 }; };
 struct false_type { enum { value = 0 }; };
 
@@ -115,7 +139,14 @@ private:
 
 public:
   static From ms_from;
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
   enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
 };
 
 template<typename From, typename To>
@@ -128,7 +159,7 @@ struct is_convertible
 /** \internal Allows to enable/disable an overload
   * according to a compile time condition.
   */
-template<bool Condition, typename T> struct enable_if;
+template<bool Condition, typename T=void> struct enable_if;
 
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
@@ -254,7 +285,7 @@ protected:
   * upcoming next STL generation (using a templated result member).
   * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
   */
-#ifdef EIGEN_HAS_STD_RESULT_OF
+#if EIGEN_HAS_STD_RESULT_OF
 template<typename T> struct result_of {
   typedef typename std::result_of<T>::type type1;
   typedef typename remove_all<type1>::type type;
@@ -311,8 +342,74 @@ struct result_of<Func(ArgType0,ArgType1)> {
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
     typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
 };
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2, int SizeOf=sizeof(has_none)>
+struct ternary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;}; // primary template: falls back to the first argument's type with remove_all applied
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_std_result_type)>
+{typedef typename Func::result_type type;}; // selected for SizeOf==sizeof(has_std_result_type): uses the functor's nested result_type
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_tr1_result)>
+{typedef typename Func::template result<Func(ArgType0,ArgType1,ArgType2)>::type type;}; // selected for SizeOf==sizeof(has_tr1_result): uses the functor's nested result<> template
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct result_of<Func(ArgType0,ArgType1,ArgType2)> { // result type of a functor called with three arguments
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0); // candidate only if T::result_type names a type
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1,ArgType2)>::type const * = 0); // candidate only if T::result<...>::type names a type
+    static has_none               testFunctor(...); // variadic fallback
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))}; // size of the tag type returned by the selected overload
+    typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type; // dispatches on that size
+};
 #endif
 
+struct meta_yes { char a[1]; };
+struct meta_no  { char a[2]; };
+
+// Compile-time check of whether the nested type T::ReturnType exists: value is 1 if it does, 0 otherwise.
+template <typename T>
+struct has_ReturnType
+{
+  template <typename C> static meta_yes testFunctor(typename C::ReturnType const *); // only viable when C::ReturnType names a type
+  template <typename C> static meta_no testFunctor(...); // fallback overload
+
+  enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) }; // meta_yes and meta_no differ in size, which identifies the selected overload
+};
+
+template<typename T> const T& return_ref();
+
+template <typename T, typename IndexType=Index> // IndexType is not used by this trait
+struct has_nullary_operator // value is 1 when the overload probing operator()() on a const T& is selected, 0 when the fallback is
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()())>0)>::type * = 0);
+  static meta_no testFunctor(...); // variadic fallback
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+template <typename T, typename IndexType=Index> // IndexType is the argument type used for the probe call
+struct has_unary_operator // value is 1 when the overload probing operator()(IndexType) on a const T& is selected, 0 when the fallback is
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()(IndexType(0)))>0)>::type * = 0);
+  static meta_no testFunctor(...); // variadic fallback
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+template <typename T, typename IndexType=Index> // IndexType is the type of both arguments used for the probe call
+struct has_binary_operator // value is 1 when the overload probing operator()(IndexType,IndexType) on a const T& is selected, 0 when the fallback is
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ref<C>().operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
+  static meta_no testFunctor(...); // variadic fallback
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
   * Usage example: \code meta_sqrt<1023>::ret \endcode
   */
@@ -358,33 +455,6 @@ template<typename T, typename U> struct scalar_product_traits
   enum { Defined = 0 };
 };
 
-template<typename T> struct scalar_product_traits<T,T>
-{
-  enum {
-    // Cost = NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef T ReturnType;
-};
-
-template<typename T> struct scalar_product_traits<T,std::complex<T> >
-{
-  enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef std::complex<T> ReturnType;
-};
-
-template<typename T> struct scalar_product_traits<std::complex<T>, T>
-{
-  enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef std::complex<T> ReturnType;
-};
-
 // FIXME quick workaround around current limitation of result_of
 // template<typename Scalar, typename ArgType0, typename ArgType1>
 // struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index a23fab198..86b60f52f 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -8,17 +8,20 @@
     #pragma warning pop
   #elif defined __clang__
     #pragma clang diagnostic pop
+  #elif defined __GNUC__ && __GNUC__>=6
+    #pragma GCC diagnostic pop
   #endif
 
   #if defined __NVCC__
 //    Don't reenable the diagnostic messages, as it turns out these messages need
 //    to be disabled at the point of the template instantiation (i.e the user code)
-//    otherwise they'll be triggeredby nvcc.
+//    otherwise they'll be triggered by nvcc.
 //    #pragma diag_default code_is_unreachable
 //    #pragma diag_default initialization_not_reachable
 //    #pragma diag_default 2651
 //    #pragma diag_default 2653
   #endif
+
 #endif
 
 #endif // EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index afae2e51e..4fd8891c6 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -26,7 +26,7 @@
 
 #ifndef EIGEN_NO_STATIC_ASSERT
 
-  #if __has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)
+  #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600))
 
     // if native static_assert is enabled, let's use it
     #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
@@ -98,7 +98,9 @@
         EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE,
         THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS,
         MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY,
-        THIS_TYPE_IS_NOT_SUPPORTED
+        THIS_TYPE_IS_NOT_SUPPORTED,
+        STORAGE_KIND_MUST_MATCH,
+        STORAGE_INDEX_MUST_MATCH
       };
     };
 
@@ -165,7 +167,7 @@
 
 #define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
      ( \
-        (int(internal::size_of_xpr_at_compile_time<TYPE0>::ret)==0 && int(internal::size_of_xpr_at_compile_time<TYPE1>::ret)==0) \
+        (int(Eigen::internal::size_of_xpr_at_compile_time<TYPE0>::ret)==0 && int(Eigen::internal::size_of_xpr_at_compile_time<TYPE1>::ret)==0) \
     || (\
           (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \
         || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \
@@ -192,16 +194,16 @@
                           THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
 
 #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
-      EIGEN_STATIC_ASSERT(internal::is_lvalue<Derived>::value, \
+      EIGEN_STATIC_ASSERT(Eigen::internal::is_lvalue<Derived>::value, \
                           THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY)
 
 #define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value), \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived>::XprKind, ArrayXpr>::value), \
                           THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES)
 
 #define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived1>::XprKind, \
-                                             typename internal::traits<Derived2>::XprKind \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived1>::XprKind, \
+                                             typename Eigen::internal::traits<Derived2>::XprKind \
                                             >::value), \
                           YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
 
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index a001c473a..088a65240 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -24,16 +24,6 @@
 
 namespace Eigen {
 
-typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
-
-/**
- * \brief The Index type as used for the API.
- * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
- * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
- */
-
-typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
-
 namespace internal {
 
 template<typename IndexDest, typename IndexSrc>
@@ -45,6 +35,56 @@ inline IndexDest convert_index(const IndexSrc& idx) {
 }
 
 
+// promote_scalar_arg is a helper used in operations between an expression and a scalar, like:
+//    expression * scalar
+// Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
+// The IsSupported template parameter must be provided by the caller as: internal::has_ReturnType<ScalarBinaryOpTraits<ExprScalar,T,op> >::value using the proper order for ExprScalar and T.
+// Then the logic is as follows:
+//  - if the operation is natively supported as defined by IsSupported, then the scalar type is not promoted, and T is returned.
+//  - otherwise, NumTraits<ExprScalar>::Literal is returned if T is implicitly convertible to NumTraits<ExprScalar>::Literal AND this does not imply a float-to-integer conversion.
+//  - otherwise, ExprScalar is returned if T is implicitly convertible to ExprScalar AND this does not imply a float-to-integer conversion.
+//  - In all other cases, the promoted type is not defined, and the respective operation is thus invalid and not available (SFINAE).
+template<typename ExprScalar,typename T, bool IsSupported>
+struct promote_scalar_arg;
+
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,true>
+{
+  typedef T type;
+};
+
+// Recursively check safe conversion to PromotedType, and then ExprScalar if they are different.
+template<typename ExprScalar,typename T,typename PromotedType,
+  bool ConvertibleToLiteral = internal::is_convertible<T,PromotedType>::value,
+  bool IsSafe = NumTraits<T>::IsInteger || !NumTraits<PromotedType>::IsInteger>
+struct promote_scalar_arg_unsupported;
+
+// Start recursion with NumTraits<ExprScalar>::Literal
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,false> : promote_scalar_arg_unsupported<S,T,typename NumTraits<S>::Literal> {};
+
+// We found a match!
+template<typename S,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,true,true>
+{
+  typedef PromotedType type;
+};
+
+// No match, but no real-to-integer issues, and ExprScalar and current PromotedType are different,
+// so let's try to promote to ExprScalar
+template<typename ExprScalar,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<ExprScalar,T,PromotedType,false,true>
+   : promote_scalar_arg_unsupported<ExprScalar,T,ExprScalar>
+{};
+
+// Unsafe real-to-integer, let's stop.
+template<typename S,typename T, typename PromotedType, bool ConvertibleToLiteral>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,ConvertibleToLiteral,false> {};
+
+// T is not even convertible to ExprScalar, let's stop.
+template<typename S,typename T>
+struct promote_scalar_arg_unsupported<S,T,S,false,true> {};
+
 //classes inheriting no_assignment_operator don't generate a default operator=.
 class no_assignment_operator
 {
@@ -67,9 +107,9 @@ template<typename T, int Value> class variable_if_dynamic
 {
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC void setValue(T) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamic<T, Dynamic>
@@ -77,9 +117,9 @@ template<typename T> class variable_if_dynamic<T, Dynamic>
     T m_value;
     EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); }
   public:
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamic(T value) : m_value(value) {}
-    EIGEN_DEVICE_FUNC T value() const { return m_value; }
-    EIGEN_DEVICE_FUNC void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
 /** \internal like variable_if_dynamic but for DynamicIndex
@@ -88,9 +128,9 @@ template<typename T, int Value> class variable_if_dynamicindex
 {
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC void setValue(T) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
@@ -98,9 +138,9 @@ template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
     T m_value;
     EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); }
   public:
-    EIGEN_DEVICE_FUNC explicit variable_if_dynamicindex(T value) : m_value(value) {}
-    EIGEN_DEVICE_FUNC T value() const { return m_value; }
-    EIGEN_DEVICE_FUNC void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; } // returns the stored run-time value
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
 template<typename T> struct functor_traits
@@ -450,52 +490,6 @@ struct generic_xpr_base<Derived, XprKind, Dense>
   typedef typename dense_xpr_base<Derived,XprKind>::type type;
 };
 
-/** \internal Helper base class to add a scalar multiple operator
-  * overloads for complex types */
-template<typename Derived, typename Scalar, typename OtherScalar, typename BaseType,
-         bool EnableIt = !is_same<Scalar,OtherScalar>::value >
-struct special_scalar_op_base : public BaseType
-{
-  // dummy operator* so that the
-  // "using special_scalar_op_base::operator*" compiles
-  struct dummy {};
-  void operator*(dummy) const;
-  void operator/(dummy) const;
-};
-
-template<typename Derived,typename Scalar,typename OtherScalar, typename BaseType>
-struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true>  : public BaseType
-{
-  const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
-  operator*(const OtherScalar& scalar) const
-  {
-#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
-    EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
-#endif
-    return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
-      (*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
-  }
-
-  inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
-  operator*(const OtherScalar& scalar, const Derived& matrix)
-  {
-#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
-    EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
-#endif
-    return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar);
-  }
-  
-  const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived>
-  operator/(const OtherScalar& scalar) const
-  {
-#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
-    EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
-#endif
-    return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived>
-      (*static_cast<const Derived*>(this), scalar_quotient2_op<Scalar,OtherScalar>(scalar));
-  }
-};
-
 template<typename XprType, typename CastType> struct cast_return_type
 {
   typedef typename XprType::Scalar CurrentScalarType;
@@ -622,6 +616,20 @@ struct plain_diag_type
   >::type type;
 };
 
+template<typename Expr,typename Scalar = typename Expr::Scalar>
+struct plain_constant_type // builds the type of a constant nullary expression shaped like Expr, with scalar type Scalar
+{
+  enum { Options = (traits<Expr>::Flags&RowMajorBit)?RowMajor:0 }; // RowMajor when Expr's flags have RowMajorBit set, 0 otherwise
+
+  typedef Array<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> array_type; // plain Array with Expr's compile-time sizes
+
+  typedef Matrix<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                 Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> matrix_type; // plain Matrix with Expr's compile-time sizes
+
+  typedef CwiseNullaryOp<scalar_constant_op<Scalar>, const typename conditional<is_same< typename traits<Expr>::XprKind, MatrixXpr >::value, matrix_type, array_type>::type > type; // matrix_type if Expr's XprKind is MatrixXpr, array_type otherwise
+};
+
 template<typename ExpressionType>
 struct is_lvalue
 {
@@ -656,10 +664,27 @@ bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_acces
   return false;
 }
 
-template<typename T, typename U> struct is_same_or_void { enum { value = is_same<T,U>::value }; };
-template<typename T> struct is_same_or_void<void,T>     { enum { value = 1 }; };
-template<typename T> struct is_same_or_void<T,void>     { enum { value = 1 }; };
-template<>           struct is_same_or_void<void,void>  { enum { value = 1 }; };
+// Internal helper defining the cost of a scalar division for the type T; the default heuristic is 8 times NumTraits<T>::MulCost.
+// It can be specialized for each scalar type and architecture; the last parameter (void by default) lets specializations be enabled conditionally.
+template<typename T,bool Vectorized=false,typename EnableIf = void>
+struct scalar_div_cost {
+  enum { value = 8*NumTraits<T>::MulCost };
+};
+
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
+
+template<bool Vectorized>
+struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
+template<bool Vectorized>
+struct scalar_div_cost<unsigned long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 21 }; };
+
 
 #ifdef EIGEN_DEBUG_ASSIGN
 std::string demangle_traversal(int t)
@@ -695,17 +720,95 @@ std::string demangle_flags(int f)
 
 } // end namespace internal
 
-// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
-// that would take two operands of different types. If there were such an example, then this check should be
-// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
-// currently they take only one typename Scalar template parameter.
+
+/** \class ScalarBinaryOpTraits
+  * \ingroup Core_Module
+  *
+  * \brief Determines whether the given binary operation of two numeric types is allowed and what the scalar return type is.
+  *
+  * This class makes it possible to control the scalar return type of any binary operation performed on two different scalar types through (partial) template specializations.
+  *
+  * For instance, let \c U1, \c U2 and \c U3 be three user-defined scalar types for which most operations between instances of \c U1 and \c U2 return a \c U3.
+  * You can let %Eigen know that by defining:
+    \code
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U1,U2,BinaryOp> { typedef U3 ReturnType;  };
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U2,U1,BinaryOp> { typedef U3 ReturnType;  };
+    \endcode
+  * You can then explicitly disable some particular operations to get more explicit error messages:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_max_op<U1,U2> > {};
+    \endcode
+  * Or customize the return type for an individual operation:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_sum_op<U1,U2> > { typedef U1 ReturnType; };
+    \endcode
+  *
+  * By default, the following generic combinations are supported:
+  <table class="manual">
+  <tr><th>ScalarA</th><th>ScalarB</th><th>BinaryOp</th><th>ReturnType</th><th>Note</th></tr>
+  <tr            ><td>\c T </td><td>\c T </td><td>\c * </td><td>\c T </td><td></td></tr>
+  <tr class="alt"><td>\c NumTraits<T>::Real </td><td>\c T </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  <tr            ><td>\c T </td><td>\c NumTraits<T>::Real </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  </table>
+  *
+  * \sa CwiseBinaryOp
+  */
+template<typename ScalarA, typename ScalarB, typename BinaryOp=internal::scalar_product_op<ScalarA,ScalarB> >
+struct ScalarBinaryOpTraits
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  // for backward compatibility, use the hints given by the (deprecated) internal::scalar_product_traits class.
+  : internal::scalar_product_traits<ScalarA,ScalarB>
+#endif // EIGEN_PARSED_BY_DOXYGEN
+{};
+
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T, typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, BinaryOp>
+{
+  typedef T ReturnType;
+};
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, T, BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Matrix * Permutation
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,void,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Permutation * Matrix
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<void,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// for Permutation*Permutation
+template<typename BinaryOp>
+struct ScalarBinaryOpTraits<void,void,BinaryOp>
+{
+  typedef void ReturnType;
+};
+
+// We require Lhs and Rhs to have "compatible" scalar types.
 // It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
 // So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
 // add together a float matrix and a double matrix.
 #define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
-  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
-                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
-                        : int(internal::is_same_or_void<LHS, RHS>::value)), \
+  EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS,BINOP> >::value), \
     YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
     
 } // end namespace Eigen
diff --git a/Eigen/src/Eigenvalues/CMakeLists.txt b/Eigen/src/Eigenvalues/CMakeLists.txt
deleted file mode 100644
index 193e02685..000000000
--- a/Eigen/src/Eigenvalues/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_EIGENVALUES_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_EIGENVALUES_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Eigenvalues COMPONENT Devel
-  )
diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
similarity index 69%
rename from Eigen/src/Eigenvalues/ComplexSchur_MKL.h
rename to Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
index e20c3725b..4980a3ede 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
@@ -25,21 +25,19 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Complex Schur needed to complex unsymmetrical eigenvalues/eigenvectors.
  ********************************************************************************
 */
 
-#ifndef EIGEN_COMPLEX_SCHUR_MKL_H
-#define EIGEN_COMPLEX_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_COMPLEX_SCHUR_LAPACKE_H
+#define EIGEN_COMPLEX_SCHUR_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_SCHUR_COMPLEX(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
 template<> template<typename InputType> inline \
 ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
 ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
@@ -60,18 +58,18 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Eigen
       m_matUisUptodate = computeU; \
       return *this; \
   } \
-  lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
   char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT1 select = 0; \
   jobvs = (computeU) ? 'V' : 'N'; \
   m_matU.resize(n, n); \
-  lapack_int ldvs  = m_matU.outerStride(); \
+  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
   m_matT = matrix; \
-  lapack_int lda = m_matT.outerStride(); \
+  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
   Matrix<EIGTYPE, Dynamic, Dynamic> w; \
   w.resize(n, 1);\
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)w.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
   if(info == 0) \
     m_info = Success; \
   else \
@@ -83,11 +81,11 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Eigen
 \
 }
 
-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float,  c, C, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_SCHUR_MKL_H
+#endif // EIGEN_COMPLEX_SCHUR_LAPACKE_H
diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index 532ca7d63..f205b185d 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -324,11 +324,12 @@ template<typename MatrixType>
 MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const
 {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
   Index n = m_eivalues.rows();
   MatrixType matD = MatrixType::Zero(n,n);
   for (Index i=0; i<n; ++i)
   {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i))))
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i)), precision))
       matD.coeffRef(i,i) = numext::real(m_eivalues.coeff(i));
     else
     {
@@ -345,11 +346,12 @@ typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eige
 {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
   eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
   Index n = m_eivec.cols();
   EigenvectorsType matV(n,n);
   for (Index j=0; j<n; ++j)
   {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j))) || j+1==n)
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j)), precision) || j+1==n)
     {
       // we have a real eigen value
       matV.col(j) = m_eivec.col(j).template cast<ComplexScalar>();
@@ -451,26 +453,6 @@ EigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool comput
   return *this;
 }
 
-// Complex scalar division.
-template<typename Scalar>
-std::complex<Scalar> cdiv(const Scalar& xr, const Scalar& xi, const Scalar& yr, const Scalar& yi)
-{
-  using std::abs;
-  Scalar r,d;
-  if (abs(yr) > abs(yi))
-  {
-      r = yi/yr;
-      d = yr + r*yi;
-      return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d);
-  }
-  else
-  {
-      r = yr/yi;
-      d = yi + r*yr;
-      return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d);
-  }
-}
-
 
 template<typename MatrixType>
 void EigenSolver<MatrixType>::doComputeEigenvectors()
@@ -503,7 +485,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
       Scalar lastr(0), lastw(0);
       Index l = n;
 
-      m_matT.coeffRef(n,n) = 1.0;
+      m_matT.coeffRef(n,n) = Scalar(1);
       for (Index i = n-1; i >= 0; i--)
       {
         Scalar w = m_matT.coeff(i,i) - p;
@@ -557,7 +539,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
       }
       else
       {
-        std::complex<Scalar> cc = cdiv<Scalar>(Scalar(0),-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
+        ComplexScalar cc = ComplexScalar(Scalar(0),-m_matT.coeff(n-1,n)) / ComplexScalar(m_matT.coeff(n-1,n-1)-p,q);
         m_matT.coeffRef(n-1,n-1) = numext::real(cc);
         m_matT.coeffRef(n-1,n) = numext::imag(cc);
       }
@@ -580,7 +562,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
           l = i;
           if (m_eivalues.coeff(i).imag() == RealScalar(0))
           {
-            std::complex<Scalar> cc = cdiv(-ra,-sa,w,q);
+            ComplexScalar cc = ComplexScalar(-ra,-sa) / ComplexScalar(w,q);
             m_matT.coeffRef(i,n-1) = numext::real(cc);
             m_matT.coeffRef(i,n) = numext::imag(cc);
           }
@@ -594,7 +576,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
             if ((vr == Scalar(0)) && (vi == Scalar(0)))
               vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));
 
-            std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
+            ComplexScalar cc = ComplexScalar(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra) / ComplexScalar(vr,vi);
             m_matT.coeffRef(i,n-1) = numext::real(cc);
             m_matT.coeffRef(i,n) = numext::imag(cc);
             if (abs(x) > (abs(lastw) + abs(q)))
@@ -604,7 +586,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
             }
             else
             {
-              cc = cdiv(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n),lastw,q);
+              cc = ComplexScalar(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n)) / ComplexScalar(lastw,q);
               m_matT.coeffRef(i+1,n-1) = numext::real(cc);
               m_matT.coeffRef(i+1,n) = numext::imag(cc);
             }
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index a9d6790d5..36a91dffc 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -1,8 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010,2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2016 Tobias Wood <tobias@spinicist.org.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -89,7 +90,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       */
     typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> VectorType;
 
-    /** \brief Type for vector of complex scalar values eigenvalues as returned by betas().
+    /** \brief Type for vector of complex scalar values eigenvalues as returned by alphas().
       *
       * This is a column vector with entries of type #ComplexScalar.
       * The length of the vector is the size of #MatrixType.
@@ -114,7 +115,14 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       *
       * \sa compute() for an example.
       */
-    GeneralizedEigenSolver() : m_eivec(), m_alphas(), m_betas(), m_isInitialized(false), m_realQZ(), m_matS(), m_tmp() {}
+    GeneralizedEigenSolver()
+      : m_eivec(),
+        m_alphas(),
+        m_betas(),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
+        m_realQZ()
+    {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -126,10 +134,9 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       : m_eivec(size, size),
         m_alphas(size),
         m_betas(size),
-        m_isInitialized(false),
-        m_eigenvectorsOk(false),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
         m_realQZ(size),
-        m_matS(size, size),
         m_tmp(size)
     {}
 
@@ -149,10 +156,9 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       : m_eivec(A.rows(), A.cols()),
         m_alphas(A.cols()),
         m_betas(A.cols()),
-        m_isInitialized(false),
-        m_eigenvectorsOk(false),
+        m_valuesOkay(false),
+        m_vectorsOkay(false),
         m_realQZ(A.cols()),
-        m_matS(A.rows(), A.cols()),
         m_tmp(A.cols())
     {
       compute(A, B, computeEigenvectors);
@@ -160,22 +166,20 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 
     /* \brief Returns the computed generalized eigenvectors.
       *
-      * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
+      * \returns  %Matrix whose columns are the (possibly complex) right eigenvectors.
+      * i.e. the eigenvectors that solve (A - l*B)x = 0. The ordering matches the eigenvalues.
       *
       * \pre Either the constructor 
       * GeneralizedEigenSolver(const MatrixType&,const MatrixType&, bool) or the member function
       * compute(const MatrixType&, const MatrixType& bool) has been called before, and
       * \p computeEigenvectors was set to true (the default).
       *
-      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
-      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
-      * eigenvectors are normalized to have (Euclidean) norm equal to one. The
-      * matrix returned by this function is the matrix \f$ V \f$ in the
-      * generalized eigendecomposition \f$ A = B V D V^{-1} \f$, if it exists.
-      *
       * \sa eigenvalues()
       */
-//    EigenvectorsType eigenvectors() const;
+    EigenvectorsType eigenvectors() const {
+      eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated.");
+      return m_eivec;
+    }
 
     /** \brief Returns an expression of the computed generalized eigenvalues.
       *
@@ -197,7 +201,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       */
     EigenvalueType eigenvalues() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return EigenvalueType(m_alphas,m_betas);
     }
 
@@ -208,7 +212,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       * \sa betas(), eigenvalues() */
     ComplexVectorType alphas() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return m_alphas;
     }
 
@@ -219,7 +223,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       * \sa alphas(), eigenvalues() */
     VectorType betas() const
     {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
+      eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
       return m_betas;
     }
 
@@ -250,7 +254,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 
     ComputationInfo info() const
     {
-      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+      // Not gated on m_valuesOkay (set only on success) so a failed compute() can still report NoConvergence. NOTE(review): presumably RealQZ::info() asserts initialization itself - confirm.
       return m_realQZ.info();
     }
 
@@ -270,29 +274,14 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
     }
     
-    MatrixType m_eivec;
+    EigenvectorsType m_eivec;
     ComplexVectorType m_alphas;
     VectorType m_betas;
-    bool m_isInitialized;
-    bool m_eigenvectorsOk;
+    bool m_valuesOkay, m_vectorsOkay;
     RealQZ<MatrixType> m_realQZ;
-    MatrixType m_matS;
-
-    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
-    ColumnVectorType m_tmp;
+    ComplexVectorType m_tmp;
 };
 
-//template<typename MatrixType>
-//typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType GeneralizedEigenSolver<MatrixType>::eigenvectors() const
-//{
-//  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-//  eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-//  Index n = m_eivec.cols();
-//  EigenvectorsType matV(n,n);
-//  // TODO
-//  return matV;
-//}
-
 template<typename MatrixType>
 GeneralizedEigenSolver<MatrixType>&
 GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
@@ -302,46 +291,126 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
   using std::sqrt;
   using std::abs;
   eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
-
+  Index size = A.cols();
+  m_valuesOkay = false;
+  m_vectorsOkay = false;
   // Reduce to generalized real Schur form:
   // A = Q S Z and B = Q T Z
   m_realQZ.compute(A, B, computeEigenvectors);
-
   if (m_realQZ.info() == Success)
   {
-    m_matS = m_realQZ.matrixS();
+    // Resize storage
+    m_alphas.resize(size);
+    m_betas.resize(size);
     if (computeEigenvectors)
-      m_eivec = m_realQZ.matrixZ().transpose();
-  
-    // Compute eigenvalues from matS
-    m_alphas.resize(A.cols());
-    m_betas.resize(A.cols());
-    Index i = 0;
-    while (i < A.cols())
     {
-      if (i == A.cols() - 1 || m_matS.coeff(i+1, i) == Scalar(0))
+      m_eivec.resize(size,size);
+      m_tmp.resize(size);
+    }
+
+    // Aliases:
+    Map<VectorType> v(reinterpret_cast<Scalar*>(m_tmp.data()), size);
+    ComplexVectorType &cv = m_tmp;
+    const MatrixType &mZ = m_realQZ.matrixZ();
+    const MatrixType &mS = m_realQZ.matrixS();
+    const MatrixType &mT = m_realQZ.matrixT();
+
+    Index i = 0;
+    while (i < size)
+    {
+      if (i == size - 1 || mS.coeff(i+1, i) == Scalar(0))
       {
-        m_alphas.coeffRef(i) = m_matS.coeff(i, i);
-        m_betas.coeffRef(i)  = m_realQZ.matrixT().coeff(i,i);
+        // Real eigenvalue
+        m_alphas.coeffRef(i) = mS.diagonal().coeff(i);
+        m_betas.coeffRef(i)  = mT.diagonal().coeff(i);
+        if (computeEigenvectors)
+        {
+          v.setConstant(Scalar(0.0));
+          v.coeffRef(i) = Scalar(1.0);
+          // For singular eigenvalues do nothing more
+          if(abs(m_betas.coeffRef(i)) >= (std::numeric_limits<RealScalar>::min)())
+          {
+            // Non-singular eigenvalue
+            const Scalar alpha = real(m_alphas.coeffRef(i));
+            const Scalar beta = m_betas.coeffRef(i);
+            for (Index j = i-1; j >= 0; j--)
+            {
+              const Index st = j+1;
+              const Index sz = i-j;
+              if (j > 0 && mS.coeff(j, j-1) != Scalar(0))
+              {
+                // 2x2 block
+                Matrix<Scalar, 2, 1> rhs = (alpha*mT.template block<2,Dynamic>(j-1,st,2,sz) - beta*mS.template block<2,Dynamic>(j-1,st,2,sz)) .lazyProduct( v.segment(st,sz) );
+                Matrix<Scalar, 2, 2> lhs = beta * mS.template block<2,2>(j-1,j-1) - alpha * mT.template block<2,2>(j-1,j-1);
+                v.template segment<2>(j-1) = lhs.partialPivLu().solve(rhs);
+                j--;
+              }
+              else
+              {
+                v.coeffRef(j) = -v.segment(st,sz).transpose().cwiseProduct(beta*mS.block(j,st,1,sz) - alpha*mT.block(j,st,1,sz)).sum() / (beta*mS.coeffRef(j,j) - alpha*mT.coeffRef(j,j));
+              }
+            }
+          }
+          m_eivec.col(i).real().noalias() = mZ.transpose() * v;
+          m_eivec.col(i).real().normalize();
+          m_eivec.col(i).imag().setConstant(0);
+        }
         ++i;
       }
       else
       {
-        Scalar p = Scalar(0.5) * (m_matS.coeff(i, i) - m_matS.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matS.coeff(i+1, i) * m_matS.coeff(i, i+1)));
-        m_alphas.coeffRef(i)   = ComplexScalar(m_matS.coeff(i+1, i+1) + p, z);
-        m_alphas.coeffRef(i+1) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, -z);
+        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a positive diagonal 2x2 block T
+        // Then taking beta=T_00*T_11, we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00):
 
-        m_betas.coeffRef(i)   = m_realQZ.matrixT().coeff(i,i);
-        m_betas.coeffRef(i+1) = m_realQZ.matrixT().coeff(i,i);
+        // T =  [a 0]
+        //      [0 b]
+        RealScalar a = mT.diagonal().coeff(i),
+                   b = mT.diagonal().coeff(i+1);
+        const RealScalar beta = m_betas.coeffRef(i) = m_betas.coeffRef(i+1) = a*b;
+
+        // ^^ NOTE: using diagonal().coeff(i) instead of coeff(i,i) works around an MSVC bug.
+        Matrix<RealScalar,2,2> S2 = mS.template block<2,2>(i,i) * Matrix<Scalar,2,1>(b,a).asDiagonal();
+
+        Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1));
+        Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1)));
+        const ComplexScalar alpha = ComplexScalar(S2.coeff(1,1) + p, (beta > 0) ? z : -z);
+        m_alphas.coeffRef(i)   = conj(alpha);
+        m_alphas.coeffRef(i+1) = alpha;
+
+        if (computeEigenvectors) {
+          // Compute eigenvector in position (i+1) and then position (i) is just the conjugate
+          cv.setZero();
+          cv.coeffRef(i+1) = Scalar(1.0);
+          // Here, the static_cast works around expression-template issues.
+          cv.coeffRef(i) = -(static_cast<Scalar>(beta*mS.coeffRef(i,i+1)) - alpha*mT.coeffRef(i,i+1))
+                          / (static_cast<Scalar>(beta*mS.coeffRef(i,i))   - alpha*mT.coeffRef(i,i));
+          for (Index j = i-1; j >= 0; j--)
+          {
+            const Index st = j+1;
+            const Index sz = i+1-j;
+            if (j > 0 && mS.coeff(j, j-1) != Scalar(0))
+            {
+              // 2x2 block
+              Matrix<ComplexScalar, 2, 1> rhs = (alpha*mT.template block<2,Dynamic>(j-1,st,2,sz) - beta*mS.template block<2,Dynamic>(j-1,st,2,sz)) .lazyProduct( cv.segment(st,sz) );
+              Matrix<ComplexScalar, 2, 2> lhs = beta * mS.template block<2,2>(j-1,j-1) - alpha * mT.template block<2,2>(j-1,j-1);
+              cv.template segment<2>(j-1) = lhs.partialPivLu().solve(rhs);
+              j--;
+            } else {
+              cv.coeffRef(j) =  cv.segment(st,sz).transpose().cwiseProduct(beta*mS.block(j,st,1,sz) - alpha*mT.block(j,st,1,sz)).sum()
+                              / (alpha*mT.coeffRef(j,j) - static_cast<Scalar>(beta*mS.coeffRef(j,j)));
+            }
+          }
+          m_eivec.col(i+1).noalias() = (mZ.transpose() * cv);
+          m_eivec.col(i+1).normalize();
+          m_eivec.col(i) = m_eivec.col(i+1).conjugate();
+        }
         i += 2;
       }
     }
+
+    m_valuesOkay = true;
+    m_vectorsOkay = computeEigenvectors;
   }
-
-  m_isInitialized = true;
-  m_eigenvectorsOk = false;//computeEigenvectors;
-
   return *this;
 }
 
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index a62071d42..b3a910dd9 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -552,7 +552,6 @@ namespace Eigen {
       m_T.coeffRef(l,l-1) = Scalar(0.0);
     }
 
-
   template<typename MatrixType>
     RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ)
     {
@@ -616,6 +615,37 @@ namespace Eigen {
       }
       // check if we converged before reaching iterations limit
       m_info = (local_iter<m_maxIters) ? Success : NoConvergence;
+
+      // For each non-triangular 2x2 diagonal block of S,
+      //    reduce the respective 2x2 diagonal block of T to positive diagonal form using 2x2 SVD.
+      // This step is not mandatory for QZ, but it does help further extraction of eigenvalues/eigenvectors,
+      // and is on par with LAPACK/MATLAB QZ.
+      if(m_info==Success)
+      {
+        for(Index i=0; i<dim-1; ++i)
+        {
+          if(m_S.coeff(i+1, i) != Scalar(0))
+          {
+            JacobiRotation<Scalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right);
+
+            // Apply resulting Jacobi rotations
+            m_S.applyOnTheLeft(i,i+1,j_left);
+            m_S.applyOnTheRight(i,i+1,j_right);
+            m_T.applyOnTheLeft(i,i+1,j_left);
+            m_T.applyOnTheRight(i,i+1,j_right);
+            m_T(i+1,i) = m_T(i,i+1) = Scalar(0);
+
+            if(m_computeQZ) {
+              m_Q.applyOnTheRight(i,i+1,j_left.transpose());
+              m_Z.applyOnTheLeft(i,i+1,j_right.transpose());
+            }
+
+            i++;
+          }
+        }
+      }
+
       return *this;
     } // end compute
 
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index f4ded69b6..d6a339f07 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -253,19 +253,25 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
   if (maxIters == -1)
     maxIters = m_maxIterationsPerRow * matrix.rows();
 
+  Scalar scale = matrix.derived().cwiseAbs().maxCoeff(); // largest |coefficient|, used to normalize the input
+  if(scale == Scalar(0)) scale = Scalar(1); // all-zero input: skip scaling, otherwise the division below yields 0/0 = NaN
   // Step 1. Reduce to Hessenberg form
-  m_hess.compute(matrix.derived());
+  m_hess.compute(matrix.derived()/scale);
 
   // Step 2. Reduce to real Schur form  
   computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+
+  m_matT *= scale;
   
   return *this;
 }
 template<typename MatrixType>
 template<typename HessMatrixType, typename OrthMatrixType>
 RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU)
-{  
-  m_matT = matrixH; 
+{
+  using std::abs;
+
+  m_matT = matrixH;
   if(computeU)
     m_matU = matrixQ;
   
diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
similarity index 67%
rename from Eigen/src/Eigenvalues/RealSchur_MKL.h
rename to Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
index e80926400..2c2251715 100644
--- a/Eigen/src/Eigenvalues/RealSchur_MKL.h
+++ b/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
@@ -25,39 +25,37 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Real Schur needed to real unsymmetrical eigenvalues/eigenvectors.
  ********************************************************************************
 */
 
-#ifndef EIGEN_REAL_SCHUR_MKL_H
-#define EIGEN_REAL_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_REAL_SCHUR_LAPACKE_H
+#define EIGEN_REAL_SCHUR_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_SCHUR_REAL(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
 template<> template<typename InputType> inline \
 RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
 RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
 { \
   eigen_assert(matrix.cols() == matrix.rows()); \
 \
-  lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
   char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \
+  LAPACK_##LAPACKE_PREFIX_U##_SELECT2 select = 0; \
   jobvs = (computeU) ? 'V' : 'N'; \
   m_matU.resize(n, n); \
-  lapack_int ldvs  = m_matU.outerStride(); \
+  lapack_int ldvs  = internal::convert_index<lapack_int>(m_matU.outerStride()); \
   m_matT = matrix; \
-  lapack_int lda = m_matT.outerStride(); \
+  lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride()); \
   Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi; \
   wr.resize(n, 1); wi.resize(n, 1); \
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), (MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
+  info = LAPACKE_##LAPACKE_PREFIX##gees( matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, &sdim, (LAPACKE_TYPE*)wr.data(), (LAPACKE_TYPE*)wi.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs ); \
   if(info == 0) \
     m_info = Success; \
   else \
@@ -69,11 +67,11 @@ RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBas
 \
 }
 
-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_REAL_SCHUR_MKL_H
+#endif // EIGEN_REAL_SCHUR_LAPACKE_H
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 469ea5e4e..a9f56c4f5 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -414,7 +414,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 
   if(n==1)
   {
-    m_eivalues.coeffRef(0,0) = numext::real(matrix(0,0));
+    m_eivalues.coeffRef(0,0) = numext::real(matrix.diagonal()[0]);
     if(computeEigenvectors)
       m_eivec.setOnes(n,n);
     m_info = Success;
@@ -458,7 +458,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
   {
     m_eivec.setIdentity(diag.size(), diag.size());
   }
-  m_info = computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
 
   m_isInitialized = true;
   m_eigenvectorsOk = computeEigenvectors;
@@ -492,15 +492,16 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
   
   typedef typename DiagType::RealScalar RealScalar;
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
   
   while (end>0)
   {
     for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1]))) || abs(subdiag[i]) <= considerAsZero)
+      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1])),precision) || abs(subdiag[i]) <= considerAsZero)
         subdiag[i] = 0;
 
     // find the largest unreduced block
-    while (end>0 && subdiag[end-1]==0)
+    while (end>0 && subdiag[end-1]==RealScalar(0))
     {
       end--;
     }
@@ -568,8 +569,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
     EIGEN_USING_STD_MATH(atan2)
     EIGEN_USING_STD_MATH(cos)
     EIGEN_USING_STD_MATH(sin)
-    const Scalar s_inv3 = Scalar(1.0)/Scalar(3.0);
-    const Scalar s_sqrt3 = sqrt(Scalar(3.0));
+    const Scalar s_inv3 = Scalar(1)/Scalar(3);
+    const Scalar s_sqrt3 = sqrt(Scalar(3));
 
     // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The
     // eigenvalues are the roots to this equation, all guaranteed to be
@@ -739,14 +740,18 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
     EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
   
-    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
-    Scalar scale = mat.cwiseAbs().maxCoeff();
-    scale = numext::maxi(scale,Scalar(1));
-    MatrixType scaledMat = mat / scale;
-    
+    // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar shift = mat.trace() / Scalar(2);
+    MatrixType scaledMat = mat;
+    scaledMat.coeffRef(0,1) = mat.coeff(1,0);
+    scaledMat.diagonal().array() -= shift;
+    Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+    if(scale > Scalar(0))
+      scaledMat /= scale;
+
     // Compute the eigenvalues
     computeRoots(scaledMat,eivals);
-    
+
     // compute the eigen vectors
     if(computeEigenvectors)
     {
@@ -774,10 +779,11 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
         eivecs.col(0) << eivecs.col(1).unitOrthogonal();
       }
     }
-    
+
     // Rescale back to the original size.
     eivals *= scale;
-    
+    eivals.array() += shift;
+
     solver.m_info = Success;
     solver.m_isInitialized = true;
     solver.m_eigenvectorsOk = computeEigenvectors;
@@ -809,14 +815,14 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta
 //   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
   // This explain the following, somewhat more complicated, version:
   RealScalar mu = diag[end];
-  if(td==0)
+  if(td==RealScalar(0))
     mu -= abs(e);
   else
   {
     RealScalar e2 = numext::abs2(subdiag[end-1]);
     RealScalar h = numext::hypot(td,e);
-    if(e2==0)  mu -= (e / (td + (td>0 ? 1 : -1))) * (e / h);
-    else       mu -= e2 / (td + (td>0 ? h : -h));
+    if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h);
+    else                  mu -= e2 / (td + (td>RealScalar(0) ? h : -h));
   }
   
   RealScalar x = diag[start] - mu;
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
similarity index 66%
rename from Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
rename to Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
index 3499dc78a..3891cf883 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
@@ -25,21 +25,19 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Self-adjoint eigenvalues/eigenvectors.
  ********************************************************************************
 */
 
-#ifndef EIGEN_SAEIGENSOLVER_MKL_H
-#define EIGEN_SAEIGENSOLVER_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_SAEIGENSOLVER_LAPACKE_H
+#define EIGEN_SAEIGENSOLVER_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \
+#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \
 template<> template<typename InputType> inline \
 SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
 SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, int options) \
@@ -49,7 +47,7 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
           && (options&EigVecMask)!=EigVecMask \
           && "invalid option parameter"); \
   bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \
-  lapack_int n = matrix.cols(), lda, matrix_order, info; \
+  lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, matrix_order, info; \
   m_eivalues.resize(n,1); \
   m_subdiag.resize(n-1); \
   m_eivec = matrix; \
@@ -64,12 +62,12 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
     return *this; \
   } \
 \
-  lda = m_eivec.outerStride(); \
-  matrix_order=MKLCOLROW; \
+  lda = internal::convert_index<lapack_int>(m_eivec.outerStride()); \
+  matrix_order=LAPACKE_COLROW; \
   char jobz, uplo='L'/*, range='A'*/; \
   jobz = computeEigenvectors ? 'V' : 'N'; \
 \
-  info = LAPACKE_##MKLNAME( matrix_order, jobz, uplo, n, (MKLTYPE*)m_eivec.data(), lda, (MKLRTYPE*)m_eivalues.data() ); \
+  info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
   m_info = (info==0) ? Success : NoConvergence; \
   m_isInitialized = true; \
   m_eigenvectorsOk = computeEigenvectors; \
@@ -77,15 +75,15 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
 }
 
 
-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double,   double,                double, dsyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(float,    float,                 float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)
 
-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double,   double,                double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(float,    float,                 float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 2030b5be1..1d102c17b 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -367,10 +367,10 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
     hCoeffs.tail(n-i-1).noalias() = (matA.bottomRightCorner(remainingSize,remainingSize).template selfadjointView<Lower>()
                                   * (conj(h) * matA.col(i).tail(remainingSize)));
 
-    hCoeffs.tail(n-i-1) += (conj(h)*Scalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
+    hCoeffs.tail(n-i-1) += (conj(h)*RealScalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
 
     matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>()
-      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), -1);
+      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), Scalar(-1));
 
     matA.col(i).coeffRef(i+1) = beta;
     hCoeffs.coeffRef(i) = h;
diff --git a/Eigen/src/Geometry/AlignedBox.h b/Eigen/src/Geometry/AlignedBox.h
index 03f1a11f8..d20d17492 100644
--- a/Eigen/src/Geometry/AlignedBox.h
+++ b/Eigen/src/Geometry/AlignedBox.h
@@ -36,8 +36,9 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   typedef NumTraits<Scalar>                         ScalarTraits;
   typedef Eigen::Index                              Index; ///< \deprecated since Eigen 3.3
   typedef typename ScalarTraits::Real               RealScalar;
-  typedef typename ScalarTraits::NonInteger      NonInteger;
+  typedef typename ScalarTraits::NonInteger         NonInteger;
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1>  VectorType;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> VectorTypeSum;
 
   /** Define constants to name the corners of a 1D, 2D or 3D axis aligned bounding box */
   enum CornerType
@@ -111,16 +112,15 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   inline VectorType& (max)() { return m_max; }
 
   /** \returns the center of the box */
-  inline const CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>,
-                            const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> >
+  inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(VectorTypeSum, RealScalar, quotient)
   center() const
-  { return (m_min+m_max)/2; }
+  { return (m_min+m_max)/RealScalar(2); }
 
   /** \returns the lengths of the sides of the bounding box.
     * Note that this function does not get the same
     * result for integral or floating scalar types: see
     */
-  inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> sizes() const
+  inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar,Scalar>, const VectorType, const VectorType> sizes() const
   { return m_max - m_min; }
 
   /** \returns the volume of the bounding box */
@@ -131,7 +131,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     * if the length of the diagonal is needed: diagonal().norm()
     * will provide it.
     */
-  inline CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> diagonal() const
+  inline CwiseBinaryOp< internal::scalar_difference_op<Scalar,Scalar>, const VectorType, const VectorType> diagonal() const
   { return sizes(); }
 
   /** \returns the vertex of the bounding box at the corner defined by
diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 7fdb8ae83..571062d00 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -158,7 +158,8 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The resulting axis is normalized.
+  *
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   * 
   * This function implicitly normalizes the quaternion \a q.
   */
@@ -167,12 +168,16 @@ template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
   using std::atan2;
+  using std::abs;
   Scalar n = q.vec().norm();
   if(n<NumTraits<Scalar>::epsilon())
     n = q.vec().stableNorm();
-  if (n > Scalar(0))
+
+  if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, q.w());
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
+    if(q.w() < 0)
+      n = -n;
     m_axis  = q.vec() / n;
   }
   else
diff --git a/Eigen/src/Geometry/CMakeLists.txt b/Eigen/src/Geometry/CMakeLists.txt
deleted file mode 100644
index f8f728b84..000000000
--- a/Eigen/src/Geometry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_Geometry_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Geometry_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Geometry COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(arch)
diff --git a/Eigen/src/Geometry/EulerAngles.h b/Eigen/src/Geometry/EulerAngles.h
index b875b7a13..4865e58aa 100644
--- a/Eigen/src/Geometry/EulerAngles.h
+++ b/Eigen/src/Geometry/EulerAngles.h
@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,i), coeff(k,i));
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
     {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
       res[1] = -atan2(s2, coeff(i,i));
     }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,k), coeff(k,k));
     Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       res[1] = atan2(-coeff(i,k), -c2);
     }
     else
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index cd52b5470..a23068c8d 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -329,10 +329,10 @@ protected:
 
 // dense = homogeneous
 template< typename DstXprType, typename ArgType, typename Scalar>
-struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
 {
   typedef Homogeneous<ArgType,Vertical> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
   {
     dst.template topRows<ArgType::RowsAtCompileTime>(src.nestedExpression().rows()) = src.nestedExpression();
     dst.row(dst.rows()-1).setOnes();
@@ -341,10 +341,10 @@ struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op
 
 // dense = homogeneous
 template< typename DstXprType, typename ArgType, typename Scalar>
-struct Assignment<DstXprType, Homogeneous<ArgType,Horizontal>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Horizontal>, internal::assign_op<Scalar,typename ArgType::Scalar>, Dense2Dense>
 {
   typedef Homogeneous<ArgType,Horizontal> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename ArgType::Scalar> &)
   {
     dst.template leftCols<ArgType::ColsAtCompileTime>(src.nestedExpression().cols()) = src.nestedExpression();
     dst.col(dst.cols()-1).setOnes();
@@ -373,7 +373,7 @@ struct homogeneous_right_product_refactoring_helper
   typedef typename Rhs::ConstRowXpr                                     ConstantColumn;
   typedef Replicate<const ConstantColumn,Rows,1>                        ConstantBlock;
   typedef Product<Lhs,LinearBlock,LazyProduct>                          LinearProduct;
-  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar,typename Rhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
 };
 
 template<typename Lhs, typename Rhs, int ProductTag>
@@ -402,6 +402,18 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, Homog
   }
 };
 
+// Product of a triangular-shaped lhs with a vertical Homogeneous rhs.
+// TODO: this specialization addresses a regression from 3.2 to 3.3; in the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval(); // rhs is evaluated via eval() first, then multiplied by lhs
+  }
+};
+
 template<typename Lhs,typename Rhs>
 struct homogeneous_left_product_refactoring_helper
 {
@@ -414,7 +426,7 @@ struct homogeneous_left_product_refactoring_helper
   typedef typename Lhs::ConstColXpr                                     ConstantColumn;
   typedef Replicate<const ConstantColumn,1,Cols>                        ConstantBlock;
   typedef Product<LinearBlock,Rhs,LazyProduct>                          LinearProduct;
-  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar,typename Rhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
 };
 
 template<typename Lhs, typename Rhs, int ProductTag>
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 32d1499c6..c4a0eabb5 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -271,6 +271,8 @@ public:
   explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
+  static Quaternion UnitRandom();
+
   template<typename Derived1, typename Derived2>
   static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
@@ -609,6 +611,24 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   return derived();
 }
 
+/** \returns a random unit quaternion following a uniform distribution law on SO(3)
+  *
+  * \note The implementation is based on http://planning.cs.uiuc.edu/node198.html
+  */
+template<typename Scalar, int Options>
+Quaternion<Scalar,Options> Quaternion<Scalar,Options>::UnitRandom()
+{
+  using std::sqrt;
+  using std::sin;
+  using std::cos;
+  // Explicit Scalar(...) casts: no implicit int/double -> Scalar conversions (narrowing, custom scalar types).
+  const Scalar u1 = internal::random<Scalar>(Scalar(0), Scalar(1)),
+               u2 = internal::random<Scalar>(Scalar(0), Scalar(2*EIGEN_PI)),
+               u3 = internal::random<Scalar>(Scalar(0), Scalar(2*EIGEN_PI));
+  const Scalar a = sqrt(Scalar(1) - u1), b = sqrt(u1); // a*a + b*b == 1, so the four coefficients have unit norm
+  return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
+}
+
 
 /** Returns a quaternion representing a rotation between
   * the two arbitrary vectors \a a and \a b. In other words, the built
@@ -706,7 +726,7 @@ QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerive
   using std::acos;
   using std::sin;
   using std::abs;
-  static const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
+  const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
   Scalar d = this->dot(other);
   Scalar absD = abs(d);
 
diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h
index 5ab0d5920..b42a7df70 100644
--- a/Eigen/src/Geometry/Rotation2D.h
+++ b/Eigen/src/Geometry/Rotation2D.h
@@ -82,15 +82,15 @@ public:
   
   /** \returns the rotation angle in [0,2pi] */
   inline Scalar smallestPositiveAngle() const {
-    Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI);
-    return tmp<Scalar(0) ? tmp + Scalar(2)*EIGEN_PI : tmp;
+    Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+    return tmp<Scalar(0) ? tmp + Scalar(2*EIGEN_PI) : tmp;
   }
   
   /** \returns the rotation angle in [-pi,pi] */
   inline Scalar smallestAngle() const {
-    Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI);
-    if(tmp>Scalar(EIGEN_PI))       tmp -= Scalar(2)*Scalar(EIGEN_PI);
-    else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2)*Scalar(EIGEN_PI);
+    Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+    if(tmp>Scalar(EIGEN_PI))       tmp -= Scalar(2*EIGEN_PI);
+    else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2*EIGEN_PI);
     return tmp;
   }
 
diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h
index 643138199..3e12681b0 100644
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -107,12 +107,15 @@ public:
 /** \addtogroup Geometry_Module */
 //@{
 
-/** Concatenates a linear transformation matrix and a uniform scaling */
+/** Concatenates a linear transformation matrix and a uniform scaling
+  * \relates UniformScaling
+  */
 // NOTE this operator is defiend in MatrixBase and not as a friend function
 // of UniformScaling to fix an internal crash of Intel's ICC
-template<typename Derived> typename MatrixBase<Derived>::ScalarMultipleReturnType
-MatrixBase<Derived>::operator*(const UniformScaling<Scalar>& s) const
-{ return derived() * s.factor(); }
+template<typename Derived,typename Scalar>
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product)
+operator*(const MatrixBase<Derived>& matrix, const UniformScaling<Scalar>& s)
+{ return matrix.derived() * s.factor(); } // free function: the matrix expression times the scaling's factor
 
 /** Constructs a uniform scaling from scale factor \a s */
 static inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 75f20bda6..8f6c62d63 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -32,7 +32,8 @@ template< typename TransformType,
           typename MatrixType,
           int Case = transform_traits<TransformType>::IsProjective ? 0
                    : int(MatrixType::RowsAtCompileTime) == int(transform_traits<TransformType>::HDim) ? 1
-                   : 2>
+                   : 2,
+          int RhsCols = MatrixType::ColsAtCompileTime>
 struct transform_right_product_impl;
 
 template< typename Other,
@@ -192,7 +193,7 @@ template<int Mode> struct transform_make_affine;
   * preprocessor token EIGEN_QT_SUPPORT is defined.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
   *
   * \sa class Matrix, class Quaternion
   */
@@ -436,7 +437,7 @@ public:
     */
   // note: this function is defined here because some compilers cannot find the respective declaration
   template<typename OtherDerived>
-  EIGEN_STRONG_INLINE const typename OtherDerived::PlainObject
+  EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
   operator * (const EigenBase<OtherDerived> &other) const
   { return internal::transform_right_product_impl<Transform, OtherDerived>::run(*this,other.derived()); }
 
@@ -463,7 +464,7 @@ public:
     operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
 
@@ -577,7 +578,7 @@ public:
     return res;
   }
 
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
 
   template<typename Derived>
   inline Transform& operator=(const RotationBase<Derived,Dim>& r);
@@ -852,7 +853,7 @@ Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &oth
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }
 
@@ -1072,7 +1073,7 @@ void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixTy
   }
 }
 
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
+/** decomposes the linear part of the transformation as a product scaling x rotation, the scaling being
   * not necessarily positive.
   *
   * If either pointer is zero, the corresponding computation is skipped.
@@ -1287,8 +1288,8 @@ struct transform_product_result
   };
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 0 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols>
 {
   typedef typename MatrixType::PlainObject ResultType;
 
@@ -1298,8 +1299,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 0 >
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 1 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
 {
   enum { 
     Dim = TransformType::Dim, 
@@ -1324,8 +1325,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 1 >
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 2 >
+template< typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols>
 {
   enum { 
     Dim = TransformType::Dim, 
@@ -1348,6 +1349,30 @@ struct transform_right_product_impl< TransformType, MatrixType, 2 >
   }
 };
 
+template< typename TransformType, typename MatrixType >
+struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // Case 2 with a single-column rhs: a vector of size Dim
+{
+  typedef typename TransformType::MatrixType TransformMatrix;
+  enum {
+    Dim = TransformType::Dim,
+    HDim = TransformType::HDim,
+    OtherRows = MatrixType::RowsAtCompileTime,
+    WorkingRows = EIGEN_PLAIN_ENUM_MIN(TransformMatrix::RowsAtCompileTime,HDim) // row count of the temporary product in run()
+  };
+
+  typedef typename MatrixType::PlainObject ResultType;
+
+  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  {
+    EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+
+    Matrix<typename ResultType::Scalar, Dim+1, 1> rhs; // the input extended by one trailing coefficient
+    rhs.template head<Dim>() = other; rhs[Dim] = typename ResultType::Scalar(1); // copy other, then append 1
+    Matrix<typename ResultType::Scalar, WorkingRows, 1> res(T.matrix() * rhs);
+    return res.template head<Dim>(); // keep only the first Dim coefficients
+  }
+};
+
 /**********************************************************
 ***   Specializations of operator* with lhs EigenBase   ***
 **********************************************************/
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 82d7777f0..b9b9a590c 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -130,8 +130,10 @@ public:
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template<typename Derived>
+  inline typename internal::enable_if<Derived::IsVectorAtCompileTime,VectorType>::type // enabled only for compile-time vectors
+  operator* (const MatrixBase<Derived>& vec) const
+  { return m_coeffs + vec.derived(); } // sum of the stored translation coefficients and the input vector
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
diff --git a/Eigen/src/Geometry/arch/CMakeLists.txt b/Eigen/src/Geometry/arch/CMakeLists.txt
deleted file mode 100644
index 1267a79c7..000000000
--- a/Eigen/src/Geometry/arch/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Geometry_arch_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Geometry_arch_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Geometry/arch COMPONENT Devel
-  )
diff --git a/Eigen/src/Householder/CMakeLists.txt b/Eigen/src/Householder/CMakeLists.txt
deleted file mode 100644
index ce4937db0..000000000
--- a/Eigen/src/Householder/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Householder_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Householder_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Householder COMPONENT Devel
-  )
diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 4c1f499a1..80de2c305 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index e9f3ebf88..3ce0a693d 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -108,7 +108,7 @@ struct hseq_side_dependent_impl<VectorsType, CoeffsType, OnTheRight>
 
 template<typename OtherScalarType, typename MatrixType> struct matrix_type_times_scalar_type
 {
-  typedef typename scalar_product_traits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
+  typedef typename ScalarBinaryOpTraits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
     ResultScalar;
   typedef Matrix<ResultScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
                  0, MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime> Type;
@@ -243,7 +243,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     {
       workspace.resize(rows());
       Index vecs = m_length;
-      if(is_same_dense(dst,m_vectors))
+      if(internal::is_same_dense(dst,m_vectors))
       {
         // in-place
         dst.diagonal().setOnes();
@@ -304,7 +304,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     /** \internal */
     template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
     {
-      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace(dst.cols());
+      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace;
       applyThisOnTheLeft(dst, workspace);
     }
 
diff --git a/Eigen/src/IterativeLinearSolvers/CMakeLists.txt b/Eigen/src/IterativeLinearSolvers/CMakeLists.txt
deleted file mode 100644
index 59ccc0072..000000000
--- a/Eigen/src/IterativeLinearSolvers/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_IterativeLinearSolvers_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_IterativeLinearSolvers_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/IterativeLinearSolvers COMPONENT Devel
-  )
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 35923be3d..0498db396 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -44,6 +44,7 @@ public:
   typedef typename internal::traits<SolveWithGuess>::Scalar Scalar;
   typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
   typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type Base;
+  typedef typename internal::ref_selector<SolveWithGuess>::type Nested;
   
   SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
     : m_dec(dec), m_rhs(rhs), m_guess(guess)
@@ -81,7 +82,8 @@ struct evaluator<SolveWithGuess<Decomposition,RhsType, GuessType> >
     : m_result(solve.rows(), solve.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
-    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result, solve().guess());
+    m_result = solve.guess();
+    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);
   }
   
 protected:  
@@ -91,10 +93,10 @@ protected:
 // Specialization for "dst = dec.solveWithGuess(rhs)"
 // NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
 template<typename DstXprType, typename DecType, typename RhsType, typename GuessType, typename Scalar>
-struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
   typedef SolveWithGuess<DecType,RhsType,GuessType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     // FIXME shall we resize dst here?
     dst = src.guess();
diff --git a/Eigen/src/Jacobi/CMakeLists.txt b/Eigen/src/Jacobi/CMakeLists.txt
deleted file mode 100644
index 490dac626..000000000
--- a/Eigen/src/Jacobi/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Jacobi_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Jacobi_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Jacobi COMPONENT Devel
-  )
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 55de15e87..d25af8e90 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -85,7 +85,8 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   using std::sqrt;
   using std::abs;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  if(y == Scalar(0))
+  RealScalar deno = RealScalar(2)*abs(y);
+  if(deno < (std::numeric_limits<RealScalar>::min)())
   {
     m_c = Scalar(1);
     m_s = Scalar(0);
@@ -93,7 +94,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   }
   else
   {
-    RealScalar tau = (x-z)/(RealScalar(2)*abs(y));
+    RealScalar tau = (x-z)/deno;
     RealScalar w = sqrt(numext::abs2(tau) + RealScalar(1));
     RealScalar t;
     if(tau>RealScalar(0))
diff --git a/Eigen/src/LU/CMakeLists.txt b/Eigen/src/LU/CMakeLists.txt
deleted file mode 100644
index e0d8d78c1..000000000
--- a/Eigen/src/LU/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_LU_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_LU_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(arch)
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 64b9eb7f1..03b6af706 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -52,6 +52,8 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
   * \include class_FullPivLU.cpp
   * Output: \verbinclude class_FullPivLU.out
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
   */
 template<typename _MatrixType> class FullPivLU
@@ -97,6 +99,15 @@ template<typename _MatrixType> class FullPivLU
     template<typename InputType>
     explicit FullPivLU(const EigenBase<InputType>& matrix);
 
+    /** \brief Constructs a LU factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is an Eigen::Ref.
+      *
+      * \sa FullPivLU(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivLU(EigenBase<InputType>& matrix);
+
     /** Computes the LU decomposition of the given matrix.
       *
       * \param matrix the matrix of which to compute the LU decomposition.
@@ -105,7 +116,11 @@ template<typename _MatrixType> class FullPivLU
       * \returns a reference to *this
       */
     template<typename InputType>
-    FullPivLU& compute(const EigenBase<InputType>& matrix);
+    FullPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      computeInPlace();
+      return *this;
+    }
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -141,7 +156,7 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa permutationQ()
       */
-    inline const PermutationPType& permutationP() const
+    EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       return m_p;
@@ -391,8 +406,8 @@ template<typename _MatrixType> class FullPivLU
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
@@ -418,9 +433,10 @@ template<typename _MatrixType> class FullPivLU
     PermutationQType m_q;
     IntColVectorType m_rowsTranspositions;
     IntRowVectorType m_colsTranspositions;
-    Index m_det_pq, m_nonzero_pivots;
+    Index m_nonzero_pivots;
     RealScalar m_l1_norm;
     RealScalar m_maxpivot, m_prescribedThreshold;
+    signed char m_det_pq;
     bool m_isInitialized, m_usePrescribedThreshold;
 };
 
@@ -458,25 +474,28 @@ FullPivLU<MatrixType>::FullPivLU(const EigenBase<InputType>& matrix)
 
 template<typename MatrixType>
 template<typename InputType>
-FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix)
+FullPivLU<MatrixType>::FullPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_q(matrix.cols()),
+    m_rowsTranspositions(matrix.rows()),
+    m_colsTranspositions(matrix.cols()),
+    m_isInitialized(false),
+    m_usePrescribedThreshold(false)
 {
-  check_template_parameters();
-
-  // the permutations are stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
-
-  m_lu = matrix.derived();
-  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
-
   computeInPlace();
-
-  m_isInitialized = true;
-  return *this;
 }
 
 template<typename MatrixType>
 void FullPivLU<MatrixType>::computeInPlace()
 {
+  check_template_parameters();
+
+  // the permutations are stored as int indices, so just to be sure:
+  eigen_assert(m_lu.rows()<=NumTraits<int>::highest() && m_lu.cols()<=NumTraits<int>::highest());
+
+  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+
   const Index size = m_lu.diagonalSize();
   const Index rows = m_lu.rows();
   const Index cols = m_lu.cols();
@@ -556,6 +575,8 @@ void FullPivLU<MatrixType>::computeInPlace()
     m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
+
+  m_isInitialized = true;
 }
 
 template<typename MatrixType>
@@ -838,12 +859,12 @@ namespace internal {
 
 
 /***** Implementation of inverse() *****************************************************/
-template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename FullPivLU<MatrixType>::Scalar>, Dense2Dense>
 {
   typedef FullPivLU<MatrixType> LuType;
   typedef Inverse<LuType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename MatrixType::Scalar> &)
   {
     dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
@@ -858,14 +879,12 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_
   *
   * \sa class FullPivLU
   */
-#ifndef __CUDACC__
 template<typename Derived>
 inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::fullPivLu() const
 {
   return FullPivLU<PlainObject>(eval());
 }
-#endif
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index e202a55cb..3134632e1 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -286,11 +286,11 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
 namespace internal {
 
 // Specialization for "dense = dense_xpr.inverse()"
-template<typename DstXprType, typename XprType, typename Scalar>
-struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+template<typename DstXprType, typename XprType>
+struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
 {
   typedef Inverse<XprType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
   {
     // FIXME shall we resize dst here?
     const int Size = EIGEN_PLAIN_ENUM_MIN(XprType::ColsAtCompileTime,DstXprType::ColsAtCompileTime);
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 2e6d91939..d43961887 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -26,6 +26,17 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
   };
 };
 
+template<typename T,typename Derived>
+struct enable_if_ref;
+// {
+//   typedef Derived type;
+// };
+
+template<typename T,typename Derived>
+struct enable_if_ref<Ref<T>,Derived> {
+  typedef Derived type;
+};
+
 } // end namespace internal
 
 /** \ingroup LU_Module
@@ -57,6 +68,8 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
   *
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
   */
 template<typename _MatrixType> class PartialPivLU
@@ -102,8 +115,22 @@ template<typename _MatrixType> class PartialPivLU
     template<typename InputType>
     explicit PartialPivLU(const EigenBase<InputType>& matrix);
 
+    /** Constructor for \link InplaceDecomposition inplace decomposition \endlink
+      *
+      * \param matrix the matrix of which to compute the LU decomposition.
+      *
+      * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+      * If you need to deal with non-full rank, use class FullPivLU instead.
+      */
     template<typename InputType>
-    PartialPivLU& compute(const EigenBase<InputType>& matrix);
+    explicit PartialPivLU(EigenBase<InputType>& matrix);
+
+    template<typename InputType>
+    PartialPivLU& compute(const EigenBase<InputType>& matrix) {
+      m_lu = matrix.derived();
+      compute();
+      return *this;
+    }
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -251,11 +278,13 @@ template<typename _MatrixType> class PartialPivLU
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
 
+    void compute();
+
     MatrixType m_lu;
     PermutationType m_p;
     TranspositionType m_rowsTranspositions;
-    Index m_det_p;
     RealScalar m_l1_norm;
+    signed char m_det_p;
     bool m_isInitialized;
 };
 
@@ -264,8 +293,8 @@ PartialPivLU<MatrixType>::PartialPivLU()
   : m_lu(),
     m_p(),
     m_rowsTranspositions(),
-    m_det_p(0),
     m_l1_norm(0),
+    m_det_p(0),
     m_isInitialized(false)
 {
 }
@@ -275,8 +304,8 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size)
   : m_lu(size, size),
     m_p(size),
     m_rowsTranspositions(size),
-    m_det_p(0),
     m_l1_norm(0),
+    m_det_p(0),
     m_isInitialized(false)
 {
 }
@@ -284,16 +313,29 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size)
 template<typename MatrixType>
 template<typename InputType>
 PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
-  : m_lu(matrix.rows(), matrix.rows()),
+  : m_lu(matrix.rows(),matrix.cols()),
     m_p(matrix.rows()),
     m_rowsTranspositions(matrix.rows()),
-    m_det_p(0),
     m_l1_norm(0),
+    m_det_p(0),
     m_isInitialized(false)
 {
   compute(matrix.derived());
 }
 
+template<typename MatrixType>
+template<typename InputType>
+PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
+  : m_lu(matrix.derived()),
+    m_p(matrix.rows()),
+    m_rowsTranspositions(matrix.rows()),
+    m_l1_norm(0),
+    m_det_p(0),
+    m_isInitialized(false)
+{
+  compute();
+}
+
 namespace internal {
 
 /** \internal This is the blocked version of fullpivlu_unblocked() */
@@ -434,7 +476,7 @@ struct partial_lu_impl
       // update permutations and apply them to A_0
       for(Index i=k; i<k+bs; ++i)
       {
-        Index piv = (row_transpositions[i] += k);
+        Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
         A_0.row(i).swap(A_0.row(piv));
       }
 
@@ -470,19 +512,17 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t
 } // end namespace internal
 
 template<typename MatrixType>
-template<typename InputType>
-PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix)
+void PartialPivLU<MatrixType>::compute()
 {
   check_template_parameters();
 
   // the row permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<NumTraits<int>::highest());
+  eigen_assert(m_lu.rows()<NumTraits<int>::highest());
 
-  m_lu = matrix.derived();
   m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
 
-  eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
-  const Index size = matrix.rows();
+  eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
+  const Index size = m_lu.rows();
 
   m_rowsTranspositions.resize(size);
 
@@ -493,7 +533,6 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu
   m_p = m_rowsTranspositions;
 
   m_isInitialized = true;
-  return *this;
 }
 
 template<typename MatrixType>
@@ -525,12 +564,12 @@ MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const
 namespace internal {
 
 /***** Implementation of inverse() *****************************************************/
-template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename PartialPivLU<MatrixType>::Scalar>, Dense2Dense>
 {
   typedef PartialPivLU<MatrixType> LuType;
   typedef Inverse<LuType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename LuType::Scalar> &)
   {
     dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
@@ -545,14 +584,12 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi
   *
   * \sa class PartialPivLU
   */
-#ifndef __CUDACC__
 template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::partialPivLu() const
 {
   return PartialPivLU<PlainObject>(eval());
 }
-#endif
 
 /** \lu_module
   *
@@ -562,14 +599,12 @@ MatrixBase<Derived>::partialPivLu() const
   *
   * \sa class PartialPivLU
   */
-#ifndef __CUDACC__
 template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::lu() const
 {
   return PartialPivLU<PlainObject>(eval());
 }
-#endif
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/LU/PartialPivLU_MKL.h b/Eigen/src/LU/PartialPivLU_LAPACKE.h
similarity index 77%
rename from Eigen/src/LU/PartialPivLU_MKL.h
rename to Eigen/src/LU/PartialPivLU_LAPACKE.h
index 9035953c8..755168a94 100644
--- a/Eigen/src/LU/PartialPivLU_MKL.h
+++ b/Eigen/src/LU/PartialPivLU_LAPACKE.h
@@ -25,7 +25,7 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *     LU decomposition with partial pivoting based on LAPACKE_?getrf function.
  ********************************************************************************
 */
@@ -33,20 +33,18 @@
 #ifndef EIGEN_PARTIALLU_LAPACK_H
 #define EIGEN_PARTIALLU_LAPACK_H
 
-#include "Eigen/src/Core/util/MKL_support.h"
-
 namespace Eigen { 
 
 namespace internal {
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_LU_PARTPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_LAPACKE_LU_PARTPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
 template<int StorageOrder> \
 struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
 { \
   /* \internal performs the LU decomposition in-place of the matrix represented */ \
-  static lapack_int blocked_lu(lapack_int rows, lapack_int cols, EIGTYPE* lu_data, lapack_int luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \
+  static lapack_int blocked_lu(Index rows, Index cols, EIGTYPE* lu_data, Index luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \
   { \
     EIGEN_UNUSED_VARIABLE(maxBlockSize);\
     lapack_int matrix_order, first_zero_pivot; \
@@ -54,14 +52,14 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
     EIGTYPE* a; \
 /* Set up parameters for ?getrf */ \
     matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    lda = luStride; \
+    lda = convert_index<lapack_int>(luStride); \
     a = lu_data; \
     ipiv = row_transpositions; \
-    m = rows; \
-    n = cols; \
+    m = convert_index<lapack_int>(rows); \
+    n = convert_index<lapack_int>(cols); \
     nb_transpositions = 0; \
 \
-    info = LAPACKE_##MKLPREFIX##getrf( matrix_order, m, n, (MKLTYPE*)a, lda, ipiv ); \
+    info = LAPACKE_##LAPACKE_PREFIX##getrf( matrix_order, m, n, (LAPACKE_TYPE*)a, lda, ipiv ); \
 \
     for(int i=0;i<m;i++) { ipiv[i]--; if (ipiv[i]!=i) nb_transpositions++; } \
 \
@@ -73,10 +71,10 @@ struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
   } \
 };
 
-EIGEN_MKL_LU_PARTPIV(double, double, d)
-EIGEN_MKL_LU_PARTPIV(float, float, s)
-EIGEN_MKL_LU_PARTPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LU_PARTPIV(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_LU_PARTPIV(double, double, d)
+EIGEN_LAPACKE_LU_PARTPIV(float, float, s)
+EIGEN_LAPACKE_LU_PARTPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_LU_PARTPIV(scomplex, lapack_complex_float,  c)
 
 } // end namespace internal
 
diff --git a/Eigen/src/LU/arch/CMakeLists.txt b/Eigen/src/LU/arch/CMakeLists.txt
deleted file mode 100644
index f6b7ed9ec..000000000
--- a/Eigen/src/LU/arch/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_LU_arch_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_LU_arch_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU/arch COMPONENT Devel
-  )
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h
index e1470c664..ebb64a62b 100644
--- a/Eigen/src/LU/arch/Inverse_SSE.h
+++ b/Eigen/src/LU/arch/Inverse_SSE.h
@@ -153,10 +153,12 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
     iC = _mm_mul_ps(rd,iC);
     iD = _mm_mul_ps(rd,iD);
 
-    result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77));
-    result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22));
-    result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77));
-    result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22));
+    Index res_stride = result.outerStride();
+    float* res = result.data();
+    pstoret<float, Packet4f, ResultAlignment>(res+0,            _mm_shuffle_ps(iA,iB,0x77));
+    pstoret<float, Packet4f, ResultAlignment>(res+res_stride,   _mm_shuffle_ps(iA,iB,0x22));
+    pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77));
+    pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22));
   }
 
 };
@@ -316,14 +318,16 @@ struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
     iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
     iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
 
-    result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));     // iA# / det
-    result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
-    result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));     // iB# / det
-    result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
-    result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));     // iC# / det
-    result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
-    result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));     // iD# / det
-    result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
+    Index res_stride = result.outerStride();
+    double* res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res+0,             _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+res_stride,    _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2,             _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2,  _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
   }
 };
 
diff --git a/Eigen/src/MetisSupport/CMakeLists.txt b/Eigen/src/MetisSupport/CMakeLists.txt
deleted file mode 100644
index 2bad31416..000000000
--- a/Eigen/src/MetisSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MetisSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_MetisSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/MetisSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/OrderingMethods/CMakeLists.txt b/Eigen/src/OrderingMethods/CMakeLists.txt
deleted file mode 100644
index 9f4bb2758..000000000
--- a/Eigen/src/OrderingMethods/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_OrderingMethods_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_OrderingMethods_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/OrderingMethods COMPONENT Devel
-  )
diff --git a/Eigen/src/PaStiXSupport/CMakeLists.txt b/Eigen/src/PaStiXSupport/CMakeLists.txt
deleted file mode 100644
index 28c657e9b..000000000
--- a/Eigen/src/PaStiXSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_PastixSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_PastixSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/PaStiXSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/PardisoSupport/CMakeLists.txt b/Eigen/src/PardisoSupport/CMakeLists.txt
deleted file mode 100644
index a097ab401..000000000
--- a/Eigen/src/PardisoSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_PardisoSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_PardisoSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/PardisoSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 80d914f25..091c3970e 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -183,7 +183,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     {
       if(m_isInitialized) // Factorization ran at least once
       {
-        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, m_size,0, 0, 0, m_perm.data(), 0,
+        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, internal::convert_index<StorageIndex>(m_size),0, 0, 0, m_perm.data(), 0,
                                                           m_iparm.data(), m_msglvl, NULL, NULL);
         m_isInitialized = false;
       }
@@ -194,11 +194,11 @@ class PardisoImpl : public SparseSolverBase<Derived>
       m_type = type;
       bool symmetric = std::abs(m_type) < 10;
       m_iparm[0] = 1;   // No solver default
-      m_iparm[1] = 3;   // use Metis for the ordering
-      m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
+      m_iparm[1] = 2;   // use Metis for the ordering
+      m_iparm[2] = 0;   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
       m_iparm[3] = 0;   // No iterative-direct algorithm
       m_iparm[4] = 0;   // No user fill-in reducing permutation
-      m_iparm[5] = 0;   // Write solution into x
+      m_iparm[5] = 0;   // Write solution into x, b is left unchanged
       m_iparm[6] = 0;   // Not in use
       m_iparm[7] = 2;   // Max numbers of iterative refinement steps
       m_iparm[8] = 0;   // Not in use
@@ -219,7 +219,8 @@ class PardisoImpl : public SparseSolverBase<Derived>
       m_iparm[26] = 0;  // No matrix checker
       m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
       m_iparm[34] = 1;  // C indexing
-      m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
+      m_iparm[36] = 0;  // CSR
+      m_iparm[59] = 0;  // 0 - In-Core ; 1 - Automatic switch between In-Core and Out-of-Core modes ; 2 - Out-of-Core
       
       memset(m_pt, 0, sizeof(m_pt));
     }
@@ -246,7 +247,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     mutable SparseMatrixType m_matrix;
     mutable ComputationInfo m_info;
     bool m_analysisIsOk, m_factorizationIsOk;
-    Index m_type, m_msglvl;
+    StorageIndex m_type, m_msglvl;
     mutable void *m_pt[64];
     mutable ParameterType m_iparm;
     mutable IntColVectorType m_perm;
@@ -265,10 +266,9 @@ Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
-
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = true;
@@ -287,7 +287,7 @@ Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
@@ -306,8 +306,8 @@ Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
   
   derived().getMatrix(a);
 
-  Index error;  
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, m_size,
+  Index error;
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
@@ -354,9 +354,9 @@ void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase
   }
   
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                            m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
+                                                            m_perm.data(), internal::convert_index<StorageIndex>(nrhs), m_iparm.data(), m_msglvl,
                                                             rhs_ptr, x.derived().data());
 
   manageErrorCode(error);
@@ -371,6 +371,9 @@ void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase
   * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
   * \implsparsesolverconcept
@@ -421,6 +424,9 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
   * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
@@ -480,6 +486,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
   * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used.
   *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.
diff --git a/Eigen/src/QR/CMakeLists.txt b/Eigen/src/QR/CMakeLists.txt
deleted file mode 100644
index 96f43d7f5..000000000
--- a/Eigen/src/QR/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_QR_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_QR_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/QR COMPONENT Devel
-  )
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 7c559f952..9650781d6 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -41,6 +41,8 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
   * This decomposition performs column pivoting in order to be rank-revealing and improve
   * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::colPivHouseholderQr()
   */
 template<typename _MatrixType> class ColPivHouseholderQR
@@ -51,7 +53,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
@@ -59,7 +60,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
     typedef typename MatrixType::RealScalar RealScalar;
     // FIXME should be int
     typedef typename MatrixType::StorageIndex StorageIndex;
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
@@ -135,6 +135,27 @@ template<typename _MatrixType> class ColPivHouseholderQR
       compute(matrix.derived());
     }
 
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is an Eigen::Ref.
+      *
+      * \sa ColPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit ColPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_colsPermutation(PermIndexType(matrix.cols())),
+        m_colsTranspositions(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_colNormsUpdated(matrix.cols()),
+        m_colNormsDirect(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
+    }
+
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -142,9 +163,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
       *
       * \returns a solution.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -453,21 +471,19 @@ template<typename MatrixType>
 template<typename InputType>
 ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
-  check_template_parameters();
-
-  // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.cols()<=NumTraits<int>::highest());
-
-  m_qr = matrix;
-
+  m_qr = matrix.derived();
   computeInPlace();
-
   return *this;
 }
 
 template<typename MatrixType>
 void ColPivHouseholderQR<MatrixType>::computeInPlace()
 {
+  check_template_parameters();
+
+  // the column permutation is stored as int indices, so just to be sure:
+  eigen_assert(m_qr.cols()<=NumTraits<int>::highest());
+
   using std::abs;
 
   Index rows = m_qr.rows();
@@ -598,11 +614,11 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
 namespace internal {
 
 template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
   typedef ColPivHouseholderQR<MatrixType> QrType;
   typedef Inverse<QrType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
@@ -621,7 +637,6 @@ typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHousehol
   return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
 }
 
-#ifndef __CUDACC__
 /** \return the column-pivoting Householder QR decomposition of \c *this.
   *
   * \sa class ColPivHouseholderQR
@@ -632,7 +647,6 @@ MatrixBase<Derived>::colPivHouseholderQr() const
 {
   return ColPivHouseholderQR<PlainObject>(eval());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
similarity index 67%
rename from Eigen/src/QR/ColPivHouseholderQR_MKL.h
rename to Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
index 1203d0d36..4e9651f83 100644
--- a/Eigen/src/QR/ColPivHouseholderQR_MKL.h
+++ b/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@@ -25,22 +25,20 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Householder QR decomposition of a matrix with column pivoting based on
  *    LAPACKE_?geqp3 function.
  ********************************************************************************
 */
 
-#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_QR_COLPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
 template<> template<typename InputType> inline \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
@@ -65,34 +63,35 @@ ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynami
   m_colsPermutation.resize(cols); \
   m_colsPermutation.indices().setZero(); \
 \
-  lapack_int lda = m_qr.outerStride(), i; \
-  lapack_int matrix_order = MKLCOLROW; \
-  LAPACKE_##MKLPREFIX##geqp3( matrix_order, rows, cols, (MKLTYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (MKLTYPE*)m_hCoeffs.data()); \
+  lapack_int lda = internal::convert_index<lapack_int,Index>(m_qr.outerStride()); \
+  lapack_int matrix_order = LAPACKE_COLROW; \
+  LAPACKE_##LAPACKE_PREFIX##geqp3( matrix_order, internal::convert_index<lapack_int,Index>(rows), internal::convert_index<lapack_int,Index>(cols), \
+                              (LAPACKE_TYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (LAPACKE_TYPE*)m_hCoeffs.data()); \
   m_isInitialized = true; \
   m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
   m_hCoeffs.adjointInPlace(); \
   RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold(); \
   lapack_int *perm = m_colsPermutation.indices().data(); \
-  for(i=0;i<size;i++) { \
+  for(Index i=0;i<size;i++) { \
     m_nonzero_pivots += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);\
   } \
-  for(i=0;i<cols;i++) perm[i]--;\
+  for(Index i=0;i<cols;i++) perm[i]--;\
 \
   /*m_det_pq = (number_of_transpositions%2) ? -1 : 1;  // TODO: It's not needed now; fix upon availability in Eigen */ \
 \
   return *this; \
 }
 
-EIGEN_MKL_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, ColMajor, LAPACK_COL_MAJOR)
 
-EIGEN_MKL_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(dcomplex, lapack_complex_double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_QR_COLPIV(scomplex, lapack_complex_float,  c, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
+#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 230d0d23c..41e4ecdfd 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -29,16 +29,19 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
   *
   * \param MatrixType the type of the matrix of which we are computing the COD.
   *
-  * This class performs a rank-revealing complete ortogonal decomposition of a
+  * This class performs a rank-revealing complete orthogonal decomposition of a
   * matrix  \b A into matrices \b P, \b Q, \b T, and \b Z such that
   * \f[
-  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \begin{matrix} \mathbf{T} &
-  *  \mathbf{0} \\ \mathbf{0} & \mathbf{0} \end{matrix} \, \mathbf{Z}
+  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \,
+  *                     \begin{bmatrix} \mathbf{T} &  \mathbf{0} \\
+  *                                     \mathbf{0} & \mathbf{0} \end{bmatrix} \, \mathbf{Z}
   * \f]
   * by using Householder transformations. Here, \b P is a permutation matrix,
   * \b Q and \b Z are unitary matrices and \b T an upper triangular matrix of
   * size rank-by-rank. \b A may be rank deficient.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::completeOrthogonalDecomposition()
   */
 template <typename _MatrixType>
@@ -48,16 +51,12 @@ class CompleteOrthogonalDecomposition {
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    Options = MatrixType::Options,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef typename MatrixType::StorageIndex StorageIndex;
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options,
-                 MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-      MatrixQType;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
       PermutationType;
@@ -114,12 +113,29 @@ class CompleteOrthogonalDecomposition {
   explicit CompleteOrthogonalDecomposition(const EigenBase<InputType>& matrix)
       : m_cpqr(matrix.rows(), matrix.cols()),
         m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
-        m_temp(matrix.cols()) {
+        m_temp(matrix.cols())
+  {
     compute(matrix.derived());
   }
 
+  /** \brief Constructs a complete orthogonal decomposition from a given matrix
+    *
+    * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is an Eigen::Ref.
+    *
+    * \sa CompleteOrthogonalDecomposition(const EigenBase&)
+    */
+  template<typename InputType>
+  explicit CompleteOrthogonalDecomposition(EigenBase<InputType>& matrix)
+    : m_cpqr(matrix.derived()),
+      m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
+      m_temp(matrix.cols())
+  {
+    computeInPlace();
+  }
+
+
   /** This method computes the minimum-norm solution X to a least squares
-   * problem \f[\mathrm{minimize} ||A X - B|| \f], where \b A is the matrix of
+   * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
    * which \c *this is the complete orthogonal decomposition.
    *
    * \param B the right-hand sides of the problem to solve.
@@ -165,7 +181,12 @@ class CompleteOrthogonalDecomposition {
   const MatrixType& matrixT() const { return m_cpqr.matrixQR(); }
 
   template <typename InputType>
-  CompleteOrthogonalDecomposition& compute(const EigenBase<InputType>& matrix);
+  CompleteOrthogonalDecomposition& compute(const EigenBase<InputType>& matrix) {
+    // Compute the column pivoted QR factorization A P = Q R.
+    m_cpqr.compute(matrix);
+    computeInPlace();
+    return *this;
+  }
 
   /** \returns a const reference to the column permutation matrix */
   const PermutationType& colsPermutation() const {
@@ -354,6 +375,8 @@ class CompleteOrthogonalDecomposition {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
 
+  void computeInPlace();
+
   /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
    */
   template <typename Rhs>
@@ -384,20 +407,16 @@ CompleteOrthogonalDecomposition<MatrixType>::logAbsDeterminant() const {
  * CompleteOrthogonalDecomposition(const MatrixType&)
  */
 template <typename MatrixType>
-template <typename InputType>
-CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition<
-    MatrixType>::compute(const EigenBase<InputType>& matrix) {
+void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
+{
   check_template_parameters();
 
   // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.cols() <= NumTraits<int>::highest());
-
-  // Compute the column pivoted QR factorization A P = Q R.
-  m_cpqr.compute(matrix);
+  eigen_assert(m_cpqr.cols() <= NumTraits<int>::highest());
 
   const Index rank = m_cpqr.rank();
-  const Index cols = matrix.cols();
-  const Index rows = matrix.rows();
+  const Index cols = m_cpqr.cols();
+  const Index rows = m_cpqr.rows();
   m_zCoeffs.resize((std::min)(rows, cols));
   m_temp.resize(cols);
 
@@ -443,7 +462,6 @@ CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition<
       }
     }
   }
-  return *this;
 }
 
 template <typename MatrixType>
@@ -509,12 +527,12 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
 
 namespace internal {
 
-template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+template<typename DstXprType, typename MatrixType>
+struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
 {
   typedef CompleteOrthogonalDecomposition<MatrixType> CodType;
   typedef Inverse<CodType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
   {
     dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows()));
   }
@@ -529,7 +547,6 @@ CompleteOrthogonalDecomposition<MatrixType>::householderQ() const {
   return m_cpqr.householderQ();
 }
 
-#ifndef __CUDACC__
 /** \return the complete orthogonal decomposition of \c *this.
   *
   * \sa class CompleteOrthogonalDecomposition
@@ -539,7 +556,6 @@ const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::completeOrthogonalDecomposition() const {
   return CompleteOrthogonalDecomposition<PlainObject>(eval());
 }
-#endif  // __CUDACC__
 
 }  // end namespace Eigen
 
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index 32a10f3fe..e0e15100d 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -50,6 +50,8 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
   * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  * 
   * \sa MatrixBase::fullPivHouseholderQr()
   */
 template<typename _MatrixType> class FullPivHouseholderQR
@@ -60,7 +62,6 @@ template<typename _MatrixType> class FullPivHouseholderQR
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
@@ -135,6 +136,26 @@ template<typename _MatrixType> class FullPivHouseholderQR
       compute(matrix.derived());
     }
 
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c MatrixType is an Eigen::Ref.
+      *
+      * \sa FullPivHouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit FullPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_permutation(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false)
+    {
+      computeInPlace();
+    }
+
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * \c *this is the QR decomposition.
       *
@@ -143,9 +164,6 @@ template<typename _MatrixType> class FullPivHouseholderQR
       * \returns the exact or least-square solution if the rank is greater or equal to the number of columns of A,
       * and an arbitrary solution otherwise.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -430,18 +448,16 @@ template<typename MatrixType>
 template<typename InputType>
 FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
-  check_template_parameters();
-  
   m_qr = matrix.derived();
-  
   computeInPlace();
-  
   return *this;
 }
 
 template<typename MatrixType>
 void FullPivHouseholderQR<MatrixType>::computeInPlace()
 {
+  check_template_parameters();
+
   using std::abs;
   Index rows = m_qr.rows();
   Index cols = m_qr.cols();
@@ -560,11 +576,11 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
 namespace internal {
   
 template<typename DstXprType, typename MatrixType, typename Scalar>
-struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar,Scalar>, Dense2Dense>
 {
   typedef FullPivHouseholderQR<MatrixType> QrType;
   typedef Inverse<QrType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {    
     dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
@@ -644,7 +660,6 @@ inline typename FullPivHouseholderQR<MatrixType>::MatrixQReturnType FullPivHouse
   return MatrixQReturnType(m_qr, m_hCoeffs, m_rows_transpositions);
 }
 
-#ifndef __CUDACC__
 /** \return the full-pivoting Householder QR decomposition of \c *this.
   *
   * \sa class FullPivHouseholderQR
@@ -655,7 +670,6 @@ MatrixBase<Derived>::fullPivHouseholderQr() const
 {
   return FullPivHouseholderQR<PlainObject>(eval());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 03bc8e6cd..3513d995c 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -37,6 +37,8 @@ namespace Eigen {
   * This Householder QR decomposition is faster, but less numerically stable and less feature-full than
   * FullPivHouseholderQR or ColPivHouseholderQR.
   *
+  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+  *
   * \sa MatrixBase::householderQr()
   */
 template<typename _MatrixType> class HouseholderQR
@@ -47,7 +49,6 @@ template<typename _MatrixType> class HouseholderQR
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
@@ -102,6 +103,24 @@ template<typename _MatrixType> class HouseholderQR
       compute(matrix.derived());
     }
 
+
+    /** \brief Constructs a QR factorization from a given matrix
+      *
+      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+      * \c MatrixType is an Eigen::Ref.
+      *
+      * \sa HouseholderQR(const EigenBase&)
+      */
+    template<typename InputType>
+    explicit HouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_temp(matrix.cols()),
+        m_isInitialized(false)
+    {
+      computeInPlace();
+    }
+
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -109,9 +128,6 @@ template<typename _MatrixType> class HouseholderQR
       *
       * \returns a solution.
       *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
       * \note_about_checking_solutions
       *
       * \note_about_arbitrary_choice_of_solution
@@ -151,7 +167,11 @@ template<typename _MatrixType> class HouseholderQR
     }
 
     template<typename InputType>
-    HouseholderQR& compute(const EigenBase<InputType>& matrix);
+    HouseholderQR& compute(const EigenBase<InputType>& matrix) {
+      m_qr = matrix.derived();
+      computeInPlace();
+      return *this;
+    }
 
     /** \returns the absolute value of the determinant of the matrix of which
       * *this is the QR decomposition. It has only linear complexity
@@ -203,6 +223,8 @@ template<typename _MatrixType> class HouseholderQR
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
+
+    void computeInPlace();
     
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
@@ -354,16 +376,14 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
   * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-template<typename InputType>
-HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+void HouseholderQR<MatrixType>::computeInPlace()
 {
   check_template_parameters();
   
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
   Index size = (std::min)(rows,cols);
 
-  m_qr = matrix.derived();
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
@@ -371,10 +391,8 @@ HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const EigenBase<In
   internal::householder_qr_inplace_blocked<MatrixType, HCoeffsType>::run(m_qr, m_hCoeffs, 48, m_temp.data());
 
   m_isInitialized = true;
-  return *this;
 }
 
-#ifndef __CUDACC__
 /** \return the Householder QR decomposition of \c *this.
   *
   * \sa class HouseholderQR
@@ -385,7 +403,6 @@ MatrixBase<Derived>::householderQr() const
 {
   return HouseholderQR<PlainObject>(eval());
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/QR/HouseholderQR_MKL.h b/Eigen/src/QR/HouseholderQR_LAPACKE.h
similarity index 80%
rename from Eigen/src/QR/HouseholderQR_MKL.h
rename to Eigen/src/QR/HouseholderQR_LAPACKE.h
index 84ab640a1..1dc7d5363 100644
--- a/Eigen/src/QR/HouseholderQR_MKL.h
+++ b/Eigen/src/QR/HouseholderQR_LAPACKE.h
@@ -25,24 +25,22 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Householder QR decomposition of a matrix w/o pivoting based on
  *    LAPACKE_?geqrf function.
  ********************************************************************************
 */
 
-#ifndef EIGEN_QR_MKL_H
-#define EIGEN_QR_MKL_H
-
-#include "../Core/util/MKL_support.h"
+#ifndef EIGEN_QR_LAPACKE_H
+#define EIGEN_QR_LAPACKE_H
 
 namespace Eigen { 
 
 namespace internal {
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_QR_NOPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_LAPACKE_QR_NOPIV(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX) \
 template<typename MatrixQR, typename HCoeffs> \
 struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
 { \
@@ -53,18 +51,18 @@ struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
     lapack_int n = (lapack_int) mat.cols(); \
     lapack_int lda = (lapack_int) mat.outerStride(); \
     lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    LAPACKE_##MKLPREFIX##geqrf( matrix_order, m, n, (MKLTYPE*)mat.data(), lda, (MKLTYPE*)hCoeffs.data()); \
+    LAPACKE_##LAPACKE_PREFIX##geqrf( matrix_order, m, n, (LAPACKE_TYPE*)mat.data(), lda, (LAPACKE_TYPE*)hCoeffs.data()); \
     hCoeffs.adjointInPlace(); \
   } \
 };
 
-EIGEN_MKL_QR_NOPIV(double, double, d)
-EIGEN_MKL_QR_NOPIV(float, float, s)
-EIGEN_MKL_QR_NOPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_QR_NOPIV(scomplex, MKL_Complex8, c)
+EIGEN_LAPACKE_QR_NOPIV(double, double, d)
+EIGEN_LAPACKE_QR_NOPIV(float, float, s)
+EIGEN_LAPACKE_QR_NOPIV(dcomplex, lapack_complex_double, z)
+EIGEN_LAPACKE_QR_NOPIV(scomplex, lapack_complex_float, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_QR_MKL_H
+#endif // EIGEN_QR_LAPACKE_H
diff --git a/Eigen/src/SPQRSupport/CMakeLists.txt b/Eigen/src/SPQRSupport/CMakeLists.txt
deleted file mode 100644
index 4968beaf2..000000000
--- a/Eigen/src/SPQRSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SPQRSupport_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SPQRSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SPQRSupport/ COMPONENT Devel
-  )
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 3552c87bf..25fca6f4d 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -11,7 +11,7 @@
 // Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
 // Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
 // Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
-// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,6 +21,7 @@
 #define EIGEN_BDCSVD_H
 // #define EIGEN_BDCSVD_DEBUG_VERBOSE
 // #define EIGEN_BDCSVD_SANITY_CHECKS
+
 namespace Eigen {
 
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
@@ -49,6 +50,18 @@ struct traits<BDCSVD<_MatrixType> >
  *
  * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
  *
+ * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization,
+ * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD.
+ * You can control the switching size with the setSwitchSize() method, default is 16.
+ * For small matrices (<16), it is thus preferable to directly use JacobiSVD. For larger ones, BDCSVD is highly
+ * recommended and can be several orders of magnitude faster.
+ *
+ * \warning this algorithm is unlikely to provide accurate results when compiled with unsafe math optimizations.
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimizations by default unless
+ * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
+ * significantly degrade the accuracy.
+ *
+ * \sa class JacobiSVD
  */
 template<typename _MatrixType> 
 class BDCSVD : public SVDBase<BDCSVD<_MatrixType> >
@@ -228,6 +241,8 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
 #endif
   allocate(matrix.rows(), matrix.cols(), computationOptions);
   using std::abs;
+
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   
   //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
   if(matrix.cols() < m_algoswap)
@@ -266,7 +281,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
   {
     RealScalar a = abs(m_computed.coeff(i, i));
     m_singularValues.coeffRef(i) = a * scale;
-    if (a == 0)
+    if (a<considerZero)
     {
       m_nonzeroSingularValues = i;
       m_singularValues.tail(m_diagSize - i - 1).setZero();
@@ -380,6 +395,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
   using std::abs;
   const Index n = lastCol - firstCol + 1;
   const Index k = n/2;
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   RealScalar alphaK;
   RealScalar betaK; 
   RealScalar r0; 
@@ -434,7 +450,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
     f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
   }
   if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
-  if (r0 == 0)
+  if (r0<considerZero)
   {
     c0 = 1;
     s0 = 0;
@@ -553,6 +569,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
 template <typename MatrixType>
 void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
 {
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  using std::abs;
   ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
   m_workspace.head(n) =  m_computed.block(firstCol, firstCol, n, n).diagonal();
   ArrayRef diag = m_workspace.head(n);
@@ -575,7 +593,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
   while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
   Index m = 0; // size of the deflated problem
   for(Index k=0;k<actual_n;++k)
-    if(col0(k)!=0)
+    if(abs(col0(k))>considerZero)
       m_workspaceI(m++) = k;
   Map<ArrayXi> perm(m_workspaceI.data(),m);
   
@@ -600,7 +618,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
   
   {
     Index actual_n = n;
-    while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+    while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
     std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
     std::cout << "    check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
     std::cout << "    check2 (>0)      : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
@@ -680,6 +698,7 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar
     res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
   }
   return res;
+
 }
 
 template <typename MatrixType>
@@ -746,14 +765,14 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
     RealScalar muPrev, muCur;
     if (shift == left)
     {
-      muPrev = (right - left) * 0.1;
+      muPrev = (right - left) * RealScalar(0.1);
       if (k == actual_n-1) muCur = right - left;
-      else                 muCur = (right - left) * 0.5; 
+      else                 muCur = (right - left) * RealScalar(0.5);
     }
     else
     {
-      muPrev = -(right - left) * 0.1;
-      muCur = -(right - left) * 0.5;
+      muPrev = -(right - left) * RealScalar(0.1);
+      muCur = -(right - left) * RealScalar(0.5);
     }
 
     RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift);
@@ -798,15 +817,15 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
       RealScalar leftShifted, rightShifted;
       if (shift == left)
       {
-        leftShifted = RealScalar(1)/NumTraits<RealScalar>::highest();
+        leftShifted = (std::numeric_limits<RealScalar>::min)();
         // I don't understand why the case k==0 would be special there:
         // if (k == 0) rightShifted = right - left; else 
-        rightShifted = (k==actual_n-1) ? right : ((right - left) * 0.6); // theoretically we can take 0.5, but let's be safe
+        rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe
       }
       else
       {
-        leftShifted = -(right - left) * 0.6;
-        rightShifted = -RealScalar(1)/NumTraits<RealScalar>::highest();
+        leftShifted = -(right - left) * RealScalar(0.6);
+        rightShifted = -(std::numeric_limits<RealScalar>::min)();
       }
       
       RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
@@ -817,7 +836,10 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
 
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
       if(!(fLeft * fRight<0))
+      {
+        std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose()  << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
         std::cout << k << " : " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  " << left << " - " << right << " -> " <<  leftShifted << " " << rightShifted << "   shift=" << shift << "\n";
+      }
 #endif
       eigen_internal_assert(fLeft * fRight < 0);
       
@@ -1028,8 +1050,9 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
   Diagonal<MatrixXr> fulldiag(m_computed);
   VectorBlock<Diagonal<MatrixXr>,Dynamic> diag(fulldiag, firstCol+shift, length);
   
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
-  RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag;
+  RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
   RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
   
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
@@ -1082,7 +1105,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
   {
     // Check for total deflation
     // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
-    bool total_deflation = (col0.tail(length-1).array()==RealScalar(0)).all();
+    bool total_deflation = (col0.tail(length-1).array().abs()<considerZero).all(); // compare magnitudes: a negative col0 entry is not a zero
     
     // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
     // First, compute the respective permutation.
@@ -1093,7 +1116,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
       
       // Move deflated diagonal entries at the end.
       for(Index i=1; i<length; ++i)
-        if(diag(i)==0)
+        if(abs(diag(i))<considerZero)
           permutation[p++] = i;
         
       Index i=1, j=k+1;
@@ -1112,7 +1135,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
       for(Index i=1; i<length; ++i)
       {
         Index pi = permutation[i];
-        if(diag(pi)==0 || diag(0)<diag(pi))
+        if(abs(diag(pi))<considerZero || diag(0)<diag(pi))
           permutation[i-1] = permutation[i];
         else
         {
@@ -1163,7 +1186,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
   //condition 4.4
   {
     Index i = length-1;
-    while(i>0 && (diag(i)==0 || col0(i)==0)) --i;
+    while(i>0 && (abs(diag(i))<considerZero || abs(col0(i))<considerZero)) --i;
     for(; i>1;--i)
        if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
       {
@@ -1177,7 +1200,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
   
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
   for(Index j=2;j<length;++j)
-    assert(diag(j-1)<=diag(j) || diag(j)==0);
+    assert(diag(j-1)<=diag(j) || abs(diag(j))<considerZero);
 #endif
   
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
diff --git a/Eigen/src/SVD/CMakeLists.txt b/Eigen/src/SVD/CMakeLists.txt
deleted file mode 100644
index 55efc44b1..000000000
--- a/Eigen/src/SVD/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SVD_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SVD_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SVD COMPONENT Devel
-  )
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1940c8294..ea2bd62eb 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -419,38 +419,6 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
   }
 };
 
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                         JacobiRotation<RealScalar> *j_left,
-                         JacobiRotation<RealScalar> *j_right)
-{
-  using std::sqrt;
-  using std::abs;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(d == RealScalar(0))
-  {
-    rot1.s() = RealScalar(0);
-    rot1.c() = RealScalar(1);
-  }
-  else
-  {
-    // If d!=0, then t/d cannot overflow because the magnitude of the
-    // entries forming d are not too small compared to the ones forming t.
-    RealScalar u = t / d;
-    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = RealScalar(1) / tmp;
-    rot1.c() = u / tmp;
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left = rot1 * j_right->transpose();
-}
-
 template<typename _MatrixType, int QRPreconditioner> 
 struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
 {
@@ -697,10 +665,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  // FIXME What about considerering any denormal numbers as zero, using:
-  // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().maxCoeff();
@@ -745,7 +711,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
         {
           finished = false;
           // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          // the complex to real operation returns true is the updated 2x2 block is not already diagonal
+          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
           if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry))
           {
             JacobiRotation<RealScalar> j_left, j_right;
@@ -759,7 +725,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
             if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
 
             // keep track of the largest diagonal coefficient
-            maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
+            maxDiagEntry = numext::maxi<RealScalar>(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
           }
         }
       }
@@ -770,9 +736,22 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
 
   for(Index i = 0; i < m_diagSize; ++i)
   {
-    RealScalar a = abs(m_workMatrix.coeff(i,i));
-    m_singularValues.coeffRef(i) = a;
-    if(computeU() && (a!=RealScalar(0))) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    // For a complex matrix, some diagonal coefficients might not have been
+    // treated by svd_precondition_2x2_block_to_be_real, and the imaginary part
+    // of some diagonal entry might not be null.
+    if(NumTraits<Scalar>::IsComplex && abs(numext::imag(m_workMatrix.coeff(i,i)))>considerAsZero)
+    {
+      RealScalar a = abs(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU()) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+    }
+    else
+    {
+      // m_workMatrix.coeff(i,i) is already real, no difficulty:
+      RealScalar a = numext::real(m_workMatrix.coeff(i,i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if(computeU() && (a<RealScalar(0))) m_matrixU.col(i) = -m_matrixU.col(i);
+    }
   }
   
   m_singularValues *= scale;
@@ -802,7 +781,6 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   return *this;
 }
 
-#ifndef __CUDACC__
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by two-sided
@@ -816,7 +794,6 @@ MatrixBase<Derived>::jacobiSvd(unsigned int computationOptions) const
 {
   return JacobiSVD<PlainObject>(*this, computationOptions);
 }
-#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/Eigen/src/SVD/JacobiSVD_MKL.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
similarity index 62%
rename from Eigen/src/SVD/JacobiSVD_MKL.h
rename to Eigen/src/SVD/JacobiSVD_LAPACKE.h
index 14e461c4e..50272154f 100644
--- a/Eigen/src/SVD/JacobiSVD_MKL.h
+++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -25,21 +25,19 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Singular Value Decomposition - SVD.
  ********************************************************************************
 */
 
-#ifndef EIGEN_JACOBISVD_MKL_H
-#define EIGEN_JACOBISVD_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
+#ifndef EIGEN_JACOBISVD_LAPACKE_H
+#define EIGEN_JACOBISVD_LAPACKE_H
 
 namespace Eigen { 
 
-/** \internal Specialization for the data types supported by MKL */
+/** \internal Specialization for the data types supported by LAPACKe */
 
-#define EIGEN_MKL_SVD(EIGTYPE, MKLTYPE, MKLRTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
+#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW) \
 template<> inline \
 JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>& \
 JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) \
@@ -52,41 +50,41 @@ JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPiv
   /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
   m_nonzeroSingularValues = m_diagSize; \
 \
-  lapack_int lda = matrix.outerStride(), ldu, ldvt; \
-  lapack_int matrix_order = MKLCOLROW; \
+  lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt; \
+  lapack_int matrix_order = LAPACKE_COLROW; \
   char jobu, jobvt; \
-  MKLTYPE *u, *vt, dummy; \
+  LAPACKE_TYPE *u, *vt, dummy; \
   jobu  = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
   jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
   if (computeU()) { \
-    ldu  = m_matrixU.outerStride(); \
-    u    = (MKLTYPE*)m_matrixU.data(); \
+    ldu  = internal::convert_index<lapack_int>(m_matrixU.outerStride()); \
+    u    = (LAPACKE_TYPE*)m_matrixU.data(); \
   } else { ldu=1; u=&dummy; }\
   MatrixType localV; \
-  ldvt = (m_computeFullV) ? m_cols : (m_computeThinV) ? m_diagSize : 1; \
+  ldvt = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
   if (computeV()) { \
     localV.resize(ldvt, m_cols); \
-    vt   = (MKLTYPE*)localV.data(); \
+    vt   = (LAPACKE_TYPE*)localV.data(); \
   } else { ldvt=1; vt=&dummy; }\
-  Matrix<MKLRTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
+  Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
   MatrixType m_temp; m_temp = matrix; \
-  LAPACKE_##MKLPREFIX##gesvd( matrix_order, jobu, jobvt, m_rows, m_cols, (MKLTYPE*)m_temp.data(), lda, (MKLRTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
+  LAPACKE_##LAPACKE_PREFIX##gesvd( matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(m_rows), internal::convert_index<lapack_int>(m_cols), (LAPACKE_TYPE*)m_temp.data(), lda, (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
   if (computeV()) m_matrixV = localV.adjoint(); \
  /* for(int i=0;i<m_diagSize;i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
   m_isInitialized = true; \
   return *this; \
 }
 
-EIGEN_MKL_SVD(double,   double,        double, d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(double,   double,                double, d, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(float,    float,                 float , s, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float,  float , c, ColMajor, LAPACK_COL_MAJOR)
 
-EIGEN_MKL_SVD(double,   double,        double, d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(double,   double,                double, d, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(float,    float,                 float , s, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float,  float , c, RowMajor, LAPACK_ROW_MAJOR)
 
 } // end namespace Eigen
 
-#endif // EIGEN_JACOBISVD_MKL_H
+#endif // EIGEN_JACOBISVD_LAPACKE_H
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index e2d77a761..cc90a3b75 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -130,10 +130,9 @@ public:
   inline Index rank() const
   {
     using std::abs;
-    using std::max;
     eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
     if(m_singularValues.size()==0) return 0;
-    RealScalar premultiplied_threshold = (max)(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
+    RealScalar premultiplied_threshold = numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
     Index i = m_nonzeroSingularValues-1;
     while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
     return i+1;
diff --git a/Eigen/src/SparseCholesky/CMakeLists.txt b/Eigen/src/SparseCholesky/CMakeLists.txt
deleted file mode 100644
index 375a59d7a..000000000
--- a/Eigen/src/SparseCholesky/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseCholesky_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseCholesky_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseCholesky COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseCore/CMakeLists.txt b/Eigen/src/SparseCore/CMakeLists.txt
deleted file mode 100644
index d860452a6..000000000
--- a/Eigen/src/SparseCore/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseCore_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_SparseCore_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseCore COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 0f6835846..492eb0a29 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -143,7 +143,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,C
     // If the result is tall and thin (in the extreme case a column vector)
     // then it is faster to sort the coefficients inplace instead of transposing twice.
     // FIXME, the following heuristic is probably not very good.
-    if(lhs.rows()>=rhs.cols())
+    if(lhs.rows()>rhs.cols())
     {
       ColMajorMatrix resCol(lhs.rows(),rhs.cols());
       // perform sorted insertion
diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
index 4a8dd12e4..fa5386599 100644
--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -34,8 +34,8 @@ template<typename OtherDerived>
 inline Derived& SparseMatrixBase<Derived>::operator=(const SparseMatrixBase<OtherDerived>& other)
 {
   // by default sparse evaluation do not alias, so we can safely bypass the generic call_assignment routine
-  internal::Assignment<Derived,OtherDerived,internal::assign_op<Scalar> >
-          ::run(derived(), other.derived(), internal::assign_op<Scalar>());
+  internal::Assignment<Derived,OtherDerived,internal::assign_op<Scalar,typename OtherDerived::Scalar> >
+          ::run(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -124,24 +124,24 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
 }
 
 // Generic Sparse to Sparse assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     assign_sparse_to_sparse(dst.derived(), src.derived());
   }
 };
 
 // Generic Sparse to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense>
 {
   static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
 
-    if(internal::is_same<Functor,internal::assign_op<Scalar> >::value)
+    if(internal::is_same<Functor,internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> >::value)
       dst.setZero();
     
     internal::evaluator<SrcXprType> srcEval(src);
@@ -156,10 +156,10 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Scalar>
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
-struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Sparse2Sparse, Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Sparse2Sparse>
 {
   typedef Solve<DecType,RhsType> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     src.dec()._solve_impl(src.rhs(), dst);
   }
@@ -169,14 +169,15 @@ struct Diagonal2Sparse {};
 
 template<> struct AssignmentKind<SparseShape,DiagonalShape> { typedef Diagonal2Sparse Kind; };
 
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
 {
   typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef typename DstXprType::Scalar Scalar;
   typedef Array<StorageIndex,Dynamic,1> ArrayXI;
   typedef Array<Scalar,Dynamic,1> ArrayXS;
   template<int Options>
-  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     Index size = src.diagonal().size();
     dst.makeCompressed();
@@ -187,15 +188,15 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse, Scalar>
   }
   
   template<typename DstDerived>
-  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     dst.diagonal() = src.diagonal();
   }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   { dst.diagonal() += src.diagonal(); }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   { dst.diagonal() -= src.diagonal(); }
 };
 } // end namespace internal
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 82fae8c4b..13e8b0bf1 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -189,9 +189,9 @@ public:
         StorageIndex p = StorageIndex(start);
         for(Index k=0; k<m_outerSize.value(); ++k)
         {
-          Index nnz_k = tmp.innerVector(k).nonZeros();
+          StorageIndex nnz_k = internal::convert_index<StorageIndex>(tmp.innerVector(k).nonZeros());
           if(!m_matrix.isCompressed())
-            matrix.innerNonZeroPtr()[m_outerStart+k] = StorageIndex(nnz_k);
+            matrix.innerNonZeroPtr()[m_outerStart+k] = nnz_k;
           matrix.outerIndexPtr()[m_outerStart+k] = p;
           p += nnz_k;
         }
@@ -504,6 +504,7 @@ template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::InnerVectorInnerIterator
  : public EvalIterator
 {
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
   const XprType& m_block;
   Index m_end;
 public:
@@ -528,6 +529,7 @@ public:
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::OuterVectorInnerIterator
 {
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
   const unary_evaluator& m_eval;
   Index m_outerPos;
   Index m_innerIndex;
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index 15854a73b..55ad91f46 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -106,6 +106,25 @@ class SparseCompressedBase
     /** \returns whether \c *this is in compressed form. */
     inline bool isCompressed() const { return innerNonZeroPtr()==0; }
 
+    /** \returns a read-only view of the stored coefficients as a 1D array expression.
+      *
+      * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+      *
+      * \sa valuePtr(), isCompressed() */
+    const Map<const Array<Scalar,Dynamic,1> > coeffs() const { eigen_assert(isCompressed()); return Array<Scalar,Dynamic,1>::Map(valuePtr(),nonZeros()); }
+
+    /** \returns a read-write view of the stored coefficients as a 1D array expression
+      *
+      * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+      *
+      * Here is an example:
+      * \include SparseMatrix_coeffs.cpp
+      * and the output is:
+      * \include SparseMatrix_coeffs.out
+      *
+      * \sa valuePtr(), isCompressed() */
+    Map<Array<Scalar,Dynamic,1> > coeffs() { eigen_assert(isCompressed()); return Array<Scalar,Dynamic,1>::Map(valuePtr(),nonZeros()); }
+
   protected:
     /** Default constructor. Do nothing. */
     SparseCompressedBase() {}
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index c57d9ac59..aad7b7d79 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -28,6 +28,9 @@ namespace Eigen {
 //                         generic      sparse
 //  4 - dense op dense     product      dense
 //                         generic      dense
+//
+// TODO: to ease the compiler's job, we could specialize product/quotient with a scalar
+//       and fall back to the cwise-unary evaluator using bind1st_op and bind2nd_op.
 
 template<typename BinaryOp, typename Lhs, typename Rhs>
 class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
@@ -165,7 +168,7 @@ public:
   public:
 
     EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
-      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize())
+      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_value(0), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize())
     {
       this->operator++();
     }
@@ -189,7 +192,7 @@ public:
       return *this;
     }
 
-    EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
+    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }
 
     EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
     EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
@@ -253,7 +256,7 @@ public:
   public:
 
     EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
-      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize())
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_value(0), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize())
     {
       this->operator++();
     }
@@ -277,7 +280,7 @@ public:
       return *this;
     }
 
-    EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
+    EIGEN_STRONG_INLINE Scalar value() const { eigen_internal_assert(m_id<m_innerSize); return m_value; }
 
     EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
     EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
@@ -323,12 +326,12 @@ protected:
 };
 
 // "sparse .* sparse"
-template<typename T, typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IteratorBased, IteratorBased>
-  : evaluator_base<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs> >
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
 {
 protected:
-  typedef scalar_product_op<T> BinaryOp;
+  typedef scalar_product_op<T1,T2> BinaryOp;
   typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
   typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
@@ -407,12 +410,12 @@ protected:
 };
 
 // "dense .* sparse"
-template<typename T, typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IndexBased, IteratorBased>
-  : evaluator_base<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs> >
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IndexBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
 {
 protected:
-  typedef scalar_product_op<T> BinaryOp;
+  typedef scalar_product_op<T1,T2> BinaryOp;
   typedef evaluator<Lhs>  LhsEvaluator;
   typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
@@ -480,12 +483,12 @@ protected:
 };
 
 // "sparse .* dense"
-template<typename T, typename Lhs, typename Rhs>
-struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IteratorBased, IndexBased>
-  : evaluator_base<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs> >
+template<typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<scalar_product_op<T1,T2>, Lhs, Rhs> >
 {
 protected:
-  typedef scalar_product_op<T> BinaryOp;
+  typedef scalar_product_op<T1,T2> BinaryOp;
   typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
   typedef evaluator<Rhs>  RhsEvaluator;
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
@@ -579,7 +582,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& SparseMatrixBase<Derived>::operator+=(const DiagonalBase<OtherDerived>& other)
 {
-  call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -587,7 +590,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& SparseMatrixBase<Derived>::operator-=(const DiagonalBase<OtherDerived>& other)
 {
-  call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
     
@@ -600,31 +603,31 @@ SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) c
 }
 
 template<typename DenseDerived, typename SparseDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>
 operator+(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
 {
-  return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+  return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
 }
 
 template<typename SparseDerived, typename DenseDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
 operator+(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
 {
-  return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+  return CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
 }
 
 template<typename DenseDerived, typename SparseDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>
 operator-(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
 {
-  return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+  return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar,typename SparseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
 }
 
 template<typename SparseDerived, typename DenseDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
 operator-(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
 {
-  return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+  return CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar,typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index c9da8a2bb..0547db596 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -72,14 +72,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
 };
 
 // FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
-template<typename T1, typename T2/*, int _Options, typename _StrideType*/>
-struct scalar_product_traits<T1, Ref<T2/*, _Options, _StrideType*/> >
-{
-  enum {
-    Defined = 1
-  };
-  typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
-};
+// -> Disabled for now because it conflicts with the generic scalar*matrix and matrix*scalar operators.
+// template<typename T1, typename T2/*, int _Options, typename _StrideType*/>
+// struct ScalarBinaryOpTraits<T1, Ref<T2/*, _Options, _StrideType*/> >
+// {
+//   enum {
+//     Defined = 1
+//   };
+//   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
+// };
+
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
 struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, AlphaType, ColMajor, true>
 {
@@ -95,7 +97,7 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, A
       for(Index j=0; j<lhs.outerSize(); ++j)
       {
 //        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
-        typename internal::scalar_product_traits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j,c));
+        typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j,c));
         for(LhsInnerIterator it(lhsEval,j); it ;++it)
           res.coeffRef(it.index(),c) += it.value() * rhs_j;
       }
diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h
index eb241c3e2..f99be3379 100644
--- a/Eigen/src/SparseCore/SparseMap.h
+++ b/Eigen/src/SparseCore/SparseMap.h
@@ -166,12 +166,17 @@ class SparseMapBase<Derived,WriteAccessors>
     using Base::innerIndexPtr;
     using Base::outerIndexPtr;
     using Base::innerNonZeroPtr;
-    inline Scalar* valuePtr()       { return Base::m_values; }
+    /** \copydoc SparseMatrix::valuePtr */
+    inline Scalar* valuePtr()              { return Base::m_values; }
+    /** \copydoc SparseMatrix::innerIndexPtr */
     inline StorageIndex* innerIndexPtr()   { return Base::m_innerIndices; }
+    /** \copydoc SparseMatrix::outerIndexPtr */
     inline StorageIndex* outerIndexPtr()   { return Base::m_outerIndex; }
+    /** \copydoc SparseMatrix::innerNonZeroPtr */
     inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; }
     //----------------------------------------
 
+    /** \copydoc SparseMatrix::coeffRef */
     inline Scalar& coeffRef(Index row, Index col)
     {
       const Index outer = IsRowMajor ? row : col;
@@ -181,14 +186,14 @@ class SparseMapBase<Derived,WriteAccessors>
       Index end = Base::isCompressed() ? Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer];
       eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
       eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
-      Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
+      StorageIndex* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
       const Index id = r - &Base::m_innerIndices[0];
       eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
       return const_cast<Scalar*>(Base::m_values)[id];
     }
     
     inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
-                              Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+                         Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
       : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
     {}
 
@@ -233,13 +238,15 @@ class Map<SparseMatrixType>
       * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr.
       * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed.
       *
+      * This constructor is available only if \c SparseMatrixType is non-const.
+      *
       * More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages".
       */
     inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr,
                StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
       : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
     {}
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
     /** Empty destructor */
     inline ~Map() {}
 };
@@ -254,7 +261,12 @@ class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
     enum { IsRowMajor = Base::IsRowMajor };
 
   public:
-
+#endif
+    /** This is the const version of the above constructor.
+      *
+      * This constructor is available only if \c SparseMatrixType is const, e.g.:
+      * \code Map<const SparseMatrix<double> >  \endcode
+      */
     inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr,
                const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
       : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 760e151eb..64ca5fc44 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -35,7 +35,7 @@ namespace Eigen {
   * \tparam _Index the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
   */
 
 namespace internal {
@@ -440,7 +440,7 @@ class SparseMatrix
     template<typename InputIterators,typename DupFunctor>
     void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
 
-    void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op<Scalar>()); }
+    void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op<Scalar,Scalar>()); }
 
     template<typename DupFunctor>
     void collapseDuplicates(DupFunctor dup_func = DupFunctor());
@@ -979,7 +979,7 @@ template<typename Scalar, int _Options, typename _Index>
 template<typename InputIterators>
 void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
 {
-  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_Index> >(begin, end, *this, internal::scalar_sum_op<Scalar>());
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_Index> >(begin, end, *this, internal::scalar_sum_op<Scalar,Scalar>());
 }
 
 /** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
@@ -1080,7 +1080,7 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
     IndexVector positions(dest.outerSize());
     for (Index j=0; j<dest.outerSize(); ++j)
     {
-      Index tmp = dest.m_outerIndex[j];
+      StorageIndex tmp = dest.m_outerIndex[j];
       dest.m_outerIndex[j] = count;
       positions[j] = count;
       count += tmp;
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 2a90f40bf..8816bcff4 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -21,16 +21,10 @@ namespace Eigen {
   * \tparam Derived is the derived type, e.g. a sparse matrix type, or an expression, etc.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
   */
 template<typename Derived> class SparseMatrixBase
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
-                                            EigenBase<Derived> >
-#else
   : public EigenBase<Derived>
-#endif // not EIGEN_PARSED_BY_DOXYGEN
 {
   public:
 
@@ -142,12 +136,20 @@ template<typename Derived> class SparseMatrixBase
     inline Derived& const_cast_derived() const
     { return *static_cast<Derived*>(const_cast<SparseMatrixBase*>(this)); }
 
-    typedef internal::special_scalar_op_base<Derived, Scalar, RealScalar, EigenBase<Derived> > Base;
-    using Base::operator*;
-    using Base::operator/;
+    typedef EigenBase<Derived> Base;
+
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrix. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -156,8 +158,10 @@ template<typename Derived> class SparseMatrixBase
 #   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #     include EIGEN_SPARSEMATRIXBASE_PLUGIN
 #   endif
-#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }
@@ -263,7 +267,7 @@ template<typename Derived> class SparseMatrixBase
     Derived& operator/=(const Scalar& other);
 
     template<typename OtherDerived> struct CwiseProductDenseReturnType {
-      typedef CwiseBinaryOp<internal::scalar_product_op<typename internal::scalar_product_traits<
+      typedef CwiseBinaryOp<internal::scalar_product_op<typename ScalarBinaryOpTraits<
                                                           typename internal::traits<Derived>::Scalar,
                                                           typename internal::traits<OtherDerived>::Scalar
                                                         >::ReturnType>,
diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h
index cbd0db71b..7a5ad0635 100644
--- a/Eigen/src/SparseCore/SparseProduct.h
+++ b/Eigen/src/SparseCore/SparseProduct.h
@@ -45,7 +45,7 @@ struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
 
   // dense += sparse * sparse
   template<typename Dest,typename ActualLhs>
-  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, int* = typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type(0) )
+  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
   {
     typedef typename nested_eval<ActualLhs,Dynamic>::type LhsNested;
     typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
@@ -57,7 +57,7 @@ struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
 
   // dense -= sparse * sparse
   template<typename Dest>
-  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, int* = typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type(0) )
+  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type* = 0)
   {
     addTo(dst, -lhs, rhs);
   }
@@ -99,10 +99,10 @@ struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, SparseShape, Produc
 
 // dense = sparse-product (can be sparse*sparse, sparse*perm, etc.)
 template< typename DstXprType, typename Lhs, typename Rhs>
-struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
 {
   typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
   {
     generic_product_impl<Lhs, Rhs>::evalTo(dst,src.lhs(),src.rhs());
   }
@@ -110,10 +110,10 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::assig
 
 // dense += sparse-product (can be sparse*sparse, sparse*perm, etc.)
 template< typename DstXprType, typename Lhs, typename Rhs>
-struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::add_assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::add_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
 {
   typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
   {
     generic_product_impl<Lhs, Rhs>::addTo(dst,src.lhs(),src.rhs());
   }
@@ -121,24 +121,24 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::add_a
 
 // dense -= sparse-product (can be sparse*sparse, sparse*perm, etc.)
 template< typename DstXprType, typename Lhs, typename Rhs>
-struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::sub_assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::sub_assign_op<typename DstXprType::Scalar,typename Product<Lhs,Rhs,AliasFreeProduct>::Scalar>, Sparse2Dense>
 {
   typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &)
   {
     generic_product_impl<Lhs, Rhs>::subTo(dst,src.lhs(),src.rhs());
   }
 };
 
 template<typename Lhs, typename Rhs, int Options>
-struct evaluator<SparseView<Product<Lhs, Rhs, Options> > > 
+struct unary_evaluator<SparseView<Product<Lhs, Rhs, Options> >, IteratorBased>
  : public evaluator<typename Product<Lhs, Rhs, DefaultProduct>::PlainObject>
 {
   typedef SparseView<Product<Lhs, Rhs, Options> > XprType;
   typedef typename XprType::PlainObject PlainObject;
   typedef evaluator<PlainObject> Base;
-  
-  explicit evaluator(const XprType& xpr)
+
+  explicit unary_evaluator(const XprType& xpr)
     : m_result(xpr.rows(), xpr.cols())
   {
     using std::abs;
@@ -147,13 +147,13 @@ struct evaluator<SparseView<Product<Lhs, Rhs, Options> > >
     typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
     LhsNested lhsNested(xpr.nestedExpression().lhs());
     RhsNested rhsNested(xpr.nestedExpression().rhs());
-    
+
     internal::sparse_sparse_product_with_pruning_selector<typename remove_all<LhsNested>::type,
                                                           typename remove_all<RhsNested>::type, PlainObject>::run(lhsNested,rhsNested,m_result,
                                                                                                                   abs(xpr.reference())*xpr.epsilon());
   }
-  
-protected:  
+
+protected:
   PlainObject m_result;
 };
 
diff --git a/Eigen/src/SparseCore/SparseRedux.h b/Eigen/src/SparseCore/SparseRedux.h
index 2a9718cfb..458774962 100644
--- a/Eigen/src/SparseCore/SparseRedux.h
+++ b/Eigen/src/SparseCore/SparseRedux.h
@@ -30,7 +30,10 @@ typename internal::traits<SparseMatrix<_Scalar,_Options,_Index> >::Scalar
 SparseMatrix<_Scalar,_Options,_Index>::sum() const
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
-  return Matrix<Scalar,1,Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
+  if(this->isCompressed())
+    return Matrix<Scalar,1,Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
+  else
+    return Base::sum();
 }
 
 template<typename _Scalar, int _Options, typename _Index>
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index b92bb17e2..d31d9babf 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -218,18 +218,18 @@ struct SparseSelfAdjoint2Sparse {};
 template<> struct AssignmentKind<SparseShape,SparseSelfAdjointShape> { typedef SparseSelfAdjoint2Sparse Kind; };
 template<> struct AssignmentKind<SparseSelfAdjointShape,SparseShape> { typedef Sparse2Sparse Kind; };
 
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
-struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse, Scalar>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
 {
   typedef typename DstXprType::StorageIndex StorageIndex;
   template<typename DestScalar,int StorageOrder>
-  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
   }
   
   template<typename DestScalar>
-  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
   {
     // TODO directly evaluate into dst;
     SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
@@ -250,11 +250,11 @@ template<int Mode, typename SparseLhsType, typename DenseRhsType, typename Dense
 inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
 {
   EIGEN_ONLY_USED_FOR_DEBUG(alpha);
-  // TODO use alpha
-  eigen_assert(alpha==AlphaType(1) && "alpha != 1 is not implemented yet, sorry");
   
-  typedef evaluator<SparseLhsType> LhsEval;
-  typedef typename evaluator<SparseLhsType>::InnerIterator LhsIterator;
+  typedef typename internal::nested_eval<SparseLhsType,DenseRhsType::MaxColsAtCompileTime>::type SparseLhsTypeNested;
+  typedef typename internal::remove_all<SparseLhsTypeNested>::type SparseLhsTypeNestedCleaned;
+  typedef evaluator<SparseLhsTypeNestedCleaned> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsIterator;
   typedef typename SparseLhsType::Scalar LhsScalar;
   
   enum {
@@ -266,39 +266,53 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
     ProcessSecondHalf = !ProcessFirstHalf
   };
   
-  LhsEval lhsEval(lhs);
-  
-  for (Index j=0; j<lhs.outerSize(); ++j)
+  SparseLhsTypeNested lhs_nested(lhs);
+  LhsEval lhsEval(lhs_nested);
+
+  // work on one column of rhs/res at a time
+  for (Index k=0; k<rhs.cols(); ++k)
   {
-    LhsIterator i(lhsEval,j);
-    if (ProcessSecondHalf)
+    for (Index j=0; j<lhs.outerSize(); ++j)
     {
-      while (i && i.index()<j) ++i;
-      if(i && i.index()==j)
+      LhsIterator i(lhsEval,j);
+      // handle diagonal coeff
+      if (ProcessSecondHalf)
       {
-        res.row(j) += i.value() * rhs.row(j);
-        ++i;
+        while (i && i.index()<j) ++i;
+        if(i && i.index()==j)
+        {
+          res(j,k) += alpha * i.value() * rhs(j,k);
+          ++i;
+        }
       }
+
+      // rhs coefficient pre-multiplied by alpha, used for the scattered updates of res below
+      typename ScalarBinaryOpTraits<AlphaType, typename DenseRhsType::Scalar>::ReturnType rhs_j(alpha*rhs(j,k));
+      // accumulator for partial scalar product
+      typename DenseResType::Scalar res_j(0);
+      for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
+      {
+        LhsScalar lhs_ij = i.value();
+        if(!LhsIsRowMajor) lhs_ij = numext::conj(lhs_ij);
+        res_j += lhs_ij * rhs(i.index(),k);
+        res(i.index(),k) += numext::conj(lhs_ij) * rhs_j;
+      }
+      res(j,k) += alpha * res_j;
+
+      // handle diagonal coeff
+      if (ProcessFirstHalf && i && (i.index()==j))
+        res(j,k) += alpha * i.value() * rhs(j,k);
     }
-    for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
-    {
-      Index a = LhsIsRowMajor ? j : i.index();
-      Index b = LhsIsRowMajor ? i.index() : j;
-      LhsScalar v = i.value();
-      res.row(a) += (v) * rhs.row(b);
-      res.row(b) += numext::conj(v) * rhs.row(a);
-    }
-    if (ProcessFirstHalf && i && (i.index()==j))
-      res.row(j) += i.value() * rhs.row(j);
   }
 }
 
 
 template<typename LhsView, typename Rhs, int ProductType>
 struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType>
+: generic_product_impl_base<LhsView, Rhs, generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType> >
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const LhsView& lhsView, const Rhs& rhs)
+  static void scaleAndAddTo(Dest& dst, const LhsView& lhsView, const Rhs& rhs, const typename Dest::Scalar& alpha)
   {
     typedef typename LhsView::_MatrixTypeNested Lhs;
     typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
@@ -306,16 +320,16 @@ struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, Pr
     LhsNested lhsNested(lhsView.matrix());
     RhsNested rhsNested(rhs);
     
-    dst.setZero();
-    internal::sparse_selfadjoint_time_dense_product<LhsView::Mode>(lhsNested, rhsNested, dst, typename Dest::Scalar(1));
+    internal::sparse_selfadjoint_time_dense_product<LhsView::Mode>(lhsNested, rhsNested, dst, alpha);
   }
 };
 
 template<typename Lhs, typename RhsView, int ProductType>
 struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType>
+: generic_product_impl_base<Lhs, RhsView, generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType> >
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const RhsView& rhsView)
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const RhsView& rhsView, const typename Dest::Scalar& alpha)
   {
     typedef typename RhsView::_MatrixTypeNested Rhs;
     typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
@@ -323,10 +337,9 @@ struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, Pr
     LhsNested lhsNested(lhs);
     RhsNested rhsNested(rhsView.matrix());
     
-    dst.setZero();
-    // transpoe everything
+    // transpose everything
     Transpose<Dest> dstT(dst);
-    internal::sparse_selfadjoint_time_dense_product<RhsView::Mode>(rhsNested.transpose(), lhsNested.transpose(), dstT, typename Dest::Scalar(1));
+    internal::sparse_selfadjoint_time_dense_product<RhsView::Mode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
   }
 };
 
@@ -586,12 +599,12 @@ class SparseSymmetricPermutationProduct
 namespace internal {
   
 template<typename DstXprType, typename MatrixType, int Mode, typename Scalar>
-struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>, internal::assign_op<Scalar>, Sparse2Sparse>
+struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>, internal::assign_op<Scalar,typename MatrixType::Scalar>, Sparse2Sparse>
 {
   typedef SparseSymmetricPermutationProduct<MatrixType,Mode> SrcXprType;
   typedef typename DstXprType::StorageIndex DstIndex;
   template<int Options>
-  static void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
   {
     // internal::permute_symm_to_fullsymm<Mode>(m_matrix,_dest,m_perm.indices().data());
     SparseMatrix<Scalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
@@ -600,7 +613,7 @@ struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>
   }
   
   template<typename DestType,unsigned int DestMode>
-  static void run(SparseSelfAdjointView<DestType,DestMode>& dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  static void run(SparseSelfAdjointView<DestType,DestMode>& dst, const SrcXprType &src, const internal::assign_op<Scalar,typename MatrixType::Scalar> &)
   {
     internal::permute_symm_to_symm<Mode,DestMode>(src.matrix(),dst.matrix(),src.perm().indices().data());
   }
diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index 20078f72c..21c419002 100644
--- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -51,7 +51,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.reserve(estimated_nnz_prod);
-  double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols());
+  double ratioColRes = double(estimated_nnz_prod)/(double(lhs.rows())*double(rhs.cols()));
   for (Index j=0; j<cols; ++j)
   {
     // FIXME:
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 167a9886c..00ee6ec89 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -22,7 +22,7 @@ namespace Eigen {
   * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
   */
 
 namespace internal {
diff --git a/Eigen/src/SparseLU/CMakeLists.txt b/Eigen/src/SparseLU/CMakeLists.txt
deleted file mode 100644
index 69729ee89..000000000
--- a/Eigen/src/SparseLU/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseLU_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseLU_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseLU COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index ae3685ac8..fe93aae0b 100644
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -72,14 +72,14 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         
         // load and expand a RN x RK block of B
         Packet b00, b10, b20, b30, b01, b11, b21, b31;
-                  b00 = pset1<Packet>(Bc0[0]);
-                  b10 = pset1<Packet>(Bc0[1]);
-        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
-        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
-                  b01 = pset1<Packet>(Bc1[0]);
-                  b11 = pset1<Packet>(Bc1[1]);
-        if(RK==4) b21 = pset1<Packet>(Bc1[2]);
-        if(RK==4) b31 = pset1<Packet>(Bc1[3]);
+                  { b00 = pset1<Packet>(Bc0[0]); }
+                  { b10 = pset1<Packet>(Bc0[1]); }
+        if(RK==4) { b20 = pset1<Packet>(Bc0[2]); }
+        if(RK==4) { b30 = pset1<Packet>(Bc0[3]); }
+                  { b01 = pset1<Packet>(Bc1[0]); }
+                  { b11 = pset1<Packet>(Bc1[1]); }
+        if(RK==4) { b21 = pset1<Packet>(Bc1[2]); }
+        if(RK==4) { b31 = pset1<Packet>(Bc1[3]); }
         
         Packet a0, a1, a2, a3, c0, c1, t0, t1;
         
diff --git a/Eigen/src/SparseQR/CMakeLists.txt b/Eigen/src/SparseQR/CMakeLists.txt
deleted file mode 100644
index f9ddf2bdb..000000000
--- a/Eigen/src/SparseQR/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseQR_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseQR_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SparseQR/ COMPONENT Devel
-  )
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index acd7f7e10..2d4498b03 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -705,12 +705,12 @@ struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> >
 };
 
 template< typename DstXprType, typename SparseQRType>
-struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Sparse>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>, Sparse2Sparse>
 {
   typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
   typedef typename DstXprType::Scalar Scalar;
   typedef typename DstXprType::StorageIndex StorageIndex;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
   {
     typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows());
     idMat.setIdentity();
@@ -721,12 +721,12 @@ struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal:
 };
 
 template< typename DstXprType, typename SparseQRType>
-struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>, Sparse2Dense>
 {
   typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
   typedef typename DstXprType::Scalar Scalar;
   typedef typename DstXprType::StorageIndex StorageIndex;
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &/*func*/)
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
   {
     dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows());
   }
diff --git a/Eigen/src/StlSupport/CMakeLists.txt b/Eigen/src/StlSupport/CMakeLists.txt
deleted file mode 100644
index 0f094f637..000000000
--- a/Eigen/src/StlSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_StlSupport_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_StlSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/StlSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/SuperLUSupport/CMakeLists.txt b/Eigen/src/SuperLUSupport/CMakeLists.txt
deleted file mode 100644
index b28ebe583..000000000
--- a/Eigen/src/SuperLUSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SuperLUSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_SuperLUSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/SuperLUSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 7e2efd452..88c44bcd0 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -10,15 +10,16 @@
 #ifndef EIGEN_SUPERLUSUPPORT_H
 #define EIGEN_SUPERLUSUPPORT_H
 
-namespace Eigen { 
+namespace Eigen {
 
+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
 #define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
     extern "C" {                                                                                          \
       extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
                                 char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
                                 void *, int, SuperMatrix *, SuperMatrix *,                                \
                                 FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
-                                mem_usage_t *, SuperLUStat_t *, int *);                           \
+                                GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *);                     \
     }                                                                                                     \
     inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
          int *perm_c, int *perm_r, int *etree, char *equed,                                               \
@@ -28,12 +29,37 @@ namespace Eigen {
          FLOATTYPE *recip_pivot_growth,                                                                   \
          FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
          SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
-    mem_usage_t mem_usage;                                                                        \
+    mem_usage_t mem_usage;                                                                                \
+    GlobalLU_t gLU;                                                                                       \
+    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
+         U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
+         ferr, berr, &gLU, &mem_usage, stats, info);                                                      \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
+  }
+#else // SuperLU < 5.0 (or SUPERLU_MAJOR_VERSION not defined): gssvx takes no GlobalLU_t* argument
+#define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
+    extern "C" {                                                                                          \
+      extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
+                                char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
+                                void *, int, SuperMatrix *, SuperMatrix *,                                \
+                                FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
+                                mem_usage_t *, SuperLUStat_t *, int *);                                   \
+    }                                                                                                     \
+    inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
+         int *perm_c, int *perm_r, int *etree, char *equed,                                               \
+         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                                      \
+         SuperMatrix *U, void *work, int lwork,                                                           \
+         SuperMatrix *B, SuperMatrix *X,                                                                  \
+         FLOATTYPE *recip_pivot_growth,                                                                   \
+         FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
+         SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
+    mem_usage_t mem_usage;                                                                                \
     PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
          U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
          ferr, berr, &mem_usage, stats, info);                                                            \
     return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
   }
+#endif
 
 DECL_GSSVX(s,float,float)
 DECL_GSSVX(c,float,std::complex<float>)
diff --git a/Eigen/src/UmfPackSupport/CMakeLists.txt b/Eigen/src/UmfPackSupport/CMakeLists.txt
deleted file mode 100644
index a57de0020..000000000
--- a/Eigen/src/UmfPackSupport/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_UmfPackSupport_SRCS "*.h")
-
-INSTALL(FILES 
-  ${Eigen_UmfPackSupport_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/UmfPackSupport COMPONENT Devel
-  )
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 929a01acb..dc74de935 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -379,7 +379,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     mutable bool m_extractedDataAreDirty;
     
   private:
-    UmfPackLU(UmfPackLU& ) { }
+    UmfPackLU(const UmfPackLU& ) { } // private with an empty body: copies nothing; presumably declared only to forbid copying -- confirm
 };
 
 
diff --git a/Eigen/src/misc/CMakeLists.txt b/Eigen/src/misc/CMakeLists.txt
deleted file mode 100644
index a58ffb745..000000000
--- a/Eigen/src/misc/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_misc_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_misc_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/misc COMPONENT Devel
-  )
diff --git a/Eigen/src/misc/RealSvd2x2.h b/Eigen/src/misc/RealSvd2x2.h
new file mode 100644
index 000000000..abb4d3c2f
--- /dev/null
+++ b/Eigen/src/misc/RealSvd2x2.h
@@ -0,0 +1,55 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALSVD2X2_H
+#define EIGEN_REALSVD2X2_H
+
+namespace Eigen {
+
+namespace internal {
+/** \internal Sets *j_left and *j_right from the real parts of the 2x2 block of \a matrix at rows/columns p and q. */
+template<typename MatrixType, typename RealScalar, typename Index>
+void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
+                         JacobiRotation<RealScalar> *j_left,   // output, overwritten; dereferenced unconditionally, so must be non-null
+                         JacobiRotation<RealScalar> *j_right)  // output, overwritten; dereferenced unconditionally, so must be non-null
+{
+  using std::sqrt;
+  using std::abs;
+  Matrix<RealScalar,2,2> m;  // real-valued working copy of the 2x2 block
+  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
+       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
+  JacobiRotation<RealScalar> rot1;
+  RealScalar t = m.coeff(0,0) + m.coeff(1,1);  // sum of the diagonal entries
+  RealScalar d = m.coeff(1,0) - m.coeff(0,1);  // difference of the off-diagonal entries
+
+  if(abs(d) < (std::numeric_limits<RealScalar>::min)())  // (min) is parenthesized so a function-like min macro cannot expand here
+  {
+    rot1.s() = RealScalar(0);  // |d| is below numeric_limits::min(): use the identity rotation
+    rot1.c() = RealScalar(1);
+  }
+  else
+  {
+    // If d!=0, then t/d cannot overflow because the magnitudes of the
+    // entries forming d are not too small compared to the ones forming t.
+    RealScalar u = t / d;
+    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
+    rot1.s() = RealScalar(1) / tmp;  // (c,s) = (u,1)/sqrt(1+u^2), hence c^2 + s^2 == 1
+    rot1.c() = u / tmp;
+  }
+  m.applyOnTheLeft(0,1,rot1);  // NOTE(review): presumably makes m symmetric, as makeJacobi() expects -- confirm
+  j_right->makeJacobi(m,0,1);
+  *j_left = rot1 * j_right->transpose();
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_REALSVD2X2_H
diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
new file mode 100755
index 000000000..8c7e79b03
--- /dev/null
+++ b/Eigen/src/misc/lapacke.h
@@ -0,0 +1,16291 @@
+/*****************************************************************************
+  Copyright (c) 2010, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+******************************************************************************
+* Contents: Native C interface to LAPACK
+* Author: Intel Corporation
+* Generated November, 2011
+*****************************************************************************/
+
+#ifndef _MKL_LAPACKE_H_
+
+#ifndef _LAPACKE_H_
+#define _LAPACKE_H_
+
+/*
+*  Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
+*/
+#ifdef HAVE_LAPACK_CONFIG_H
+#include "lapacke_config.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+#ifndef lapack_int
+#define lapack_int     int
+#endif
+
+#ifndef lapack_logical
+#define lapack_logical lapack_int
+#endif
+
+/* Complex types are structures equivalent to the
+* Fortran complex types COMPLEX(4) and COMPLEX(8).
+*
+* One can also redefine the types with one's own types
+* for example by including in the code definitions like
+*
+* #define lapack_complex_float std::complex<float>
+* #define lapack_complex_double std::complex<double>
+*
+* or define these types on the command line:
+*
+* -Dlapack_complex_float="std::complex<float>"
+* -Dlapack_complex_double="std::complex<double>"
+*/
+
+#ifndef LAPACK_COMPLEX_CUSTOM
+
+/* Complex type (single precision) */
+#ifndef lapack_complex_float
+#include <complex.h>
+#define lapack_complex_float    float _Complex
+#endif
+
+#ifndef lapack_complex_float_real
+#define lapack_complex_float_real(z)       (creal(z))
+#endif
+
+#ifndef lapack_complex_float_imag
+#define lapack_complex_float_imag(z)       (cimag(z))
+#endif
+
+lapack_complex_float lapack_make_complex_float( float re, float im );
+
+/* Complex type (double precision) */
+#ifndef lapack_complex_double
+#include <complex.h>
+#define lapack_complex_double   double _Complex
+#endif
+
+#ifndef lapack_complex_double_real
+#define lapack_complex_double_real(z)      (creal(z))
+#endif
+
+#ifndef lapack_complex_double_imag
+#define lapack_complex_double_imag(z)       (cimag(z))
+#endif
+
+lapack_complex_double lapack_make_complex_double( double re, double im );
+
+#endif
+
+#ifndef LAPACKE_malloc
+#define LAPACKE_malloc( size ) malloc( size )
+#endif
+#ifndef LAPACKE_free
+#define LAPACKE_free( p )      free( p )
+#endif
+
+#define LAPACK_C2INT( x ) ((lapack_int)(*((float*)&(x))))   /* reads the first float stored at x and converts it to lapack_int */
+#define LAPACK_Z2INT( x ) ((lapack_int)(*((double*)&(x))))  /* reads the first double stored at x and converts it to lapack_int */
+
+#define LAPACK_ROW_MAJOR               101
+#define LAPACK_COL_MAJOR               102
+
+#define LAPACK_WORK_MEMORY_ERROR       -1010
+#define LAPACK_TRANSPOSE_MEMORY_ERROR  -1011
+
+/* Callback logical functions of one, two, or three arguments are used
+*  to select eigenvalues to sort to the top left of the Schur form.
+*  An eigenvalue is selected if the function returns TRUE (non-zero). */
+
+typedef lapack_logical (*LAPACK_S_SELECT2) ( const float*, const float* );
+typedef lapack_logical (*LAPACK_S_SELECT3)
+    ( const float*, const float*, const float* );
+typedef lapack_logical (*LAPACK_D_SELECT2) ( const double*, const double* );
+typedef lapack_logical (*LAPACK_D_SELECT3)
+    ( const double*, const double*, const double* );
+
+typedef lapack_logical (*LAPACK_C_SELECT1) ( const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_C_SELECT2)
+    ( const lapack_complex_float*, const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_Z_SELECT1) ( const lapack_complex_double* );
+typedef lapack_logical (*LAPACK_Z_SELECT2)
+    ( const lapack_complex_double*, const lapack_complex_double* );
+
+#include "lapacke_mangling.h"
+
+#define LAPACK_lsame LAPACK_GLOBAL(lsame,LSAME)
+lapack_logical LAPACK_lsame( char* ca,  char* cb,
+                              lapack_int lca, lapack_int lcb );
+
+/* C-LAPACK function prototypes */
+
+lapack_int LAPACKE_sbdsdc( int matrix_order, char uplo, char compq,
+                           lapack_int n, float* d, float* e, float* u,
+                           lapack_int ldu, float* vt, lapack_int ldvt, float* q,
+                           lapack_int* iq );
+lapack_int LAPACKE_dbdsdc( int matrix_order, char uplo, char compq,
+                           lapack_int n, double* d, double* e, double* u,
+                           lapack_int ldu, double* vt, lapack_int ldvt,
+                           double* q, lapack_int* iq );
+
+lapack_int LAPACKE_sbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           float* d, float* e, float* vt, lapack_int ldvt,
+                           float* u, lapack_int ldu, float* c, lapack_int ldc );
+lapack_int LAPACKE_dbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           double* d, double* e, double* vt, lapack_int ldvt,
+                           double* u, lapack_int ldu, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_cbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           float* d, float* e, lapack_complex_float* vt,
+                           lapack_int ldvt, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_zbdsqr( int matrix_order, char uplo, lapack_int n,
+                           lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                           double* d, double* e, lapack_complex_double* vt,
+                           lapack_int ldvt, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_sdisna( char job, lapack_int m, lapack_int n, const float* d,
+                           float* sep );
+lapack_int LAPACKE_ddisna( char job, lapack_int m, lapack_int n,
+                           const double* d, double* sep );
+
+lapack_int LAPACKE_sgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, float* ab, lapack_int ldab, float* d,
+                           float* e, float* q, lapack_int ldq, float* pt,
+                           lapack_int ldpt, float* c, lapack_int ldc );
+lapack_int LAPACKE_dgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, double* ab, lapack_int ldab,
+                           double* d, double* e, double* q, lapack_int ldq,
+                           double* pt, lapack_int ldpt, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_cgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, lapack_complex_float* ab,
+                           lapack_int ldab, float* d, float* e,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* pt, lapack_int ldpt,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zgbbrd( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int ncc, lapack_int kl,
+                           lapack_int ku, lapack_complex_double* ab,
+                           lapack_int ldab, double* d, double* e,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* pt, lapack_int ldpt,
+                           lapack_complex_double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* ab,
+                           lapack_int ldab, const lapack_int* ipiv, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* ab,
+                           lapack_int ldab, const lapack_int* ipiv,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zgbcon( int matrix_order, char norm, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_sgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* ab,
+                           lapack_int ldab, float* r, float* c, float* rowcnd,
+                           float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* ab,
+                           lapack_int ldab, double* r, double* c,
+                           double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           float* r, float* c, float* rowcnd, float* colcnd,
+                           float* amax );
+lapack_int LAPACKE_zgbequ( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           double* r, double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+
+lapack_int LAPACKE_sgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku, const float* ab,
+                            lapack_int ldab, float* r, float* c, float* rowcnd,
+                            float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku, const double* ab,
+                            lapack_int ldab, double* r, double* c,
+                            double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku,
+                            const lapack_complex_float* ab, lapack_int ldab,
+                            float* r, float* c, float* rowcnd, float* colcnd,
+                            float* amax );
+lapack_int LAPACKE_zgbequb( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_int kl, lapack_int ku,
+                            const lapack_complex_double* ab, lapack_int ldab,
+                            double* r, double* c, double* rowcnd,
+                            double* colcnd, double* amax );
+
+lapack_int LAPACKE_sgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, const float* afb,
+                           lapack_int ldafb, const lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, const double* afb,
+                           lapack_int ldafb, const lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* afb, lapack_int ldafb,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgbrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* afb, lapack_int ldafb,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_sgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const float* ab, lapack_int ldab,
+                            const float* afb, lapack_int ldafb,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const double* ab, lapack_int ldab,
+                            const double* afb, lapack_int ldafb,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const lapack_complex_float* ab,
+                            lapack_int ldab, const lapack_complex_float* afb,
+                            lapack_int ldafb, const lapack_int* ipiv,
+                            const float* r, const float* c,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zgbrfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, const lapack_complex_double* ab,
+                            lapack_int ldab, const lapack_complex_double* afb,
+                            lapack_int ldafb, const lapack_int* ipiv,
+                            const double* r, const double* c,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+
+lapack_int LAPACKE_sgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs, float* ab,
+                          lapack_int ldab, lapack_int* ipiv, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs, double* ab,
+                          lapack_int ldab, lapack_int* ipiv, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgbsv( int matrix_order, lapack_int n, lapack_int kl,
+                          lapack_int ku, lapack_int nrhs,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_sgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, float* ab, lapack_int ldab,
+                           float* afb, lapack_int ldafb, lapack_int* ipiv,
+                           char* equed, float* r, float* c, float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_dgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, double* ab, lapack_int ldab,
+                           double* afb, lapack_int ldafb, lapack_int* ipiv,
+                           char* equed, double* r, double* c, double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+lapack_int LAPACKE_cgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_float* ab,
+                           lapack_int ldab, lapack_complex_float* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed,
+                           float* r, float* c, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr, float* rpivot );
+lapack_int LAPACKE_zgbsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_double* ab,
+                           lapack_int ldab, lapack_complex_double* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed,
+                           double* r, double* c, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr, double* rpivot );
+/* ?gbsvxx: in the c/z variants only ab, afb, b and x are complex arrays. */
+lapack_int LAPACKE_sgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, float* ab, lapack_int ldab,
+                            float* afb, lapack_int ldafb, lapack_int* ipiv,
+                            char* equed, float* r, float* c, float* b,
+                            lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, double* ab, lapack_int ldab,
+                            double* afb, lapack_int ldafb, lapack_int* ipiv,
+                            char* equed, double* r, double* c, double* b,
+                            lapack_int ldb, double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_cgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, lapack_complex_float* ab,
+                            lapack_int ldab, lapack_complex_float* afb,
+                            lapack_int ldafb, lapack_int* ipiv, char* equed,
+                            float* r, float* c, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* rpvgrw,
+                            float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zgbsvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int kl, lapack_int ku,
+                            lapack_int nrhs, lapack_complex_double* ab,
+                            lapack_int ldab, lapack_complex_double* afb,
+                            lapack_int ldafb, lapack_int* ipiv, char* equed,
+                            double* r, double* c, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+/* ?gbtrf (s/d/c/z): ab and ipiv are non-const; the callee may write both. */
+lapack_int LAPACKE_sgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, float* ab,
+                           lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_dgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, double* ab,
+                           lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_cgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgbtrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_int* ipiv );
+/* ?gbtrs (s/d/c/z): ab and ipiv are const; b is the only mutable array. */
+lapack_int LAPACKE_sgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zgbtrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int kl, lapack_int ku, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+/* ?gebak: scale is const and real in every variant; v is mutable. */
+lapack_int LAPACKE_sgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* scale,
+                           lapack_int m, float* v, lapack_int ldv );
+lapack_int LAPACKE_dgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* scale,
+                           lapack_int m, double* v, lapack_int ldv );
+lapack_int LAPACKE_cgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* scale,
+                           lapack_int m, lapack_complex_float* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_zgebak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* scale,
+                           lapack_int m, lapack_complex_double* v,
+                           lapack_int ldv );
+/* ?gebal: ilo/ihi are passed by pointer; scale is real in every variant. */
+lapack_int LAPACKE_sgebal( int matrix_order, char job, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* ilo, lapack_int* ihi,
+                           float* scale );
+lapack_int LAPACKE_dgebal( int matrix_order, char job, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* ilo, lapack_int* ihi,
+                           double* scale );
+lapack_int LAPACKE_cgebal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ilo, lapack_int* ihi, float* scale );
+lapack_int LAPACKE_zgebal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ilo, lapack_int* ihi, double* scale );
+/* ?gebrd: d and e are real arrays; tauq and taup share a's element type. */
+lapack_int LAPACKE_sgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* d, float* e,
+                           float* tauq, float* taup );
+lapack_int LAPACKE_dgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* d, double* e,
+                           double* tauq, double* taup );
+lapack_int LAPACKE_cgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* d,
+                           float* e, lapack_complex_float* tauq,
+                           lapack_complex_float* taup );
+lapack_int LAPACKE_zgebrd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda, double* d,
+                           double* e, lapack_complex_double* tauq,
+                           lapack_complex_double* taup );
+/* ?gecon: a is const; anorm is passed by value, rcond by pointer. */
+lapack_int LAPACKE_sgecon( int matrix_order, char norm, lapack_int n,
+                           const float* a, lapack_int lda, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dgecon( int matrix_order, char norm, lapack_int n,
+                           const double* a, lapack_int lda, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_cgecon( int matrix_order, char norm, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_zgecon( int matrix_order, char norm, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double anorm, double* rcond );
+/* ?geequ: a is const; r, c, rowcnd, colcnd and amax are real pointers. */
+lapack_int LAPACKE_sgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const float* a, lapack_int lda, float* r, float* c,
+                           float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, double* r,
+                           double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+lapack_int LAPACKE_cgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float* r, float* c, float* rowcnd, float* colcnd,
+                           float* amax );
+lapack_int LAPACKE_zgeequ( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double* r, double* c, double* rowcnd, double* colcnd,
+                           double* amax );
+/* ?geequb: same parameter lists as the ?geequ declarations. */
+lapack_int LAPACKE_sgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const float* a, lapack_int lda, float* r, float* c,
+                            float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const double* a, lapack_int lda, double* r,
+                            double* c, double* rowcnd, double* colcnd,
+                            double* amax );
+lapack_int LAPACKE_cgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* r, float* c, float* rowcnd, float* colcnd,
+                            float* amax );
+lapack_int LAPACKE_zgeequb( int matrix_order, lapack_int m, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* r, double* c, double* rowcnd,
+                            double* colcnd, double* amax );
+/* ?gees: s/d take a SELECT2 callback and wr/wi; c/z take SELECT1 and w. */
+lapack_int LAPACKE_sgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                          lapack_int lda, lapack_int* sdim, float* wr,
+                          float* wi, float* vs, lapack_int ldvs );
+lapack_int LAPACKE_dgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                          lapack_int lda, lapack_int* sdim, double* wr,
+                          double* wi, double* vs, lapack_int ldvs );
+lapack_int LAPACKE_cgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_C_SELECT1 select, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_int* sdim, lapack_complex_float* w,
+                          lapack_complex_float* vs, lapack_int ldvs );
+lapack_int LAPACKE_zgees( int matrix_order, char jobvs, char sort,
+                          LAPACK_Z_SELECT1 select, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_int* sdim, lapack_complex_double* w,
+                          lapack_complex_double* vs, lapack_int ldvs );
+/* ?geesx: the ?gees parameters plus sense, rconde and rcondv. */
+lapack_int LAPACKE_sgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_S_SELECT2 select, char sense, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* sdim,
+                           float* wr, float* wi, float* vs, lapack_int ldvs,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_D_SELECT2 select, char sense, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* sdim,
+                           double* wr, double* wi, double* vs, lapack_int ldvs,
+                           double* rconde, double* rcondv );
+lapack_int LAPACKE_cgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_C_SELECT1 select, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* sdim, lapack_complex_float* w,
+                           lapack_complex_float* vs, lapack_int ldvs,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_zgeesx( int matrix_order, char jobvs, char sort,
+                           LAPACK_Z_SELECT1 select, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* sdim, lapack_complex_double* w,
+                           lapack_complex_double* vs, lapack_int ldvs,
+                           double* rconde, double* rcondv );
+/* ?geev: s/d take separate wr and wi arrays; c/z take one complex w. */
+lapack_int LAPACKE_sgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, float* a, lapack_int lda, float* wr,
+                          float* wi, float* vl, lapack_int ldvl, float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_dgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, double* a, lapack_int lda, double* wr,
+                          double* wi, double* vl, lapack_int ldvl, double* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_cgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* w, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_zgeev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* w,
+                          lapack_complex_double* vl, lapack_int ldvl,
+                          lapack_complex_double* vr, lapack_int ldvr );
+/* ?geevx: ilo/ihi by pointer; scale, abnrm, rconde and rcondv are real. */
+lapack_int LAPACKE_sgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, float* a,
+                           lapack_int lda, float* wr, float* wi, float* vl,
+                           lapack_int ldvl, float* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, float* scale,
+                           float* abnrm, float* rconde, float* rcondv );
+lapack_int LAPACKE_dgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, double* a,
+                           lapack_int lda, double* wr, double* wi, double* vl,
+                           lapack_int ldvl, double* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, double* scale,
+                           double* abnrm, double* rconde, double* rcondv );
+lapack_int LAPACKE_cgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* w, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           float* scale, float* abnrm, float* rconde,
+                           float* rcondv );
+lapack_int LAPACKE_zgeevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* w, lapack_complex_double* vl,
+                           lapack_int ldvl, lapack_complex_double* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           double* scale, double* abnrm, double* rconde,
+                           double* rcondv );
+/* ?gehrd: ilo and ihi are passed by value; tau shares a's element type. */
+lapack_int LAPACKE_sgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, float* a, lapack_int lda,
+                           float* tau );
+lapack_int LAPACKE_dgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, double* a, lapack_int lda,
+                           double* tau );
+lapack_int LAPACKE_cgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* tau );
+lapack_int LAPACKE_zgehrd( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* tau );
+/* ?gejsv: only s and d variants are declared here; six char job flags. */
+lapack_int LAPACKE_sgejsv( int matrix_order, char joba, char jobu, char jobv,
+                           char jobr, char jobt, char jobp, lapack_int m,
+                           lapack_int n, float* a, lapack_int lda, float* sva,
+                           float* u, lapack_int ldu, float* v, lapack_int ldv,
+                           float* stat, lapack_int* istat );
+lapack_int LAPACKE_dgejsv( int matrix_order, char joba, char jobu, char jobv,
+                           char jobr, char jobt, char jobp, lapack_int m,
+                           lapack_int n, double* a, lapack_int lda, double* sva,
+                           double* u, lapack_int ldu, double* v, lapack_int ldv,
+                           double* stat, lapack_int* istat );
+/* ?gelq2 (s/d/c/z): a and tau are mutable and share one element type. */
+lapack_int LAPACKE_sgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgelq2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+/* ?gelqf (s/d/c/z): takes m, n, a, lda and tau; no argument is const. */
+lapack_int LAPACKE_sgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgelqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+/* ?gels (s/d/c/z): trans is a char flag; a and b are both mutable. */
+lapack_int LAPACKE_sgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs, float* a,
+                          lapack_int lda, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs, double* a,
+                          lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgels( int matrix_order, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb );
+/* ?gelsd: s is a real array, rcond is by value, rank is by pointer. */
+lapack_int LAPACKE_sgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_cgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelsd( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+/* ?gelss: same parameter lists as the ?gelsd declarations. */
+lapack_int LAPACKE_sgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_cgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* s, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelss( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* s, double rcond,
+                           lapack_int* rank );
+/* ?gelsy: jpvt is a mutable lapack_int array; rcond is passed by value. */
+lapack_int LAPACKE_sgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, lapack_int* jpvt, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_dgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, lapack_int* jpvt,
+                           double rcond, lapack_int* rank );
+lapack_int LAPACKE_cgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_int* jpvt, float rcond,
+                           lapack_int* rank );
+lapack_int LAPACKE_zgelsy( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_int* jpvt, double rcond,
+                           lapack_int* rank );
+/* ?geqlf (s/d/c/z): a and tau are mutable and share one element type. */
+lapack_int LAPACKE_sgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqlf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+/* ?geqp3 (s/d/c/z): a, jpvt and tau are all non-const pointers. */
+lapack_int LAPACKE_sgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* jpvt,
+                           float* tau );
+lapack_int LAPACKE_dgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* jpvt,
+                           double* tau );
+lapack_int LAPACKE_cgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqp3( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_double* tau );
+/* ?geqpf: same parameter lists as the ?geqp3 declarations. */
+lapack_int LAPACKE_sgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* jpvt,
+                           float* tau );
+lapack_int LAPACKE_dgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* jpvt,
+                           double* tau );
+lapack_int LAPACKE_cgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqpf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* jpvt, lapack_complex_double* tau );
+/* ?geqr2 (s/d/c/z): takes m, n, a, lda and tau; no argument is const. */
+lapack_int LAPACKE_sgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqr2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+/* ?geqrf (s/d/c/z): a and tau are mutable and share one element type. */
+lapack_int LAPACKE_sgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+/* ?geqrfp: same parameter lists as the ?geqrf declarations. */
+lapack_int LAPACKE_sgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* tau );
+lapack_int LAPACKE_zgeqrfp( int matrix_order, lapack_int m, lapack_int n,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* tau );
+/* ?gerfs: a, af, ipiv and b are const; x, ferr and berr are mutable. */
+lapack_int LAPACKE_sgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zgerfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* ?gerfsx: a, af, ipiv, r, c and b are const; other pointers are not. */
+lapack_int LAPACKE_sgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* r,
+                            const float* c, const lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zgerfsx( int matrix_order, char trans, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* r,
+                            const double* c, const lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+/* LAPACKE_{s,d,c,z}gerqf: C prototypes for LAPACK xGERQF (RQ factorization, per the LAPACK reference docs). */
+lapack_int LAPACKE_sgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_cgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zgerqf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+/* LAPACKE_{s,d,c,z}gesdd: C prototypes for LAPACK xGESDD (divide-and-conquer SVD, per LAPACK docs); s is real in every variant. */
+lapack_int LAPACKE_sgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, float* a, lapack_int lda, float* s,
+                           float* u, lapack_int ldu, float* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_dgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, double* a, lapack_int lda, double* s,
+                           double* u, lapack_int ldu, double* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_cgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* vt,
+                           lapack_int ldvt );
+lapack_int LAPACKE_zgesdd( int matrix_order, char jobz, lapack_int m,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* vt,
+                           lapack_int ldvt );
+/* LAPACKE_{s,d,c,z}gesv and dsgesv/zcgesv: prototypes for LAPACK xGESV-style linear solvers (per LAPACK docs); the last two add x, ldx, iter. */
+lapack_int LAPACKE_sgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* a, lapack_int lda, lapack_int* ipiv, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* a, lapack_int lda, lapack_int* ipiv,
+                          double* b, lapack_int ldb );
+lapack_int LAPACKE_cgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dsgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                           double* a, lapack_int lda, lapack_int* ipiv,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           lapack_int* iter );
+lapack_int LAPACKE_zcgesv( int matrix_order, lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* iter );
+/* LAPACKE_{s,d,c,z}gesvd: C prototypes for LAPACK xGESVD (SVD, per LAPACK docs); s and superb are real in every variant. */
+lapack_int LAPACKE_sgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, float* a, lapack_int lda,
+                           float* s, float* u, lapack_int ldu, float* vt,
+                           lapack_int ldvt, float* superb );
+lapack_int LAPACKE_dgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* s, double* u, lapack_int ldu,
+                           double* vt, lapack_int ldvt, double* superb );
+lapack_int LAPACKE_cgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* vt,
+                           lapack_int ldvt, float* superb );
+lapack_int LAPACKE_zgesvd( int matrix_order, char jobu, char jobvt,
+                           lapack_int m, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* vt,
+                           lapack_int ldvt, double* superb );
+/* LAPACKE_{s,d}gesvj: C prototypes for LAPACK xGESVJ (Jacobi SVD, per LAPACK docs); only the real variants are declared here. */
+lapack_int LAPACKE_sgesvj( int matrix_order, char joba, char jobu, char jobv,
+                           lapack_int m, lapack_int n, float* a, lapack_int lda,
+                           float* sva, lapack_int mv, float* v, lapack_int ldv,
+                           float* stat );
+lapack_int LAPACKE_dgesvj( int matrix_order, char joba, char jobu, char jobv,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* sva, lapack_int mv,
+                           double* v, lapack_int ldv, double* stat );
+/* LAPACKE_{s,d,c,z}gesvx: C prototypes for LAPACK xGESVX (expert linear solver, per LAPACK docs); equed is passed by pointer. */
+lapack_int LAPACKE_sgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, float* a,
+                           lapack_int lda, float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* r, float* c,
+                           float* b, lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_dgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, double* a,
+                           lapack_int lda, double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* r, double* c,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+lapack_int LAPACKE_cgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* r, float* c,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr,
+                           float* rpivot );
+lapack_int LAPACKE_zgesvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* r, double* c,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr,
+                           double* rpivot );
+/* LAPACKE_{s,d,c,z}gesvxx: C prototypes for LAPACK xGESVXX (per LAPACK docs); take rpvgrw, n_err_bnds, err_bnds_norm/comp, nparams, params. */
+lapack_int LAPACKE_sgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* r, float* c,
+                            float* b, lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* r, double* c,
+                            double* b, lapack_int ldb, double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+lapack_int LAPACKE_cgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* r, float* c,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zgesvxx( int matrix_order, char fact, char trans,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* r, double* c,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+/* LAPACKE_{s,d,c,z}getf2: C prototypes for LAPACK xGETF2 (unblocked LU factorization, per LAPACK docs); ipiv is non-const. */
+lapack_int LAPACKE_sgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgetf2( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+/* LAPACKE_{s,d,c,z}getrf: C prototypes for LAPACK xGETRF (LU factorization with pivoting, per LAPACK docs); ipiv is non-const. */
+lapack_int LAPACKE_sgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zgetrf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+/* LAPACKE_{s,d,c,z}getri: C prototypes for LAPACK xGETRI (inverse from an LU factorization, per LAPACK docs); ipiv is const here. */
+lapack_int LAPACKE_sgetri( int matrix_order, lapack_int n, float* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dgetri( int matrix_order, lapack_int n, double* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_cgetri( int matrix_order, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zgetri( int matrix_order, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+/* LAPACKE_{s,d,c,z}getrs: C prototypes for LAPACK xGETRS (solve from an LU factorization, per LAPACK docs); a and ipiv are const, b is not. */
+lapack_int LAPACKE_sgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgetrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_{s,d,c,z}ggbak: C prototypes for LAPACK xGGBAK (eigenvector back-transformation, per LAPACK docs); lscale/rscale are const and real. */
+lapack_int LAPACKE_sggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* lscale,
+                           const float* rscale, lapack_int m, float* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_dggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* lscale,
+                           const double* rscale, lapack_int m, double* v,
+                           lapack_int ldv );
+lapack_int LAPACKE_cggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const float* lscale,
+                           const float* rscale, lapack_int m,
+                           lapack_complex_float* v, lapack_int ldv );
+lapack_int LAPACKE_zggbak( int matrix_order, char job, char side, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, const double* lscale,
+                           const double* rscale, lapack_int m,
+                           lapack_complex_double* v, lapack_int ldv );
+/* LAPACKE_{s,d,c,z}ggbal: C prototypes for LAPACK xGGBAL (balancing a matrix pair, per LAPACK docs); ilo, ihi, lscale, rscale are non-const. */
+lapack_int LAPACKE_sggbal( int matrix_order, char job, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale );
+lapack_int LAPACKE_dggbal( int matrix_order, char job, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale );
+lapack_int LAPACKE_cggbal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale );
+lapack_int LAPACKE_zggbal( int matrix_order, char job, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale );
+/* LAPACKE_{s,d,c,z}gges: C prototypes for LAPACK xGGES (generalized Schur form, per LAPACK docs); selctg is LAPACK_{S,D}_SELECT3 or LAPACK_{C,Z}_SELECT2. */
+lapack_int LAPACKE_sgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_S_SELECT3 selctg, lapack_int n, float* a,
+                          lapack_int lda, float* b, lapack_int ldb,
+                          lapack_int* sdim, float* alphar, float* alphai,
+                          float* beta, float* vsl, lapack_int ldvsl, float* vsr,
+                          lapack_int ldvsr );
+lapack_int LAPACKE_dgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_D_SELECT3 selctg, lapack_int n, double* a,
+                          lapack_int lda, double* b, lapack_int ldb,
+                          lapack_int* sdim, double* alphar, double* alphai,
+                          double* beta, double* vsl, lapack_int ldvsl,
+                          double* vsr, lapack_int ldvsr );
+lapack_int LAPACKE_cgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_C_SELECT2 selctg, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb,
+                          lapack_int* sdim, lapack_complex_float* alpha,
+                          lapack_complex_float* beta, lapack_complex_float* vsl,
+                          lapack_int ldvsl, lapack_complex_float* vsr,
+                          lapack_int ldvsr );
+lapack_int LAPACKE_zgges( int matrix_order, char jobvsl, char jobvsr, char sort,
+                          LAPACK_Z_SELECT2 selctg, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb,
+                          lapack_int* sdim, lapack_complex_double* alpha,
+                          lapack_complex_double* beta,
+                          lapack_complex_double* vsl, lapack_int ldvsl,
+                          lapack_complex_double* vsr, lapack_int ldvsr );
+/* LAPACKE_{s,d,c,z}ggesx: C prototypes for LAPACK xGGESX (expert generalized Schur driver, per LAPACK docs); takes sense, rconde, rcondv. */
+lapack_int LAPACKE_sggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_S_SELECT3 selctg, char sense,
+                           lapack_int n, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, lapack_int* sdim, float* alphar,
+                           float* alphai, float* beta, float* vsl,
+                           lapack_int ldvsl, float* vsr, lapack_int ldvsr,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_D_SELECT3 selctg, char sense,
+                           lapack_int n, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, lapack_int* sdim, double* alphar,
+                           double* alphai, double* beta, double* vsl,
+                           lapack_int ldvsl, double* vsr, lapack_int ldvsr,
+                           double* rconde, double* rcondv );
+lapack_int LAPACKE_cggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_C_SELECT2 selctg, char sense,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_int* sdim,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta,
+                           lapack_complex_float* vsl, lapack_int ldvsl,
+                           lapack_complex_float* vsr, lapack_int ldvsr,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_zggesx( int matrix_order, char jobvsl, char jobvsr,
+                           char sort, LAPACK_Z_SELECT2 selctg, char sense,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_int* sdim,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* vsl, lapack_int ldvsl,
+                           lapack_complex_double* vsr, lapack_int ldvsr,
+                           double* rconde, double* rcondv );
+/* LAPACKE_{s,d,c,z}ggev: C prototypes for LAPACK xGGEV (generalized eigenproblem, per LAPACK docs); real variants take alphar/alphai, complex take alpha. */
+lapack_int LAPACKE_sggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, float* a, lapack_int lda, float* b,
+                          lapack_int ldb, float* alphar, float* alphai,
+                          float* beta, float* vl, lapack_int ldvl, float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_dggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, double* a, lapack_int lda, double* b,
+                          lapack_int ldb, double* alphar, double* alphai,
+                          double* beta, double* vl, lapack_int ldvl, double* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_cggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* alpha,
+                          lapack_complex_float* beta, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr,
+                          lapack_int ldvr );
+lapack_int LAPACKE_zggev( int matrix_order, char jobvl, char jobvr,
+                          lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb, lapack_complex_double* alpha,
+                          lapack_complex_double* beta,
+                          lapack_complex_double* vl, lapack_int ldvl,
+                          lapack_complex_double* vr, lapack_int ldvr );
+/* LAPACKE_{s,d,c,z}ggevx: C prototypes for LAPACK xGGEVX (expert generalized eigenproblem driver, per LAPACK docs); scale/norm/rcond arrays are real. */
+lapack_int LAPACKE_sggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alphar, float* alphai, float* beta, float* vl,
+                           lapack_int ldvl, float* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, float* lscale,
+                           float* rscale, float* abnrm, float* bbnrm,
+                           float* rconde, float* rcondv );
+lapack_int LAPACKE_dggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double* alphar, double* alphai, double* beta,
+                           double* vl, lapack_int ldvl, double* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           double* lscale, double* rscale, double* abnrm,
+                           double* bbnrm, double* rconde, double* rcondv );
+lapack_int LAPACKE_cggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                           float* lscale, float* rscale, float* abnrm,
+                           float* bbnrm, float* rconde, float* rcondv );
+lapack_int LAPACKE_zggevx( int matrix_order, char balanc, char jobvl,
+                           char jobvr, char sense, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int* ilo, lapack_int* ihi, double* lscale,
+                           double* rscale, double* abnrm, double* bbnrm,
+                           double* rconde, double* rcondv );
+/* LAPACKE_{s,d,c,z}ggglm: C prototypes for LAPACK xGGGLM (Gauss-Markov linear model, per LAPACK docs); size arguments are ordered n, m, p. */
+lapack_int LAPACKE_sggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* d, float* x, float* y );
+lapack_int LAPACKE_dggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, double* d, double* x, double* y );
+lapack_int LAPACKE_cggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* d,
+                           lapack_complex_float* x, lapack_complex_float* y );
+lapack_int LAPACKE_zggglm( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* d,
+                           lapack_complex_double* x, lapack_complex_double* y );
+/* LAPACKE_{s,d,c,z}gghrd: C prototypes for LAPACK xGGHRD (generalized Hessenberg reduction, per LAPACK docs); ilo and ihi are passed by value. */
+lapack_int LAPACKE_sgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           float* a, lapack_int lda, float* b, lapack_int ldb,
+                           float* q, lapack_int ldq, float* z, lapack_int ldz );
+lapack_int LAPACKE_dgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           double* q, lapack_int ldq, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_cgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zgghrd( int matrix_order, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz );
+/* LAPACKE_{s,d,c,z}gglse: C prototypes for LAPACK xGGLSE (equality-constrained least squares, per LAPACK docs); size arguments are ordered m, n, p. */
+lapack_int LAPACKE_sgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* c, float* d, float* x );
+lapack_int LAPACKE_dgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, double* c, double* d, double* x );
+lapack_int LAPACKE_cgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* c,
+                           lapack_complex_float* d, lapack_complex_float* x );
+lapack_int LAPACKE_zgglse( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* c,
+                           lapack_complex_double* d, lapack_complex_double* x );
+/* LAPACKE_{s,d,c,z}ggqrf: C prototypes for LAPACK xGGQRF (generalized QR factorization, per LAPACK docs); size arguments are ordered n, m, p. */
+lapack_int LAPACKE_sggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, float* a, lapack_int lda, float* taua,
+                           float* b, lapack_int ldb, float* taub );
+lapack_int LAPACKE_dggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, double* a, lapack_int lda,
+                           double* taua, double* b, lapack_int ldb,
+                           double* taub );
+lapack_int LAPACKE_cggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* taua,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* taub );
+lapack_int LAPACKE_zggqrf( int matrix_order, lapack_int n, lapack_int m,
+                           lapack_int p, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* taua,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* taub );
+/* LAPACKE_{s,d,c,z}ggrqf: C prototypes for LAPACK xGGRQF (generalized RQ factorization, per LAPACK docs); size arguments are ordered m, p, n. */
+lapack_int LAPACKE_sggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, float* a, lapack_int lda, float* taua,
+                           float* b, lapack_int ldb, float* taub );
+lapack_int LAPACKE_dggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, double* a, lapack_int lda,
+                           double* taua, double* b, lapack_int ldb,
+                           double* taub );
+lapack_int LAPACKE_cggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* taua,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* taub );
+lapack_int LAPACKE_zggrqf( int matrix_order, lapack_int m, lapack_int p,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* taua,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* taub );
+/* LAPACKE_{s,d,c,z}ggsvd: C prototypes for LAPACK xGGSVD (generalized SVD, per LAPACK docs); alpha/beta are real in every variant, k and l are pointers. */
+lapack_int LAPACKE_sggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alpha, float* beta, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, float* q, lapack_int ldq,
+                           lapack_int* iwork );
+lapack_int LAPACKE_dggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double* alpha, double* beta, double* u,
+                           lapack_int ldu, double* v, lapack_int ldv, double* q,
+                           lapack_int ldq, lapack_int* iwork );
+lapack_int LAPACKE_cggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           float* alpha, float* beta, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* v,
+                           lapack_int ldv, lapack_complex_float* q,
+                           lapack_int ldq, lapack_int* iwork );
+lapack_int LAPACKE_zggsvd( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double* alpha, double* beta,
+                           lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int* iwork );
+/* LAPACKE_?ggsvp -- LAPACK ?GGSVP: preprocessing step for the generalized SVD. */
+lapack_int LAPACKE_sggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l, float* u,
+                           lapack_int ldu, float* v, lapack_int ldv, float* q,
+                           lapack_int ldq ); /* note: dims are passed as (m, p, n) */
+lapack_int LAPACKE_dggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, double* u, lapack_int ldu, double* v,
+                           lapack_int ldv, double* q, lapack_int ldq );
+lapack_int LAPACKE_cggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l,
+                           lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zggsvp( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* v,
+                           lapack_int ldv, lapack_complex_double* q,
+                           lapack_int ldq ); /* tola/tolb are real scalars passed by value */
+/* LAPACKE_?gtcon -- LAPACK ?GTCON: rcond estimate for a factored tridiagonal matrix. */
+lapack_int LAPACKE_sgtcon( char norm, lapack_int n, const float* dl,
+                           const float* d, const float* du, const float* du2,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_dgtcon( char norm, lapack_int n, const double* dl,
+                           const double* d, const double* du, const double* du2,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond ); /* the ?gtcon family takes no matrix_order argument */
+lapack_int LAPACKE_cgtcon( char norm, lapack_int n,
+                           const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zgtcon( char norm, lapack_int n,
+                           const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond ); /* rcond is the only non-const pointer */
+/* LAPACKE_?gtrfs -- LAPACK ?GTRFS: iterative refinement of x for a tridiagonal system. */
+lapack_int LAPACKE_sgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* dl, const float* d,
+                           const float* du, const float* dlf, const float* df,
+                           const float* duf, const float* du2,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr ); /* only x, ferr, berr are non-const */
+lapack_int LAPACKE_dgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* dl, const double* d,
+                           const double* du, const double* dlf,
+                           const double* df, const double* duf,
+                           const double* du2, const lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_cgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* dlf,
+                           const lapack_complex_float* df,
+                           const lapack_complex_float* duf,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr ); /* ferr/berr are real (float*) */
+lapack_int LAPACKE_zgtrfs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* dlf,
+                           const lapack_complex_double* df,
+                           const lapack_complex_double* duf,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* LAPACKE_?gtsv -- LAPACK ?GTSV: solve a general tridiagonal linear system. */
+lapack_int LAPACKE_sgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* dl, float* d, float* du, float* b,
+                          lapack_int ldb ); /* dl, d, du, b are all non-const */
+lapack_int LAPACKE_dgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* dl, double* d, double* du, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* dl, lapack_complex_float* d,
+                          lapack_complex_float* du, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zgtsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* dl, lapack_complex_double* d,
+                          lapack_complex_double* du, lapack_complex_double* b,
+                          lapack_int ldb );
+/* LAPACKE_?gtsvx -- LAPACK ?GTSVX: expert driver for a general tridiagonal system. */
+lapack_int LAPACKE_sgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, const float* dl,
+                           const float* d, const float* du, float* dlf,
+                           float* df, float* duf, float* du2, lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr ); /* dl/d/du/b are const; dlf/df/duf/du2/ipiv are not */
+lapack_int LAPACKE_dgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs, const double* dl,
+                           const double* d, const double* du, double* dlf,
+                           double* df, double* duf, double* du2,
+                           lapack_int* ipiv, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           lapack_complex_float* dlf, lapack_complex_float* df,
+                           lapack_complex_float* duf, lapack_complex_float* du2,
+                           lapack_int* ipiv, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr ); /* rcond/ferr/berr are real (float*) */
+lapack_int LAPACKE_zgtsvx( int matrix_order, char fact, char trans,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           lapack_complex_double* dlf,
+                           lapack_complex_double* df,
+                           lapack_complex_double* duf,
+                           lapack_complex_double* du2, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+/* LAPACKE_?gttrf -- LAPACK ?GTTRF: LU factorization of a general tridiagonal matrix. */
+lapack_int LAPACKE_sgttrf( lapack_int n, float* dl, float* d, float* du,
+                           float* du2, lapack_int* ipiv ); /* no matrix_order argument */
+lapack_int LAPACKE_dgttrf( lapack_int n, double* dl, double* d, double* du,
+                           double* du2, lapack_int* ipiv );
+lapack_int LAPACKE_cgttrf( lapack_int n, lapack_complex_float* dl,
+                           lapack_complex_float* d, lapack_complex_float* du,
+                           lapack_complex_float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_zgttrf( lapack_int n, lapack_complex_double* dl,
+                           lapack_complex_double* d, lapack_complex_double* du,
+                           lapack_complex_double* du2, lapack_int* ipiv );
+/* LAPACKE_?gttrs -- LAPACK ?GTTRS: solve a tridiagonal system from its ?gttrf factors. */
+lapack_int LAPACKE_sgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const float* dl, const float* d,
+                           const float* du, const float* du2,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const double* dl, const double* d,
+                           const double* du, const double* du2,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_cgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* dl,
+                           const lapack_complex_float* d,
+                           const lapack_complex_float* du,
+                           const lapack_complex_float* du2,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb ); /* b is the only non-const pointer */
+lapack_int LAPACKE_zgttrs( int matrix_order, char trans, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* dl,
+                           const lapack_complex_double* d,
+                           const lapack_complex_double* du,
+                           const lapack_complex_double* du2,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+/* LAPACKE_?hbev -- LAPACK ?HBEV: eigenvalues/vectors of a Hermitian band matrix ab. */
+lapack_int LAPACKE_chbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, lapack_complex_float* ab,
+                          lapack_int ldab, float* w, lapack_complex_float* z,
+                          lapack_int ldz ); /* w is real (float*), z is complex */
+lapack_int LAPACKE_zhbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, lapack_complex_double* ab,
+                          lapack_int ldab, double* w, lapack_complex_double* z,
+                          lapack_int ldz );
+/* LAPACKE_?hbevd -- LAPACK ?HBEVD: Hermitian band eigenproblem, divide-and-conquer. */
+lapack_int LAPACKE_chbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab, float* w, lapack_complex_float* z,
+                           lapack_int ldz ); /* same parameter list as LAPACKE_chbev */
+lapack_int LAPACKE_zhbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab, double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+/* LAPACKE_?hbevx -- LAPACK ?HBEVX: selected eigenvalues/vectors, Hermitian band. */
+lapack_int LAPACKE_chbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* q, lapack_int ldq, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail ); /* m and ifail are written through pointers */
+/* LAPACKE_?hbgst -- LAPACK ?HBGST: reduce a banded generalized problem to standard form. */
+lapack_int LAPACKE_chbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* bb, lapack_int ldbb,
+                           lapack_complex_float* x, lapack_int ldx ); /* bb is const here */
+lapack_int LAPACKE_zhbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* bb, lapack_int ldbb,
+                           lapack_complex_double* x, lapack_int ldx );
+/* LAPACKE_?hbgv -- LAPACK ?HBGV: generalized eigenproblem for a banded pair (ab, bb). */
+lapack_int LAPACKE_chbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_complex_float* bb, lapack_int ldbb, float* w,
+                          lapack_complex_float* z, lapack_int ldz ); /* ab and bb are both non-const */
+lapack_int LAPACKE_zhbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_complex_double* bb, lapack_int ldbb, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+/* LAPACKE_?hbgvd -- LAPACK ?HBGVD: banded generalized eigenproblem, divide-and-conquer. */
+lapack_int LAPACKE_chbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* bb, lapack_int ldbb, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* bb, lapack_int ldbb,
+                           double* w, lapack_complex_double* z,
+                           lapack_int ldz ); /* same parameter list as LAPACKE_zhbgv */
+/* LAPACKE_?hbgvx -- LAPACK ?HBGVX: selected eigenpairs of a banded generalized problem. */
+lapack_int LAPACKE_chbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* bb, lapack_int ldbb,
+                           lapack_complex_float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* bb, lapack_int ldbb,
+                           lapack_complex_double* q, lapack_int ldq, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail ); /* vl/vu/abstol are real scalars passed by value */
+/* LAPACKE_?hbtrd -- LAPACK ?HBTRD: reduce a Hermitian band matrix to tridiagonal form. */
+lapack_int LAPACKE_chbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab, float* d, float* e,
+                           lapack_complex_float* q, lapack_int ldq ); /* d and e are real (float*) */
+lapack_int LAPACKE_zhbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab, double* d, double* e,
+                           lapack_complex_double* q, lapack_int ldq );
+/* LAPACKE_?hecon -- LAPACK ?HECON: rcond estimate for a factored Hermitian matrix. */
+lapack_int LAPACKE_checon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zhecon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond ); /* a and ipiv are const; rcond is written */
+/* LAPACKE_?heequb -- LAPACK ?HEEQUB: scale factors s to equilibrate a Hermitian matrix. */
+lapack_int LAPACKE_cheequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax ); /* a is const; s, scond, amax are real */
+lapack_int LAPACKE_zheequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+/* LAPACKE_?heev -- LAPACK ?HEEV: eigenvalues (w) and optionally vectors of Hermitian a. */
+lapack_int LAPACKE_cheev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_zheev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, double* w );
+/* LAPACKE_?heevd -- LAPACK ?HEEVD: Hermitian eigenproblem, divide-and-conquer variant. */
+lapack_int LAPACKE_cheevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_zheevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           double* w ); /* w is real (double*) */
+/* LAPACKE_?heevr -- LAPACK ?HEEVR: selected eigenpairs of Hermitian a (RRR algorithm). */
+lapack_int LAPACKE_cheevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* isuppz ); /* last argument is isuppz (LAPACKE_cheevx has ifail) */
+lapack_int LAPACKE_zheevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* isuppz );
+/* LAPACKE_?heevx -- LAPACK ?HEEVX: selected eigenvalues/vectors of Hermitian a. */
+lapack_int LAPACKE_cheevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifail ); /* last argument is ifail (LAPACKE_cheevr has isuppz) */
+lapack_int LAPACKE_zheevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+/* LAPACKE_?hegst -- LAPACK ?HEGST: reduce a generalized Hermitian problem to standard form. */
+lapack_int LAPACKE_chegst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* b,
+                           lapack_int ldb ); /* a is modified in place; b is const */
+lapack_int LAPACKE_zhegst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* b,
+                           lapack_int ldb );
+/* LAPACKE_?hegv -- LAPACK ?HEGV: generalized Hermitian-definite eigenproblem for (a, b). */
+lapack_int LAPACKE_chegv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb, float* w ); /* itype is an integer selector; w is real */
+lapack_int LAPACKE_zhegv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb, double* w );
+/* LAPACKE_?hegvd -- LAPACK ?HEGVD: generalized Hermitian eigenproblem, divide-and-conquer. */
+lapack_int LAPACKE_chegvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float* w ); /* same parameter list as LAPACKE_chegv */
+lapack_int LAPACKE_zhegvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double* w );
+/* LAPACKE_?hegvx -- LAPACK ?HEGVX: selected eigenpairs of a generalized Hermitian problem. */
+lapack_int LAPACKE_chegvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhegvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail ); /* m and ifail are written through pointers */
+/* LAPACKE_?herfs -- LAPACK ?HERFS: iterative refinement of x for a Hermitian system. */
+lapack_int LAPACKE_cherfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr ); /* a, af, ipiv, b are const; x, ferr, berr are not */
+lapack_int LAPACKE_zherfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* LAPACKE_?herfsx -- LAPACK ?HERFSX: extra-precise refinement with error-bound arrays. */
+lapack_int LAPACKE_cherfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params ); /* equed is passed by value here */
+lapack_int LAPACKE_zherfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+/* LAPACKE_?hesv -- LAPACK ?HESV: solve a Hermitian indefinite linear system. */
+lapack_int LAPACKE_chesv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_float* b, lapack_int ldb ); /* a, ipiv, b are all non-const */
+lapack_int LAPACKE_zhesv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?hesvx -- LAPACK ?HESVX: expert driver for a Hermitian indefinite system. */
+lapack_int LAPACKE_chesvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr ); /* a, b const; af, ipiv non-const */
+lapack_int LAPACKE_zhesvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+/* LAPACKE_?hesvxx -- LAPACK ?HESVXX: extra-precise expert driver, Hermitian system. */
+lapack_int LAPACKE_chesvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params ); /* unlike LAPACKE_chesvx, a and b are non-const and equed is char* */
+lapack_int LAPACKE_zhesvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+/* LAPACKE_?hetrd -- LAPACK ?HETRD: reduce Hermitian a to real tridiagonal form (d, e). */
+lapack_int LAPACKE_chetrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda, float* d,
+                           float* e, lapack_complex_float* tau ); /* d, e real; tau complex */
+lapack_int LAPACKE_zhetrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda, double* d,
+                           double* e, lapack_complex_double* tau );
+/* LAPACKE_?hetrf -- LAPACK ?HETRF: Bunch-Kaufman factorization of Hermitian a. */
+lapack_int LAPACKE_chetrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv ); /* ipiv is non-const here (const in ?hetri / ?hetrs) */
+lapack_int LAPACKE_zhetrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+/* LAPACKE_?hetri -- LAPACK ?HETRI: inverse of Hermitian a from its ?hetrf factors. */
+lapack_int LAPACKE_chetri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv ); /* a is modified in place; ipiv is const */
+lapack_int LAPACKE_zhetri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+/* LAPACKE_?hetrs -- LAPACK ?HETRS: solve a Hermitian system from its ?hetrf factors. */
+lapack_int LAPACKE_chetrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb ); /* a, ipiv const; b non-const */
+lapack_int LAPACKE_zhetrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_chfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, float alpha,
+                          const lapack_complex_float* a, lapack_int lda,
+                          float beta, lapack_complex_float* c );
+lapack_int LAPACKE_zhfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, double alpha,
+                          const lapack_complex_double* a, lapack_int lda,
+                          double beta, lapack_complex_double* c );
+
+lapack_int LAPACKE_shgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           float* h, lapack_int ldh, float* t, lapack_int ldt,
+                           float* alphar, float* alphai, float* beta, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz );
+lapack_int LAPACKE_dhgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           double* h, lapack_int ldh, double* t, lapack_int ldt,
+                           double* alphar, double* alphai, double* beta,
+                           double* q, lapack_int ldq, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_chgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* q,
+                           lapack_int ldq, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhgeqz( int matrix_order, char job, char compq, char compz,
+                           lapack_int n, lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zhpcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+
+lapack_int LAPACKE_chpev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* ap, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* ap, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_float* ap, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_complex_double* ap, double* w,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_float* ap, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_zhpevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_complex_double* ap, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chpgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_float* ap,
+                           const lapack_complex_float* bp );
+lapack_int LAPACKE_zhpgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, lapack_complex_double* ap,
+                           const lapack_complex_double* bp );
+
+lapack_int LAPACKE_chpgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_float* ap,
+                          lapack_complex_float* bp, float* w,
+                          lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, lapack_complex_double* ap,
+                          lapack_complex_double* bp, double* w,
+                          lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_float* ap,
+                           lapack_complex_float* bp, float* w,
+                           lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zhpgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, lapack_complex_double* ap,
+                           lapack_complex_double* bp, double* w,
+                           lapack_complex_double* z, lapack_int ldz );
+
+lapack_int LAPACKE_chpgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_complex_float* bp,
+                           float vl, float vu, lapack_int il, lapack_int iu,
+                           float abstol, lapack_int* m, float* w,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_zhpgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_complex_double* bp,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifail );
+
+lapack_int LAPACKE_chprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zhprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+
+lapack_int LAPACKE_chpsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zhpsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+
+lapack_int LAPACKE_chpsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* afp, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zhpsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* afp, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+
+lapack_int LAPACKE_chptrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, float* d, float* e,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zhptrd( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, double* d, double* e,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_chptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zhptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_chptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_zhptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, const lapack_int* ipiv );
+
+lapack_int LAPACKE_chptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zhptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_shsein( int matrix_order, char job, char eigsrc, char initv,
+                           lapack_logical* select, lapack_int n, const float* h,
+                           lapack_int ldh, float* wr, const float* wi,
+                           float* vl, lapack_int ldvl, float* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_dhsein( int matrix_order, char job, char eigsrc, char initv,
+                           lapack_logical* select, lapack_int n,
+                           const double* h, lapack_int ldh, double* wr,
+                           const double* wi, double* vl, lapack_int ldvl,
+                           double* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m, lapack_int* ifaill,
+                           lapack_int* ifailr );
+lapack_int LAPACKE_chsein( int matrix_order, char job, char eigsrc, char initv,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* w, lapack_complex_float* vl,
+                           lapack_int ldvl, lapack_complex_float* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_zhsein( int matrix_order, char job, char eigsrc, char initv,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* w, lapack_complex_double* vl,
+                           lapack_int ldvl, lapack_complex_double* vr,
+                           lapack_int ldvr, lapack_int mm, lapack_int* m,
+                           lapack_int* ifaill, lapack_int* ifailr );
+
+lapack_int LAPACKE_shseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, float* h,
+                           lapack_int ldh, float* wr, float* wi, float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_dhseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi, double* h,
+                           lapack_int ldh, double* wr, double* wi, double* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_chseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi,
+                           lapack_complex_float* h, lapack_int ldh,
+                           lapack_complex_float* w, lapack_complex_float* z,
+                           lapack_int ldz );
+lapack_int LAPACKE_zhseqr( int matrix_order, char job, char compz, lapack_int n,
+                           lapack_int ilo, lapack_int ihi,
+                           lapack_complex_double* h, lapack_int ldh,
+                           lapack_complex_double* w, lapack_complex_double* z,
+                           lapack_int ldz );
+
+lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x,
+                           lapack_int incx );
+lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x,
+                           lapack_int incx );
+
+lapack_int LAPACKE_slacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dlacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const double* a, lapack_int lda,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_clacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zlacpy( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb );
+
+lapack_int LAPACKE_zlag2c( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_float* sa, lapack_int ldsa );
+
+lapack_int LAPACKE_slag2d( int matrix_order, lapack_int m, lapack_int n,
+                           const float* sa, lapack_int ldsa, double* a,
+                           lapack_int lda );
+
+lapack_int LAPACKE_dlag2s( int matrix_order, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, float* sa,
+                           lapack_int ldsa );
+
+lapack_int LAPACKE_clag2z( int matrix_order, lapack_int m, lapack_int n,
+                           const lapack_complex_float* sa, lapack_int ldsa,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* d,
+                           float* a, lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_dlagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* d,
+                           double* a, lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_clagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const float* d,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_zlagge( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int kl, lapack_int ku, const double* d,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* iseed );
+
+float LAPACKE_slamch( char cmach );
+double LAPACKE_dlamch( char cmach );
+
+float LAPACKE_slange( int matrix_order, char norm, lapack_int m,
+                      lapack_int n, const float* a, lapack_int lda );
+double LAPACKE_dlange( int matrix_order, char norm, lapack_int m,
+                       lapack_int n, const double* a, lapack_int lda );
+float LAPACKE_clange( int matrix_order, char norm, lapack_int m,
+                      lapack_int n, const lapack_complex_float* a,
+                      lapack_int lda );
+double LAPACKE_zlange( int matrix_order, char norm, lapack_int m,
+                       lapack_int n, const lapack_complex_double* a,
+                       lapack_int lda );
+
+float LAPACKE_clanhe( int matrix_order, char norm, char uplo, lapack_int n,
+                      const lapack_complex_float* a, lapack_int lda );
+double LAPACKE_zlanhe( int matrix_order, char norm, char uplo, lapack_int n,
+                       const lapack_complex_double* a, lapack_int lda );
+
+float LAPACKE_slansy( int matrix_order, char norm, char uplo, lapack_int n,
+                      const float* a, lapack_int lda );
+double LAPACKE_dlansy( int matrix_order, char norm, char uplo, lapack_int n,
+                       const double* a, lapack_int lda );
+float LAPACKE_clansy( int matrix_order, char norm, char uplo, lapack_int n,
+                      const lapack_complex_float* a, lapack_int lda );
+double LAPACKE_zlansy( int matrix_order, char norm, char uplo, lapack_int n,
+                       const lapack_complex_double* a, lapack_int lda );
+
+float LAPACKE_slantr( int matrix_order, char norm, char uplo, char diag,
+                      lapack_int m, lapack_int n, const float* a,
+                      lapack_int lda );
+double LAPACKE_dlantr( int matrix_order, char norm, char uplo, char diag,
+                       lapack_int m, lapack_int n, const double* a,
+                       lapack_int lda );
+float LAPACKE_clantr( int matrix_order, char norm, char uplo, char diag,
+                      lapack_int m, lapack_int n,
+                      const lapack_complex_float* a, lapack_int lda );
+double LAPACKE_zlantr( int matrix_order, char norm, char uplo, char diag,
+                       lapack_int m, lapack_int n,
+                       const lapack_complex_double* a, lapack_int lda );
+
+
+lapack_int LAPACKE_slarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const float* v, lapack_int ldv,
+                           const float* t, lapack_int ldt, float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_dlarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const double* v, lapack_int ldv,
+                           const double* t, lapack_int ldt, double* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_clarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const lapack_complex_float* v,
+                           lapack_int ldv, const lapack_complex_float* t,
+                           lapack_int ldt, lapack_complex_float* c,
+                           lapack_int ldc );
+lapack_int LAPACKE_zlarfb( int matrix_order, char side, char trans, char direct,
+                           char storev, lapack_int m, lapack_int n,
+                           lapack_int k, const lapack_complex_double* v,
+                           lapack_int ldv, const lapack_complex_double* t,
+                           lapack_int ldt, lapack_complex_double* c,
+                           lapack_int ldc );
+
+lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x,
+                           lapack_int incx, float* tau );
+lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x,
+                           lapack_int incx, double* tau );
+lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha,
+                           lapack_complex_float* x, lapack_int incx,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha,
+                           lapack_complex_double* x, lapack_int incx,
+                           lapack_complex_double* tau );
+
+lapack_int LAPACKE_slarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k, const float* v,
+                           lapack_int ldv, const float* tau, float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_dlarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k, const double* v,
+                           lapack_int ldv, const double* tau, double* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_clarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k,
+                           const lapack_complex_float* v, lapack_int ldv,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zlarft( int matrix_order, char direct, char storev,
+                           lapack_int n, lapack_int k,
+                           const lapack_complex_double* v, lapack_int ldv,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_slarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const float* v, float tau, float* c,
+                           lapack_int ldc, float* work );
+lapack_int LAPACKE_dlarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const double* v, double tau, double* c,
+                           lapack_int ldc, double* work );
+lapack_int LAPACKE_clarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const lapack_complex_float* v,
+                           lapack_complex_float tau, lapack_complex_float* c,
+                           lapack_int ldc, lapack_complex_float* work );
+lapack_int LAPACKE_zlarfx( int matrix_order, char side, lapack_int m,
+                           lapack_int n, const lapack_complex_double* v,
+                           lapack_complex_double tau, lapack_complex_double* c,
+                           lapack_int ldc, lapack_complex_double* work );
+
+lapack_int LAPACKE_slarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           float* x );
+lapack_int LAPACKE_dlarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           double* x );
+lapack_int LAPACKE_clarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           lapack_complex_float* x );
+lapack_int LAPACKE_zlarnv( lapack_int idist, lapack_int* iseed, lapack_int n,
+                           lapack_complex_double* x );
+
+lapack_int LAPACKE_slaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, float alpha, float beta, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, double alpha, double beta, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_claset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, lapack_complex_float alpha,
+                           lapack_complex_float beta, lapack_complex_float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_zlaset( int matrix_order, char uplo, lapack_int m,
+                           lapack_int n, lapack_complex_double alpha,
+                           lapack_complex_double beta, lapack_complex_double* a,
+                           lapack_int lda );
+
+lapack_int LAPACKE_slasrt( char id, lapack_int n, float* d );
+lapack_int LAPACKE_dlasrt( char id, lapack_int n, double* d );
+
+lapack_int LAPACKE_slaswp( int matrix_order, lapack_int n, float* a,
+                           lapack_int lda, lapack_int k1, lapack_int k2,
+                           const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_dlaswp( int matrix_order, lapack_int n, double* a,
+                           lapack_int lda, lapack_int k1, lapack_int k2,
+                           const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_claswp( int matrix_order, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int k1, lapack_int k2, const lapack_int* ipiv,
+                           lapack_int incx );
+lapack_int LAPACKE_zlaswp( int matrix_order, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int k1, lapack_int k2, const lapack_int* ipiv,
+                           lapack_int incx );
+
+lapack_int LAPACKE_slatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, float* d,
+                           lapack_int mode, float cond, float dmax,
+                           lapack_int kl, lapack_int ku, char pack, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, double* d,
+                           lapack_int mode, double cond, double dmax,
+                           lapack_int kl, lapack_int ku, char pack, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_clatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, float* d,
+                           lapack_int mode, float cond, float dmax,
+                           lapack_int kl, lapack_int ku, char pack,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlatms( int matrix_order, lapack_int m, lapack_int n,
+                           char dist, lapack_int* iseed, char sym, double* d,
+                           lapack_int mode, double cond, double dmax,
+                           lapack_int kl, lapack_int ku, char pack,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slauum( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dlauum( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_clauum( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlauum( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_sopgtr( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, const float* tau, float* q,
+                           lapack_int ldq );
+lapack_int LAPACKE_dopgtr( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, const double* tau, double* q,
+                           lapack_int ldq );
+
+lapack_int LAPACKE_sopmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const float* ap,
+                           const float* tau, float* c, lapack_int ldc );
+lapack_int LAPACKE_dopmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const double* ap,
+                           const double* tau, double* c, lapack_int ldc );
+
+lapack_int LAPACKE_sorgbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, double* a,
+                           lapack_int lda, const double* tau );
+
+lapack_int LAPACKE_sorghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, float* a, lapack_int lda,
+                           const float* tau );
+lapack_int LAPACKE_dorgrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, double* a, lapack_int lda,
+                           const double* tau );
+
+lapack_int LAPACKE_sorgtr( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, const float* tau );
+lapack_int LAPACKE_dorgtr( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, const double* tau );
+
+lapack_int LAPACKE_sormbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMBR (confirm) */
+lapack_int LAPACKE_dormbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc ); /* double twin of LAPACKE_sormbr */
+
+lapack_int LAPACKE_sormhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const float* a, lapack_int lda,
+                           const float* tau, float* c, lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMHR (confirm) */
+lapack_int LAPACKE_dormhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const double* a, lapack_int lda,
+                           const double* tau, double* c, lapack_int ldc ); /* double twin of LAPACKE_sormhr */
+
+lapack_int LAPACKE_sormlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMLQ (confirm) */
+lapack_int LAPACKE_dormlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc ); /* double twin of LAPACKE_sormlq */
+
+lapack_int LAPACKE_sormql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMQL (confirm) */
+lapack_int LAPACKE_dormql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc ); /* double twin of LAPACKE_sormql */
+
+lapack_int LAPACKE_sormqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMQR (confirm) */
+lapack_int LAPACKE_dormqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc ); /* double twin of LAPACKE_sormqr */
+
+lapack_int LAPACKE_sormrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const float* a, lapack_int lda, const float* tau,
+                           float* c, lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMRQ (confirm) */
+lapack_int LAPACKE_dormrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const double* a, lapack_int lda, const double* tau,
+                           double* c, lapack_int ldc ); /* double twin of LAPACKE_sormrq */
+
+lapack_int LAPACKE_sormrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const float* a, lapack_int lda,
+                           const float* tau, float* c, lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMRZ (confirm) */
+lapack_int LAPACKE_dormrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const double* a, lapack_int lda,
+                           const double* tau, double* c, lapack_int ldc ); /* double twin of LAPACKE_sormrz */
+
+lapack_int LAPACKE_sormtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda, const float* tau, float* c,
+                           lapack_int ldc ); /* float; a, tau const, c writable; likely wraps SORMTR (confirm) */
+lapack_int LAPACKE_dormtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda, const double* tau, double* c,
+                           lapack_int ldc ); /* double twin of LAPACKE_sormtr */
+
+lapack_int LAPACKE_spbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const float* ab, lapack_int ldab,
+                           float anorm, float* rcond ); /* float; ab const, rcond writable; likely wraps SPBCON (confirm) */
+lapack_int LAPACKE_dpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const double* ab, lapack_int ldab,
+                           double anorm, double* rcond ); /* double twin of LAPACKE_spbcon */
+lapack_int LAPACKE_cpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_float* ab,
+                           lapack_int ldab, float anorm, float* rcond ); /* lapack_complex_float ab; anorm, rcond stay float */
+lapack_int LAPACKE_zpbcon( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_double* ab,
+                           lapack_int ldab, double anorm, double* rcond ); /* lapack_complex_double ab; anorm, rcond stay double */
+
+lapack_int LAPACKE_spbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const float* ab, lapack_int ldab,
+                           float* s, float* scond, float* amax ); /* float; ab const; s, scond, amax writable; likely wraps SPBEQU (confirm) */
+lapack_int LAPACKE_dpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const double* ab, lapack_int ldab,
+                           double* s, double* scond, double* amax ); /* double twin of LAPACKE_spbequ */
+lapack_int LAPACKE_cpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_float* ab,
+                           lapack_int ldab, float* s, float* scond,
+                           float* amax ); /* lapack_complex_float ab; s, scond, amax stay float */
+lapack_int LAPACKE_zpbequ( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, const lapack_complex_double* ab,
+                           lapack_int ldab, double* s, double* scond,
+                           double* amax ); /* lapack_complex_double ab; s, scond, amax stay double */
+
+lapack_int LAPACKE_spbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const float* ab,
+                           lapack_int ldab, const float* afb, lapack_int ldafb,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr ); /* float; ab, afb, b const; x, ferr, berr writable; likely wraps SPBRFS (confirm) */
+lapack_int LAPACKE_dpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const double* ab,
+                           lapack_int ldab, const double* afb, lapack_int ldafb,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr ); /* double twin of LAPACKE_spbrfs */
+lapack_int LAPACKE_cpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* afb, lapack_int ldafb,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr ); /* lapack_complex_float ab/afb/b/x; ferr, berr stay float */
+lapack_int LAPACKE_zpbrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* afb, lapack_int ldafb,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr ); /* lapack_complex_double ab/afb/b/x; ferr, berr stay double */
+
+lapack_int LAPACKE_spbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, float* bb, lapack_int ldbb ); /* float; bb writable; likely wraps SPBSTF (confirm) */
+lapack_int LAPACKE_dpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, double* bb, lapack_int ldbb ); /* double twin of LAPACKE_spbstf */
+lapack_int LAPACKE_cpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, lapack_complex_float* bb,
+                           lapack_int ldbb ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpbstf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kb, lapack_complex_double* bb,
+                           lapack_int ldbb ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs, float* ab,
+                          lapack_int ldab, float* b, lapack_int ldb ); /* float; ab and b writable; likely wraps SPBSV (confirm) */
+lapack_int LAPACKE_dpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs, double* ab,
+                          lapack_int ldab, double* b, lapack_int ldb ); /* double twin of LAPACKE_spbsv */
+lapack_int LAPACKE_cpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs,
+                          lapack_complex_float* ab, lapack_int ldab,
+                          lapack_complex_float* b, lapack_int ldb ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpbsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int kd, lapack_int nrhs,
+                          lapack_complex_double* ab, lapack_int ldab,
+                          lapack_complex_double* b, lapack_int ldb ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, float* ab,
+                           lapack_int ldab, float* afb, lapack_int ldafb,
+                           char* equed, float* s, float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr ); /* float; every pointer is non-const; likely wraps SPBSVX (confirm) */
+lapack_int LAPACKE_dpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, double* ab,
+                           lapack_int ldab, double* afb, lapack_int ldafb,
+                           char* equed, double* s, double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr ); /* double twin of LAPACKE_spbsvx */
+lapack_int LAPACKE_cpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* afb, lapack_int ldafb,
+                           char* equed, float* s, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr ); /* lapack_complex_float ab/afb/b/x; s, rcond, ferr, berr stay float */
+lapack_int LAPACKE_zpbsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* afb, lapack_int ldafb,
+                           char* equed, double* s, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr ); /* lapack_complex_double ab/afb/b/x; s, rcond, ferr, berr stay double */
+
+lapack_int LAPACKE_spbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab ); /* float; ab writable; likely wraps SPBTRF (confirm) */
+lapack_int LAPACKE_dpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab ); /* double twin of LAPACKE_spbtrf */
+lapack_int LAPACKE_cpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_float* ab,
+                           lapack_int ldab ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpbtrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_complex_double* ab,
+                           lapack_int ldab ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const float* ab,
+                           lapack_int ldab, float* b, lapack_int ldb ); /* float; ab const, b writable; likely wraps SPBTRS (confirm) */
+lapack_int LAPACKE_dpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs, const double* ab,
+                           lapack_int ldab, double* b, lapack_int ldb ); /* double twin of LAPACKE_spbtrs */
+lapack_int LAPACKE_cpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* b, lapack_int ldb ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpbtrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* b, lapack_int ldb ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, float* a ); /* float; a writable, no leading-dimension argument; likely wraps SPFTRF (confirm) */
+lapack_int LAPACKE_dpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, double* a ); /* double twin of LAPACKE_spftrf */
+lapack_int LAPACKE_cpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_float* a ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpftrf( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_double* a ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, float* a ); /* float; a writable, no leading-dimension argument; likely wraps SPFTRI (confirm) */
+lapack_int LAPACKE_dpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, double* a ); /* double twin of LAPACKE_spftri */
+lapack_int LAPACKE_cpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_float* a ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpftri( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_complex_double* a ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           float* b, lapack_int ldb ); /* float; a const, b writable; likely wraps SPFTRS (confirm) */
+lapack_int LAPACKE_dpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           double* b, lapack_int ldb ); /* double twin of LAPACKE_spftrs */
+lapack_int LAPACKE_cpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a,
+                           lapack_complex_float* b, lapack_int ldb ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpftrs( int matrix_order, char transr, char uplo,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a,
+                           lapack_complex_double* b, lapack_int ldb ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spocon( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda, float anorm,
+                           float* rcond ); /* float; a const, rcond writable; likely wraps SPOCON (confirm) */
+lapack_int LAPACKE_dpocon( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda, double anorm,
+                           double* rcond ); /* double twin of LAPACKE_spocon */
+lapack_int LAPACKE_cpocon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float anorm, float* rcond ); /* lapack_complex_float a; anorm, rcond stay float */
+lapack_int LAPACKE_zpocon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double anorm, double* rcond ); /* lapack_complex_double a; anorm, rcond stay double */
+
+lapack_int LAPACKE_spoequ( int matrix_order, lapack_int n, const float* a,
+                           lapack_int lda, float* s, float* scond,
+                           float* amax ); /* float; a const; s, scond, amax writable; likely wraps SPOEQU (confirm) */
+lapack_int LAPACKE_dpoequ( int matrix_order, lapack_int n, const double* a,
+                           lapack_int lda, double* s, double* scond,
+                           double* amax ); /* double twin of LAPACKE_spoequ */
+lapack_int LAPACKE_cpoequ( int matrix_order, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float* s, float* scond, float* amax ); /* lapack_complex_float a; s, scond, amax stay float */
+lapack_int LAPACKE_zpoequ( int matrix_order, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           double* s, double* scond, double* amax ); /* lapack_complex_double a; s, scond, amax stay double */
+
+lapack_int LAPACKE_spoequb( int matrix_order, lapack_int n, const float* a,
+                            lapack_int lda, float* s, float* scond,
+                            float* amax ); /* float; same parameter list as LAPACKE_spoequ; likely wraps SPOEQUB (confirm) */
+lapack_int LAPACKE_dpoequb( int matrix_order, lapack_int n, const double* a,
+                            lapack_int lda, double* s, double* scond,
+                            double* amax ); /* double twin of LAPACKE_spoequb */
+lapack_int LAPACKE_cpoequb( int matrix_order, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax ); /* lapack_complex_float a; s, scond, amax stay float */
+lapack_int LAPACKE_zpoequb( int matrix_order, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax ); /* lapack_complex_double a; s, scond, amax stay double */
+
+lapack_int LAPACKE_sporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr ); /* float; a, af, b const; x, ferr, berr writable; likely wraps SPORFS (confirm) */
+lapack_int LAPACKE_dporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr ); /* double twin of LAPACKE_sporfs */
+lapack_int LAPACKE_cporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* ferr, float* berr ); /* lapack_complex_float a/af/b/x; ferr, berr stay float */
+lapack_int LAPACKE_zporfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* ferr, double* berr ); /* lapack_complex_double a/af/b/x; ferr, berr stay double */
+
+lapack_int LAPACKE_sporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const float* s, const float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params ); /* float; a, af, s, b const, other pointers writable; likely wraps SPORFSX (confirm) */
+lapack_int LAPACKE_dporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const double* s, const double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params ); /* double twin of LAPACKE_sporfsx */
+lapack_int LAPACKE_cporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const float* s, const lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params ); /* lapack_complex_float a/af/b/x; s, bounds and params stay float */
+lapack_int LAPACKE_zporfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const double* s, const lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params ); /* lapack_complex_double a/af/b/x; s, bounds and params stay double */
+
+lapack_int LAPACKE_sposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* a, lapack_int lda, float* b,
+                          lapack_int ldb ); /* float; a and b writable; likely wraps SPOSV (confirm) */
+lapack_int LAPACKE_dposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* a, lapack_int lda, double* b,
+                          lapack_int ldb ); /* double twin of LAPACKE_sposv */
+lapack_int LAPACKE_cposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zposv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb ); /* lapack_complex_double variant */
+lapack_int LAPACKE_dsposv( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           lapack_int* iter ); /* double; adds x, ldx, iter vs. LAPACKE_dposv; likely wraps DSPOSV (confirm) */
+lapack_int LAPACKE_zcposv( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* iter ); /* lapack_complex_double analogue of LAPACKE_dsposv */
+
+lapack_int LAPACKE_sposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, float* a, lapack_int lda, float* af,
+                           lapack_int ldaf, char* equed, float* s, float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr ); /* float; every pointer is non-const; likely wraps SPOSVX (confirm) */
+lapack_int LAPACKE_dposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, double* a, lapack_int lda,
+                           double* af, lapack_int ldaf, char* equed, double* s,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr ); /* double twin of LAPACKE_sposvx */
+lapack_int LAPACKE_cposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, char* equed, float* s,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr ); /* lapack_complex_float a/af/b/x; s, rcond, ferr, berr stay float */
+lapack_int LAPACKE_zposvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, char* equed, double* s,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr ); /* lapack_complex_double a/af/b/x; s, rcond, ferr, berr stay double */
+
+lapack_int LAPACKE_sposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            char* equed, float* s, float* b, lapack_int ldb,
+                            float* x, lapack_int ldx, float* rcond,
+                            float* rpvgrw, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params ); /* float; every pointer is non-const; likely wraps SPOSVXX (confirm) */
+lapack_int LAPACKE_dposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            char* equed, double* s, double* b, lapack_int ldb,
+                            double* x, lapack_int ldx, double* rcond,
+                            double* rpvgrw, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params ); /* double twin of LAPACKE_sposvxx */
+lapack_int LAPACKE_cposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            char* equed, float* s, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* x,
+                            lapack_int ldx, float* rcond, float* rpvgrw,
+                            float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params ); /* lapack_complex_float a/af/b/x; remaining pointers stay float */
+lapack_int LAPACKE_zposvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            char* equed, double* s, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* x,
+                            lapack_int ldx, double* rcond, double* rpvgrw,
+                            double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params ); /* lapack_complex_double a/af/b/x; remaining pointers stay double */
+
+lapack_int LAPACKE_spotrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda ); /* float; a writable; likely wraps SPOTRF (confirm) */
+lapack_int LAPACKE_dpotrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda ); /* double twin of LAPACKE_spotrf */
+lapack_int LAPACKE_cpotrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpotrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spotri( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda ); /* float; a writable; likely wraps SPOTRI (confirm) */
+lapack_int LAPACKE_dpotri( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda ); /* double twin of LAPACKE_spotri */
+lapack_int LAPACKE_cpotri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpotri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_spotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           float* b, lapack_int ldb ); /* float; a const, b writable; likely wraps SPOTRS (confirm) */
+lapack_int LAPACKE_dpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           double* b, lapack_int ldb ); /* double twin of LAPACKE_spotrs */
+lapack_int LAPACKE_cpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb ); /* lapack_complex_float variant */
+lapack_int LAPACKE_zpotrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb ); /* lapack_complex_double variant */
+
+lapack_int LAPACKE_sppcon( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float anorm, float* rcond ); /* float; ap const, rcond writable; likely wraps SPPCON (confirm) */
+lapack_int LAPACKE_dppcon( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double anorm, double* rcond ); /* double twin of LAPACKE_sppcon */
+lapack_int LAPACKE_cppcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap, float anorm,
+                           float* rcond ); /* lapack_complex_float ap; anorm, rcond stay float */
+lapack_int LAPACKE_zppcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap, double anorm,
+                           double* rcond ); /* lapack_complex_double ap; anorm, rcond stay double */
+
+lapack_int LAPACKE_sppequ( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float* s, float* scond,
+                           float* amax ); /* float; ap const; s, scond, amax writable; likely wraps SPPEQU (confirm) */
+lapack_int LAPACKE_dppequ( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double* s, double* scond,
+                           double* amax ); /* double twin of LAPACKE_sppequ */
+lapack_int LAPACKE_cppequ( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap, float* s,
+                           float* scond, float* amax ); /* lapack_complex_float ap; s, scond, amax stay float */
+lapack_int LAPACKE_zppequ( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap, double* s,
+                           double* scond, double* amax ); /* lapack_complex_double ap; s, scond, amax stay double */
+
+lapack_int LAPACKE_spprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, const float* afp,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* ferr, float* berr ); /* float; ap, afp, b const; x, ferr, berr writable; likely wraps SPPRFS (confirm) */
+lapack_int LAPACKE_dpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, const double* afp,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* ferr, double* berr ); /* double twin of LAPACKE_spprfs */
+lapack_int LAPACKE_cpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr ); /* lapack_complex_float ap/afp/b/x; ferr, berr stay float */
+lapack_int LAPACKE_zpprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr ); /* lapack_complex_double ap/afp/b/x; ferr, berr stay double */
+/* LAPACKE_?ppsv (s/d/c/z). Per LAPACK docs: solves A*X = B for a packed positive-definite A. */
+lapack_int LAPACKE_sppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* ap, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* ap, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_cppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zppsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?ppsvx (s/d/c/z). Per LAPACK docs: expert driver for ?ppsv (optional equilibration, rcond, error bounds). */
+lapack_int LAPACKE_sppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, float* ap, float* afp, char* equed,
+                           float* s, float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, double* ap, double* afp,
+                           char* equed, double* s, double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_float* ap,
+                           lapack_complex_float* afp, char* equed, float* s,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zppsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, lapack_complex_double* ap,
+                           lapack_complex_double* afp, char* equed, double* s,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+/* LAPACKE_?pptrf (s/d/c/z). Per LAPACK docs: Cholesky factorization of a packed positive-definite matrix. */
+lapack_int LAPACKE_spptrf( int matrix_order, char uplo, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dpptrf( int matrix_order, char uplo, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_cpptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_zpptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap );
+/* LAPACKE_?pptri (s/d/c/z). Per LAPACK docs: inverse of a packed positive-definite matrix from its Cholesky factor. */
+lapack_int LAPACKE_spptri( int matrix_order, char uplo, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dpptri( int matrix_order, char uplo, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_cpptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_zpptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap );
+/* LAPACKE_?pptrs (s/d/c/z). Per LAPACK docs: solves A*X = B using the packed Cholesky factor (ap is a const input). */
+lapack_int LAPACKE_spptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_cpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?pstrf (s/d/c/z). Per LAPACK docs: Cholesky with complete pivoting of a positive-semidefinite matrix. */
+lapack_int LAPACKE_spstrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* piv, lapack_int* rank,
+                           float tol );
+lapack_int LAPACKE_dpstrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* piv, lapack_int* rank,
+                           double tol );
+lapack_int LAPACKE_cpstrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* piv, lapack_int* rank, float tol );
+lapack_int LAPACKE_zpstrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* piv, lapack_int* rank, double tol );
+/* LAPACKE_?ptcon (s/d/c/z); no matrix_order argument. Per LAPACK docs: rcond of a factored positive-definite tridiagonal matrix. */
+lapack_int LAPACKE_sptcon( lapack_int n, const float* d, const float* e,
+                           float anorm, float* rcond );
+lapack_int LAPACKE_dptcon( lapack_int n, const double* d, const double* e,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cptcon( lapack_int n, const float* d,
+                           const lapack_complex_float* e, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_zptcon( lapack_int n, const double* d,
+                           const lapack_complex_double* e, double anorm,
+                           double* rcond );
+/* LAPACKE_?pteqr (s/d/c/z). Per LAPACK docs: eigenvalues/eigenvectors of a positive-definite symmetric tridiagonal matrix. */
+lapack_int LAPACKE_spteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dpteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_cpteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zpteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+/* LAPACKE_?ptrfs (s/d/c/z); only c/z take char uplo. Per LAPACK docs: refines a positive-definite tridiagonal solve. */
+lapack_int LAPACKE_sptrfs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const float* d, const float* e, const float* df,
+                           const float* ef, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dptrfs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const double* d, const double* e, const double* df,
+                           const double* ef, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_cptrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e, const float* df,
+                           const lapack_complex_float* ef,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zptrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e, const double* df,
+                           const lapack_complex_double* ef,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* LAPACKE_?ptsv (s/d/c/z). Per LAPACK docs: solves A*X = B for a positive-definite tridiagonal A. */
+lapack_int LAPACKE_sptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* d, float* e, float* b, lapack_int ldb );
+lapack_int LAPACKE_dptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* d, double* e, double* b, lapack_int ldb );
+lapack_int LAPACKE_cptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          float* d, lapack_complex_float* e,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zptsv( int matrix_order, lapack_int n, lapack_int nrhs,
+                          double* d, lapack_complex_double* e,
+                          lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?ptsvx (s/d/c/z). Per LAPACK docs: expert driver for ?ptsv (rcond estimate plus ferr/berr error bounds). */
+lapack_int LAPACKE_sptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const float* d, const float* e,
+                           float* df, float* ef, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const double* d, const double* e,
+                           double* df, double* ef, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+lapack_int LAPACKE_cptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e, float* df,
+                           lapack_complex_float* ef,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zptsvx( int matrix_order, char fact, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e, double* df,
+                           lapack_complex_double* ef,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+/* LAPACKE_?pttrf (s/d/c/z); no matrix_order argument. Per LAPACK docs: L*D*L^T (L*D*L^H) factorization of a positive-definite tridiagonal matrix. */
+lapack_int LAPACKE_spttrf( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dpttrf( lapack_int n, double* d, double* e );
+lapack_int LAPACKE_cpttrf( lapack_int n, float* d, lapack_complex_float* e );
+lapack_int LAPACKE_zpttrf( lapack_int n, double* d, lapack_complex_double* e );
+/* LAPACKE_?pttrs (s/d/c/z); only c/z take char uplo. Per LAPACK docs: solves A*X = B using the ?pttrf factorization. */
+lapack_int LAPACKE_spttrs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const float* d, const float* e, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dpttrs( int matrix_order, lapack_int n, lapack_int nrhs,
+                           const double* d, const double* e, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_cpttrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* d,
+                           const lapack_complex_float* e,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpttrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* d,
+                           const lapack_complex_double* e,
+                           lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?sbev (s/d declared here). Per LAPACK docs: all eigenvalues and optionally eigenvectors of a symmetric band matrix. */
+lapack_int LAPACKE_ssbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, float* ab, lapack_int ldab, float* w,
+                          float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int kd, double* ab, lapack_int ldab, double* w,
+                          double* z, lapack_int ldz );
+/* LAPACKE_?sbevd (s/d declared here). Per LAPACK docs: same problem as ?sbev, using a divide-and-conquer algorithm. */
+lapack_int LAPACKE_ssbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab, float* w,
+                           float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab,
+                           double* w, double* z, lapack_int ldz );
+/* LAPACKE_?sbevx (s/d declared here). Per LAPACK docs: selected eigenvalues/eigenvectors of a symmetric band matrix (vl..vu or il..iu). */
+lapack_int LAPACKE_ssbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd, float* ab,
+                           lapack_int ldab, float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsbevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int kd, double* ab,
+                           lapack_int ldab, double* q, lapack_int ldq,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+/* LAPACKE_?sbgst (s/d declared here). Per LAPACK docs: reduces a banded generalized symmetric-definite eigenproblem to standard form. */
+lapack_int LAPACKE_ssbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, float* ab,
+                           lapack_int ldab, const float* bb, lapack_int ldbb,
+                           float* x, lapack_int ldx );
+lapack_int LAPACKE_dsbgst( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, double* ab,
+                           lapack_int ldab, const double* bb, lapack_int ldbb,
+                           double* x, lapack_int ldx );
+/* LAPACKE_?sbgv (s/d declared here). Per LAPACK docs: eigenvalues/eigenvectors of a banded generalized symmetric-definite eigenproblem. */
+lapack_int LAPACKE_ssbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb, float* ab,
+                          lapack_int ldab, float* bb, lapack_int ldbb, float* w,
+                          float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbgv( int matrix_order, char jobz, char uplo, lapack_int n,
+                          lapack_int ka, lapack_int kb, double* ab,
+                          lapack_int ldab, double* bb, lapack_int ldbb,
+                          double* w, double* z, lapack_int ldz );
+/* LAPACKE_?sbgvd (s/d declared here). Per LAPACK docs: same problem as ?sbgv, using a divide-and-conquer algorithm. */
+lapack_int LAPACKE_ssbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, float* ab,
+                           lapack_int ldab, float* bb, lapack_int ldbb,
+                           float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dsbgvd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           lapack_int ka, lapack_int kb, double* ab,
+                           lapack_int ldab, double* bb, lapack_int ldbb,
+                           double* w, double* z, lapack_int ldz );
+/* LAPACKE_?sbgvx (s/d declared here). Per LAPACK docs: selected eigenvalues/eigenvectors of the banded generalized symmetric-definite eigenproblem. */
+lapack_int LAPACKE_ssbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           float* ab, lapack_int ldab, float* bb,
+                           lapack_int ldbb, float* q, lapack_int ldq, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsbgvx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, lapack_int ka, lapack_int kb,
+                           double* ab, lapack_int ldab, double* bb,
+                           lapack_int ldbb, double* q, lapack_int ldq,
+                           double vl, double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+/* LAPACKE_?sbtrd (s/d declared here). Per LAPACK docs: reduces a symmetric band matrix to tridiagonal form (d, e; optional q). */
+lapack_int LAPACKE_ssbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, float* ab, lapack_int ldab, float* d,
+                           float* e, float* q, lapack_int ldq );
+lapack_int LAPACKE_dsbtrd( int matrix_order, char vect, char uplo, lapack_int n,
+                           lapack_int kd, double* ab, lapack_int ldab,
+                           double* d, double* e, double* q, lapack_int ldq );
+/* LAPACKE_?sfrk (s/d declared here). Per LAPACK docs: symmetric rank-k update with c held in rectangular full packed (RFP) format. */
+lapack_int LAPACKE_ssfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, float alpha,
+                          const float* a, lapack_int lda, float beta,
+                          float* c );
+lapack_int LAPACKE_dsfrk( int matrix_order, char transr, char uplo, char trans,
+                          lapack_int n, lapack_int k, double alpha,
+                          const double* a, lapack_int lda, double beta,
+                          double* c );
+/* LAPACKE_?spcon (s/d/c/z). Per LAPACK docs: rcond estimate of a packed symmetric matrix from its ?sptrf factorization (ap, ipiv). */
+lapack_int LAPACKE_sspcon( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, const lapack_int* ipiv, float anorm,
+                           float* rcond );
+lapack_int LAPACKE_dspcon( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, const lapack_int* ipiv,
+                           double anorm, double* rcond );
+lapack_int LAPACKE_cspcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zspcon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+/* LAPACKE_?spev (s/d declared here). Per LAPACK docs: all eigenvalues and optionally eigenvectors of a packed symmetric matrix. */
+lapack_int LAPACKE_sspev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          float* ap, float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          double* ap, double* w, double* z, lapack_int ldz );
+/* LAPACKE_?spevd (s/d declared here). Per LAPACK docs: same problem as ?spev, using a divide-and-conquer algorithm. */
+lapack_int LAPACKE_sspevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           float* ap, float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           double* ap, double* w, double* z, lapack_int ldz );
+/* LAPACKE_?spevx (s/d declared here). Per LAPACK docs: selected eigenvalues/eigenvectors of a packed symmetric matrix (vl..vu or il..iu). */
+lapack_int LAPACKE_sspevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* ap, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dspevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* ap, double vl, double vu,
+                           lapack_int il, lapack_int iu, double abstol,
+                           lapack_int* m, double* w, double* z, lapack_int ldz,
+                           lapack_int* ifail );
+/* LAPACKE_?spgst (s/d declared here). Per LAPACK docs: reduces a packed generalized symmetric-definite eigenproblem to standard form. */
+lapack_int LAPACKE_sspgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, float* ap, const float* bp );
+lapack_int LAPACKE_dspgst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, double* ap, const double* bp );
+/* LAPACKE_?spgv (s/d declared here). Per LAPACK docs: eigenvalues/eigenvectors of a packed generalized symmetric-definite eigenproblem. */
+lapack_int LAPACKE_sspgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, float* ap, float* bp,
+                          float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspgv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, double* ap, double* bp,
+                          double* w, double* z, lapack_int ldz );
+/* LAPACKE_?spgvd (s/d declared here). Per LAPACK docs: same problem as ?spgv, using a divide-and-conquer algorithm. */
+lapack_int LAPACKE_sspgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, float* ap, float* bp,
+                           float* w, float* z, lapack_int ldz );
+lapack_int LAPACKE_dspgvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, double* ap, double* bp,
+                           double* w, double* z, lapack_int ldz );
+/* LAPACKE_?spgvx (s/d declared here). Per LAPACK docs: selected eigenvalues/eigenvectors of the packed generalized symmetric-definite eigenproblem. */
+lapack_int LAPACKE_sspgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, float* ap,
+                           float* bp, float vl, float vu, lapack_int il,
+                           lapack_int iu, float abstol, lapack_int* m, float* w,
+                           float* z, lapack_int ldz, lapack_int* ifail );
+lapack_int LAPACKE_dspgvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, double* ap,
+                           double* bp, double vl, double vu, lapack_int il,
+                           lapack_int iu, double abstol, lapack_int* m,
+                           double* w, double* z, lapack_int ldz,
+                           lapack_int* ifail );
+/* LAPACKE_?sprfs (s/d/c/z). Per LAPACK docs: refines a packed symmetric indefinite solve; ferr/berr receive error bounds. */
+lapack_int LAPACKE_ssprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, const float* afp,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dsprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, const double* afp,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_csprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_complex_float* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zsprfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_complex_double* afp,
+                           const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* LAPACKE_?spsv (s/d/c/z). Per LAPACK docs: solves A*X = B for a packed symmetric indefinite A; ipiv receives the pivots. */
+lapack_int LAPACKE_sspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* ap, lapack_int* ipiv,
+                          float* b, lapack_int ldb );
+lapack_int LAPACKE_dspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* ap, lapack_int* ipiv,
+                          double* b, lapack_int ldb );
+lapack_int LAPACKE_cspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* ap,
+                          lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_zspsv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* ap,
+                          lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb );
+/* LAPACKE_?spsvx (s/d/c/z). Per LAPACK docs: expert driver for ?spsv (rcond estimate plus ferr/berr error bounds). */
+lapack_int LAPACKE_sspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap, float* afp,
+                           lapack_int* ipiv, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap, double* afp,
+                           lapack_int* ipiv, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_cspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           lapack_complex_float* afp, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zspsvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           lapack_complex_double* afp, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+/* LAPACKE_?sptrd (s/d declared here). Per LAPACK docs: reduces a packed symmetric matrix to tridiagonal form (d, e, tau). */
+lapack_int LAPACKE_ssptrd( int matrix_order, char uplo, lapack_int n, float* ap,
+                           float* d, float* e, float* tau );
+lapack_int LAPACKE_dsptrd( int matrix_order, char uplo, lapack_int n,
+                           double* ap, double* d, double* e, double* tau );
+/* LAPACKE_?sptrf (s/d/c/z). Per LAPACK docs: Bunch-Kaufman factorization of a packed symmetric matrix; ipiv receives the pivots. */
+lapack_int LAPACKE_ssptrf( int matrix_order, char uplo, lapack_int n, float* ap,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_dsptrf( int matrix_order, char uplo, lapack_int n,
+                           double* ap, lapack_int* ipiv );
+lapack_int LAPACKE_csptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zsptrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, lapack_int* ipiv );
+/* LAPACKE_?sptri (s/d/c/z). Per LAPACK docs: inverse of a packed symmetric matrix from its ?sptrf factorization (ipiv is a const input). */
+lapack_int LAPACKE_ssptri( int matrix_order, char uplo, lapack_int n, float* ap,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_dsptri( int matrix_order, char uplo, lapack_int n,
+                           double* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_csptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* ap, const lapack_int* ipiv );
+lapack_int LAPACKE_zsptri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* ap, const lapack_int* ipiv );
+/* LAPACKE_?sptrs (s/d/c/z). Per LAPACK docs: solves A*X = B using the ?sptrf factorization (ap and ipiv are const inputs). */
+lapack_int LAPACKE_ssptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* ap,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* ap,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* ap,
+                           const lapack_int* ipiv, lapack_complex_float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_zsptrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* ap,
+                           const lapack_int* ipiv, lapack_complex_double* b,
+                           lapack_int ldb );
+/* LAPACKE_?stebz (s/d declared here); no matrix_order argument. Per LAPACK docs: eigenvalues of a symmetric tridiagonal matrix by bisection. */
+lapack_int LAPACKE_sstebz( char range, char order, lapack_int n, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           const float* d, const float* e, lapack_int* m,
+                           lapack_int* nsplit, float* w, lapack_int* iblock,
+                           lapack_int* isplit );
+lapack_int LAPACKE_dstebz( char range, char order, lapack_int n, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, const double* d, const double* e,
+                           lapack_int* m, lapack_int* nsplit, double* w,
+                           lapack_int* iblock, lapack_int* isplit );
+/* LAPACKE_?stedc (s/d/c/z). Per LAPACK docs: eigenvalues/eigenvectors of a symmetric tridiagonal matrix by divide and conquer. */
+lapack_int LAPACKE_sstedc( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstedc( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_cstedc( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zstedc( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+/* LAPACKE_?stegr (s/d/c/z). Per LAPACK docs: selected eigenvalues/eigenvectors of a symmetric tridiagonal matrix (MRRR algorithm). */
+lapack_int LAPACKE_sstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+lapack_int LAPACKE_cstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* isuppz );
+lapack_int LAPACKE_zstegr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* isuppz );
+/* LAPACKE_?stein (s/d/c/z). Per LAPACK docs: eigenvectors of a symmetric tridiagonal matrix by inverse iteration, for given eigenvalues w. */
+lapack_int LAPACKE_sstein( int matrix_order, lapack_int n, const float* d,
+                           const float* e, lapack_int m, const float* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           float* z, lapack_int ldz, lapack_int* ifailv );
+lapack_int LAPACKE_dstein( int matrix_order, lapack_int n, const double* d,
+                           const double* e, lapack_int m, const double* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           double* z, lapack_int ldz, lapack_int* ifailv );
+lapack_int LAPACKE_cstein( int matrix_order, lapack_int n, const float* d,
+                           const float* e, lapack_int m, const float* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int* ifailv );
+lapack_int LAPACKE_zstein( int matrix_order, lapack_int n, const double* d,
+                           const double* e, lapack_int m, const double* w,
+                           const lapack_int* iblock, const lapack_int* isplit,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* ifailv );
+/* ?stemr: per LAPACK docs, selected tridiagonal eigenpairs (MRRR). */
+lapack_int LAPACKE_sstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, lapack_int* m,
+                           float* w, float* z, lapack_int ldz, lapack_int nzc,
+                           lapack_int* isuppz, lapack_logical* tryrac );
+lapack_int LAPACKE_dstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           lapack_int* m, double* w, double* z, lapack_int ldz,
+                           lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+lapack_int LAPACKE_cstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, lapack_int* m,
+                           float* w, lapack_complex_float* z, lapack_int ldz,
+                           lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+lapack_int LAPACKE_zstemr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           lapack_int* m, double* w, lapack_complex_double* z,
+                           lapack_int ldz, lapack_int nzc, lapack_int* isuppz,
+                           lapack_logical* tryrac );
+/* ?steqr: per LAPACK docs, tridiagonal eigenpairs by implicit QL/QR. */
+lapack_int LAPACKE_ssteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dsteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, double* z, lapack_int ldz );
+lapack_int LAPACKE_csteqr( int matrix_order, char compz, lapack_int n, float* d,
+                           float* e, lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zsteqr( int matrix_order, char compz, lapack_int n,
+                           double* d, double* e, lapack_complex_double* z,
+                           lapack_int ldz );
+/* ?sterf: per LAPACK docs, tridiagonal eigenvalues only; real s/d only. */
+lapack_int LAPACKE_ssterf( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dsterf( lapack_int n, double* d, double* e );
+/* ?stev: per LAPACK docs, tridiagonal eigenproblem driver; s/d only. */
+lapack_int LAPACKE_sstev( int matrix_order, char jobz, lapack_int n, float* d,
+                          float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstev( int matrix_order, char jobz, lapack_int n, double* d,
+                          double* e, double* z, lapack_int ldz );
+/* ?stevd: per LAPACK docs, divide-and-conquer variant of ?stev. */
+lapack_int LAPACKE_sstevd( int matrix_order, char jobz, lapack_int n, float* d,
+                           float* e, float* z, lapack_int ldz );
+lapack_int LAPACKE_dstevd( int matrix_order, char jobz, lapack_int n, double* d,
+                           double* e, double* z, lapack_int ldz );
+/* ?stevr: per LAPACK docs, selected tridiagonal eigenpairs (MRRR). */
+lapack_int LAPACKE_sstevr( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dstevr( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+/* ?stevx: per LAPACK docs, selected tridiagonal eigenpairs (expert). */
+lapack_int LAPACKE_sstevx( int matrix_order, char jobz, char range,
+                           lapack_int n, float* d, float* e, float vl, float vu,
+                           lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dstevx( int matrix_order, char jobz, char range,
+                           lapack_int n, double* d, double* e, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+/* ?sycon: per LAPACK docs, rcond estimate from a ?sytrf factorization. */
+lapack_int LAPACKE_ssycon( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_dsycon( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+lapack_int LAPACKE_csycon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv, float anorm, float* rcond );
+lapack_int LAPACKE_zsycon( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv, double anorm,
+                           double* rcond );
+/* ?syequb: per LAPACK docs, scaling factors to equilibrate symmetric A. */
+lapack_int LAPACKE_ssyequb( int matrix_order, char uplo, lapack_int n,
+                            const float* a, lapack_int lda, float* s,
+                            float* scond, float* amax );
+lapack_int LAPACKE_dsyequb( int matrix_order, char uplo, lapack_int n,
+                            const double* a, lapack_int lda, double* s,
+                            double* scond, double* amax );
+lapack_int LAPACKE_csyequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_float* a, lapack_int lda,
+                            float* s, float* scond, float* amax );
+lapack_int LAPACKE_zsyequb( int matrix_order, char uplo, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* s, double* scond, double* amax );
+/* ?syev: per LAPACK docs, real symmetric eigenproblem driver; s/d only. */
+lapack_int LAPACKE_ssyev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_dsyev( int matrix_order, char jobz, char uplo, lapack_int n,
+                          double* a, lapack_int lda, double* w );
+/* ?syevd: per LAPACK docs, divide-and-conquer variant of ?syev. */
+lapack_int LAPACKE_ssyevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           float* a, lapack_int lda, float* w );
+lapack_int LAPACKE_dsyevd( int matrix_order, char jobz, char uplo, lapack_int n,
+                           double* a, lapack_int lda, double* w );
+/* ?syevr: per LAPACK docs, selected symmetric eigenpairs (MRRR). */
+lapack_int LAPACKE_ssyevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* a, lapack_int lda, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* isuppz );
+lapack_int LAPACKE_dsyevr( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* a, lapack_int lda, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* isuppz );
+/* ?syevx: per LAPACK docs, selected symmetric eigenpairs (expert). */
+lapack_int LAPACKE_ssyevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, float* a, lapack_int lda, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsyevx( int matrix_order, char jobz, char range, char uplo,
+                           lapack_int n, double* a, lapack_int lda, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+/* ?sygst: per LAPACK docs, reduce generalized problem to standard form. */
+lapack_int LAPACKE_ssygst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, float* a, lapack_int lda,
+                           const float* b, lapack_int ldb );
+lapack_int LAPACKE_dsygst( int matrix_order, lapack_int itype, char uplo,
+                           lapack_int n, double* a, lapack_int lda,
+                           const double* b, lapack_int ldb );
+/* ?sygv: per LAPACK docs, generalized symmetric-definite eigen driver. */
+lapack_int LAPACKE_ssygv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, float* w );
+lapack_int LAPACKE_dsygv( int matrix_order, lapack_int itype, char jobz,
+                          char uplo, lapack_int n, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* w );
+/* ?sygvd: per LAPACK docs, divide-and-conquer variant of ?sygv. */
+lapack_int LAPACKE_ssygvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, float* a, lapack_int lda,
+                           float* b, lapack_int ldb, float* w );
+lapack_int LAPACKE_dsygvd( int matrix_order, lapack_int itype, char jobz,
+                           char uplo, lapack_int n, double* a, lapack_int lda,
+                           double* b, lapack_int ldb, double* w );
+/* ?sygvx: per LAPACK docs, selected eigenpairs variant of ?sygv. */
+lapack_int LAPACKE_ssygvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float vl,
+                           float vu, lapack_int il, lapack_int iu, float abstol,
+                           lapack_int* m, float* w, float* z, lapack_int ldz,
+                           lapack_int* ifail );
+lapack_int LAPACKE_dsygvx( int matrix_order, lapack_int itype, char jobz,
+                           char range, char uplo, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double vl,
+                           double vu, lapack_int il, lapack_int iu,
+                           double abstol, lapack_int* m, double* w, double* z,
+                           lapack_int ldz, lapack_int* ifail );
+/* ?syrfs: per LAPACK docs, refine symmetric solve; ferr/berr bounds. */
+lapack_int LAPACKE_ssyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const float* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dsyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const double* af, lapack_int ldaf,
+                           const lapack_int* ipiv, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_csyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_zsyrfs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* ?syrfsx: per LAPACK docs, extra-precise refinement form of ?syrfs. */
+lapack_int LAPACKE_ssyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const float* a,
+                            lapack_int lda, const float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const float* b, lapack_int ldb, float* x,
+                            lapack_int ldx, float* rcond, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dsyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs, const double* a,
+                            lapack_int lda, const double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const double* b, lapack_int ldb, double* x,
+                            lapack_int ldx, double* rcond, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_csyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_float* a, lapack_int lda,
+                            const lapack_complex_float* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const float* s,
+                            const lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* berr, lapack_int n_err_bnds,
+                            float* err_bnds_norm, float* err_bnds_comp,
+                            lapack_int nparams, float* params );
+lapack_int LAPACKE_zsyrfsx( int matrix_order, char uplo, char equed,
+                            lapack_int n, lapack_int nrhs,
+                            const lapack_complex_double* a, lapack_int lda,
+                            const lapack_complex_double* af, lapack_int ldaf,
+                            const lapack_int* ipiv, const double* s,
+                            const lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* berr, lapack_int n_err_bnds,
+                            double* err_bnds_norm, double* err_bnds_comp,
+                            lapack_int nparams, double* params );
+/* ?sysv: per LAPACK docs, solve symmetric indefinite A*X = B. */
+lapack_int LAPACKE_ssysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, float* a, lapack_int lda,
+                          lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, double* a, lapack_int lda,
+                          lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsysv( int matrix_order, char uplo, lapack_int n,
+                          lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* ipiv,
+                          lapack_complex_double* b, lapack_int ldb );
+/* ?sysvx: per LAPACK docs, expert ?sysv with rcond and error bounds. */
+lapack_int LAPACKE_ssysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           float* af, lapack_int ldaf, lapack_int* ipiv,
+                           const float* b, lapack_int ldb, float* x,
+                           lapack_int ldx, float* rcond, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dsysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           double* af, lapack_int ldaf, lapack_int* ipiv,
+                           const double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* rcond, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_csysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* ferr, float* berr );
+lapack_int LAPACKE_zsysvx( int matrix_order, char fact, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* af,
+                           lapack_int ldaf, lapack_int* ipiv,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* ferr, double* berr );
+/* ?sysvxx: per LAPACK docs, extra-precise expert form of ?sysvx. */
+lapack_int LAPACKE_ssysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, float* a,
+                            lapack_int lda, float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s, float* b,
+                            lapack_int ldb, float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_dsysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs, double* a,
+                            lapack_int lda, double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s, double* b,
+                            lapack_int ldb, double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+lapack_int LAPACKE_csysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_float* a, lapack_int lda,
+                            lapack_complex_float* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, float* s,
+                            lapack_complex_float* b, lapack_int ldb,
+                            lapack_complex_float* x, lapack_int ldx,
+                            float* rcond, float* rpvgrw, float* berr,
+                            lapack_int n_err_bnds, float* err_bnds_norm,
+                            float* err_bnds_comp, lapack_int nparams,
+                            float* params );
+lapack_int LAPACKE_zsysvxx( int matrix_order, char fact, char uplo,
+                            lapack_int n, lapack_int nrhs,
+                            lapack_complex_double* a, lapack_int lda,
+                            lapack_complex_double* af, lapack_int ldaf,
+                            lapack_int* ipiv, char* equed, double* s,
+                            lapack_complex_double* b, lapack_int ldb,
+                            lapack_complex_double* x, lapack_int ldx,
+                            double* rcond, double* rpvgrw, double* berr,
+                            lapack_int n_err_bnds, double* err_bnds_norm,
+                            double* err_bnds_comp, lapack_int nparams,
+                            double* params );
+/* ?sytrd: per LAPACK docs, reduce symmetric A to tridiagonal; s/d only. */
+lapack_int LAPACKE_ssytrd( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, float* d, float* e, float* tau );
+lapack_int LAPACKE_dsytrd( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, double* d, double* e, double* tau );
+/* ?sytrf: per LAPACK docs, Bunch-Kaufman factorization; fills ipiv. */
+lapack_int LAPACKE_ssytrf( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dsytrf( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_csytrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_int* ipiv );
+lapack_int LAPACKE_zsytrf( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_int* ipiv );
+/* ?sytri: per LAPACK docs, inverse from ?sytrf factors; ipiv is const. */
+lapack_int LAPACKE_ssytri( int matrix_order, char uplo, lapack_int n, float* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsytri( int matrix_order, char uplo, lapack_int n, double* a,
+                           lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_csytri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv );
+lapack_int LAPACKE_zsytri( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv );
+/* ?sytrs: per LAPACK docs, solve A*X = B using ?sytrf factors and ipiv. */
+lapack_int LAPACKE_ssytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float* b, lapack_int ldb );
+lapack_int LAPACKE_dsytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs( int matrix_order, char uplo, lapack_int n,
+                           lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv,
+                           lapack_complex_double* b, lapack_int ldb );
+/* ?tbcon: per LAPACK docs, rcond estimate for a triangular band matrix. */
+lapack_int LAPACKE_stbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd, const float* ab,
+                           lapack_int ldab, float* rcond );
+lapack_int LAPACKE_dtbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd, const double* ab,
+                           lapack_int ldab, double* rcond );
+lapack_int LAPACKE_ctbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           float* rcond );
+lapack_int LAPACKE_ztbcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, lapack_int kd,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           double* rcond );
+/* ?tbrfs: per LAPACK docs, error bounds for a triangular band solve. */
+lapack_int LAPACKE_stbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, const float* b,
+                           lapack_int ldb, const float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_dtbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, const double* b,
+                           lapack_int ldb, const double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+lapack_int LAPACKE_ctbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztbrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* ?tbtrs: per LAPACK docs, solve a triangular band system. */
+lapack_int LAPACKE_stbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const float* ab, lapack_int ldab, float* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_dtbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const double* ab, lapack_int ldab, double* b,
+                           lapack_int ldb );
+lapack_int LAPACKE_ctbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_float* ab, lapack_int ldab,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztbtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int kd, lapack_int nrhs,
+                           const lapack_complex_double* ab, lapack_int ldab,
+                           lapack_complex_double* b, lapack_int ldb );
+/* ?tfsm: per LAPACK docs, triangular solve with A in RFP storage. */
+lapack_int LAPACKE_stfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          float alpha, const float* a, float* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_dtfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          double alpha, const double* a, double* b,
+                          lapack_int ldb );
+lapack_int LAPACKE_ctfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          lapack_complex_float alpha,
+                          const lapack_complex_float* a,
+                          lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztfsm( int matrix_order, char transr, char side, char uplo,
+                          char trans, char diag, lapack_int m, lapack_int n,
+                          lapack_complex_double alpha,
+                          const lapack_complex_double* a,
+                          lapack_complex_double* b, lapack_int ldb );
+/* ?tftri: per LAPACK docs, invert a triangular matrix in RFP storage. */
+lapack_int LAPACKE_stftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, float* a );
+lapack_int LAPACKE_dtftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, double* a );
+lapack_int LAPACKE_ctftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_ztftri( int matrix_order, char transr, char uplo, char diag,
+                           lapack_int n, lapack_complex_double* a );
+/* ?tfttp: per LAPACK docs, copy triangular RFP (arf) to packed (ap). */
+lapack_int LAPACKE_stfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* arf, float* ap );
+lapack_int LAPACKE_dtfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* arf, double* ap );
+lapack_int LAPACKE_ctfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* arf,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztfttp( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* arf,
+                           lapack_complex_double* ap );
+/* ?tfttr: per LAPACK docs, copy triangular RFP (arf) to full (a, lda). */
+lapack_int LAPACKE_stfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* arf, float* a,
+                           lapack_int lda );
+lapack_int LAPACKE_dtfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* arf, double* a,
+                           lapack_int lda );
+lapack_int LAPACKE_ctfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* arf,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztfttr( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* arf,
+                           lapack_complex_double* a, lapack_int lda );
+/* ?tgevc: per LAPACK docs, eigenvectors of a triangular pair (s, p). */
+lapack_int LAPACKE_stgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* s, lapack_int lds, const float* p,
+                           lapack_int ldp, float* vl, lapack_int ldvl,
+                           float* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_dtgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* s, lapack_int lds, const double* p,
+                           lapack_int ldp, double* vl, lapack_int ldvl,
+                           double* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_ctgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* s, lapack_int lds,
+                           const lapack_complex_float* p, lapack_int ldp,
+                           lapack_complex_float* vl, lapack_int ldvl,
+                           lapack_complex_float* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztgevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* s, lapack_int lds,
+                           const lapack_complex_double* p, lapack_int ldp,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+/* ?tgexc: s/d pass ifst/ilst by pointer, c/z by value. Presumably xTGEXC. */
+lapack_int LAPACKE_stgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_dtgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double* q,
+                           lapack_int ldq, double* z, lapack_int ldz,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_ctgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztgexc( int matrix_order, lapack_logical wantq,
+                           lapack_logical wantz, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int ifst, lapack_int ilst );
+/* ?tgsen: s/d use alphar+alphai, c/z one complex alpha. Presumably xTGSEN. */
+lapack_int LAPACKE_stgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alphar, float* alphai, float* beta, float* q,
+                           lapack_int ldq, float* z, lapack_int ldz,
+                           lapack_int* m, float* pl, float* pr, float* dif );
+lapack_int LAPACKE_dtgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           double* a, lapack_int lda, double* b, lapack_int ldb,
+                           double* alphar, double* alphai, double* beta,
+                           double* q, lapack_int ldq, double* z, lapack_int ldz,
+                           lapack_int* m, double* pl, double* pr, double* dif );
+lapack_int LAPACKE_ctgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* alpha,
+                           lapack_complex_float* beta, lapack_complex_float* q,
+                           lapack_int ldq, lapack_complex_float* z,
+                           lapack_int ldz, lapack_int* m, float* pl, float* pr,
+                           float* dif );
+lapack_int LAPACKE_ztgsen( int matrix_order, lapack_int ijob,
+                           lapack_logical wantq, lapack_logical wantz,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* alpha,
+                           lapack_complex_double* beta,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* z, lapack_int ldz,
+                           lapack_int* m, double* pl, double* pr, double* dif );
+/* ?tgsja: tola/tolb/alpha/beta stay real-typed in c/z. Presumably xTGSJA. */
+lapack_int LAPACKE_stgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, float* a, lapack_int lda,
+                           float* b, lapack_int ldb, float tola, float tolb,
+                           float* alpha, float* beta, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, float* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_dtgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double tola, double tolb, double* alpha,
+                           double* beta, double* u, lapack_int ldu, double* v,
+                           lapack_int ldv, double* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_ctgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, float tola, float tolb, float* alpha,
+                           float* beta, lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_int* ncycle );
+lapack_int LAPACKE_ztgsja( int matrix_order, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_int k, lapack_int l, lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int ldb, double tola, double tolb,
+                           double* alpha, double* beta,
+                           lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int* ncycle );
+/* ?tgsna: a/b/vl/vr const; s/dif/m writable. Presumably LAPACK xTGSNA. */
+lapack_int LAPACKE_stgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* a, lapack_int lda, const float* b,
+                           lapack_int ldb, const float* vl, lapack_int ldvl,
+                           const float* vr, lapack_int ldvr, float* s,
+                           float* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_dtgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* a, lapack_int lda, const double* b,
+                           lapack_int ldb, const double* vl, lapack_int ldvl,
+                           const double* vr, lapack_int ldvr, double* s,
+                           double* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ctgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* vl, lapack_int ldvl,
+                           const lapack_complex_float* vr, lapack_int ldvr,
+                           float* s, float* dif, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztgsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* vl, lapack_int ldvl,
+                           const lapack_complex_double* vr, lapack_int ldvr,
+                           double* s, double* dif, lapack_int mm,
+                           lapack_int* m );
+/* ?tgsyl: a/b/d/e const; c/f/scale/dif writable. Presumably LAPACK xTGSYL. */
+lapack_int LAPACKE_stgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n, const float* a,
+                           lapack_int lda, const float* b, lapack_int ldb,
+                           float* c, lapack_int ldc, const float* d,
+                           lapack_int ldd, const float* e, lapack_int lde,
+                           float* f, lapack_int ldf, float* scale, float* dif );
+lapack_int LAPACKE_dtgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n, const double* a,
+                           lapack_int lda, const double* b, lapack_int ldb,
+                           double* c, lapack_int ldc, const double* d,
+                           lapack_int ldd, const double* e, lapack_int lde,
+                           double* f, lapack_int ldf, double* scale,
+                           double* dif );
+lapack_int LAPACKE_ctgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* c, lapack_int ldc,
+                           const lapack_complex_float* d, lapack_int ldd,
+                           const lapack_complex_float* e, lapack_int lde,
+                           lapack_complex_float* f, lapack_int ldf,
+                           float* scale, float* dif );
+lapack_int LAPACKE_ztgsyl( int matrix_order, char trans, lapack_int ijob,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* c, lapack_int ldc,
+                           const lapack_complex_double* d, lapack_int ldd,
+                           const lapack_complex_double* e, lapack_int lde,
+                           lapack_complex_double* f, lapack_int ldf,
+                           double* scale, double* dif );
+/* ?tpcon: const ap; rcond is real-typed in c/z too. Presumably xTPCON. */
+lapack_int LAPACKE_stpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const float* ap, float* rcond );
+lapack_int LAPACKE_dtpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const double* ap, double* rcond );
+lapack_int LAPACKE_ctpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_float* ap,
+                           float* rcond );
+lapack_int LAPACKE_ztpcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_double* ap,
+                           double* rcond );
+/* ?tprfs: ap/b/x const; ferr/berr real-typed, writable. Presumably xTPRFS. */
+lapack_int LAPACKE_stprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* ap,
+                           const float* b, lapack_int ldb, const float* x,
+                           lapack_int ldx, float* ferr, float* berr );
+lapack_int LAPACKE_dtprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* ap,
+                           const double* b, lapack_int ldb, const double* x,
+                           lapack_int ldx, double* ferr, double* berr );
+lapack_int LAPACKE_ctprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztprfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* ?tptri: ap is writable (not const). Presumably LAPACK xTPTRI. */
+lapack_int LAPACKE_stptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           float* ap );
+lapack_int LAPACKE_dtptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           double* ap );
+lapack_int LAPACKE_ctptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztptri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_double* ap );
+/* ?tptrs: const ap, writable b/ldb. Presumably LAPACK xTPTRS. */
+lapack_int LAPACKE_stptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* ap,
+                           float* b, lapack_int ldb );
+lapack_int LAPACKE_dtptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* ap,
+                           double* b, lapack_int ldb );
+lapack_int LAPACKE_ctptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* ap,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztptrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* ap,
+                           lapack_complex_double* b, lapack_int ldb );
+/* ?tpttf: const ap, writable arf. Presumably LAPACK xTPTTF. */
+lapack_int LAPACKE_stpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* ap, float* arf );
+lapack_int LAPACKE_dtpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* ap, double* arf );
+lapack_int LAPACKE_ctpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* ap,
+                           lapack_complex_float* arf );
+lapack_int LAPACKE_ztpttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* ap,
+                           lapack_complex_double* arf );
+/* ?tpttr: const ap, writable a/lda. Presumably LAPACK xTPTTR. */
+lapack_int LAPACKE_stpttr( int matrix_order, char uplo, lapack_int n,
+                           const float* ap, float* a, lapack_int lda );
+lapack_int LAPACKE_dtpttr( int matrix_order, char uplo, lapack_int n,
+                           const double* ap, double* a, lapack_int lda );
+lapack_int LAPACKE_ctpttr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztpttr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           lapack_complex_double* a, lapack_int lda );
+/* ?trcon: const a/lda; rcond is real-typed in c/z too. Presumably xTRCON. */
+lapack_int LAPACKE_strcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* rcond );
+lapack_int LAPACKE_dtrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const double* a, lapack_int lda,
+                           double* rcond );
+lapack_int LAPACKE_ctrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, float* rcond );
+lapack_int LAPACKE_ztrcon( int matrix_order, char norm, char uplo, char diag,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, double* rcond );
+/* ?trevc: s/d take writable select + const t; c/z the reverse. Presumably xTREVC. */
+lapack_int LAPACKE_strevc( int matrix_order, char side, char howmny,
+                           lapack_logical* select, lapack_int n, const float* t,
+                           lapack_int ldt, float* vl, lapack_int ldvl,
+                           float* vr, lapack_int ldvr, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_dtrevc( int matrix_order, char side, char howmny,
+                           lapack_logical* select, lapack_int n,
+                           const double* t, lapack_int ldt, double* vl,
+                           lapack_int ldvl, double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ctrevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* vl, lapack_int ldvl,
+                           lapack_complex_float* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztrevc( int matrix_order, char side, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* vl, lapack_int ldvl,
+                           lapack_complex_double* vr, lapack_int ldvr,
+                           lapack_int mm, lapack_int* m );
+/* ?trexc: s/d pass ifst/ilst by pointer, c/z by value. Presumably xTREXC. */
+lapack_int LAPACKE_strexc( int matrix_order, char compq, lapack_int n, float* t,
+                           lapack_int ldt, float* q, lapack_int ldq,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_dtrexc( int matrix_order, char compq, lapack_int n,
+                           double* t, lapack_int ldt, double* q, lapack_int ldq,
+                           lapack_int* ifst, lapack_int* ilst );
+lapack_int LAPACKE_ctrexc( int matrix_order, char compq, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztrexc( int matrix_order, char compq, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int ifst, lapack_int ilst );
+/* ?trrfs: a/b/x const; ferr/berr real-typed, writable. Presumably xTRRFS. */
+lapack_int LAPACKE_strrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, const float* b, lapack_int ldb,
+                           const float* x, lapack_int ldx, float* ferr,
+                           float* berr );
+lapack_int LAPACKE_dtrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, const double* b, lapack_int ldb,
+                           const double* x, lapack_int ldx, double* ferr,
+                           double* berr );
+lapack_int LAPACKE_ctrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           const lapack_complex_float* x, lapack_int ldx,
+                           float* ferr, float* berr );
+lapack_int LAPACKE_ztrrfs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           const lapack_complex_double* x, lapack_int ldx,
+                           double* ferr, double* berr );
+/* ?trsen: s/d use wr+wi, c/z one complex w. Presumably LAPACK xTRSEN. */
+lapack_int LAPACKE_strsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n, float* t,
+                           lapack_int ldt, float* q, lapack_int ldq, float* wr,
+                           float* wi, lapack_int* m, float* s, float* sep );
+lapack_int LAPACKE_dtrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           double* t, lapack_int ldt, double* q, lapack_int ldq,
+                           double* wr, double* wi, lapack_int* m, double* s,
+                           double* sep );
+lapack_int LAPACKE_ctrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_float* t, lapack_int ldt,
+                           lapack_complex_float* q, lapack_int ldq,
+                           lapack_complex_float* w, lapack_int* m, float* s,
+                           float* sep );
+lapack_int LAPACKE_ztrsen( int matrix_order, char job, char compq,
+                           const lapack_logical* select, lapack_int n,
+                           lapack_complex_double* t, lapack_int ldt,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_complex_double* w, lapack_int* m, double* s,
+                           double* sep );
+/* ?trsna: t/vl/vr const; s/sep/m writable. Presumably LAPACK xTRSNA. */
+lapack_int LAPACKE_strsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const float* t, lapack_int ldt, const float* vl,
+                           lapack_int ldvl, const float* vr, lapack_int ldvr,
+                           float* s, float* sep, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_dtrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const double* t, lapack_int ldt, const double* vl,
+                           lapack_int ldvl, const double* vr, lapack_int ldvr,
+                           double* s, double* sep, lapack_int mm,
+                           lapack_int* m );
+lapack_int LAPACKE_ctrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_float* t, lapack_int ldt,
+                           const lapack_complex_float* vl, lapack_int ldvl,
+                           const lapack_complex_float* vr, lapack_int ldvr,
+                           float* s, float* sep, lapack_int mm, lapack_int* m );
+lapack_int LAPACKE_ztrsna( int matrix_order, char job, char howmny,
+                           const lapack_logical* select, lapack_int n,
+                           const lapack_complex_double* t, lapack_int ldt,
+                           const lapack_complex_double* vl, lapack_int ldvl,
+                           const lapack_complex_double* vr, lapack_int ldvr,
+                           double* s, double* sep, lapack_int mm,
+                           lapack_int* m );
+/* ?trsyl: const a/b, writable c; scale real-typed. Presumably xTRSYL. */
+lapack_int LAPACKE_strsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const float* a, lapack_int lda, const float* b,
+                           lapack_int ldb, float* c, lapack_int ldc,
+                           float* scale );
+lapack_int LAPACKE_dtrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, const double* b,
+                           lapack_int ldb, double* c, lapack_int ldc,
+                           double* scale );
+lapack_int LAPACKE_ctrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* c, lapack_int ldc,
+                           float* scale );
+lapack_int LAPACKE_ztrsyl( int matrix_order, char trana, char tranb,
+                           lapack_int isgn, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* c, lapack_int ldc,
+                           double* scale );
+/* ?trtri: a/lda is writable (not const). Presumably LAPACK xTRTRI. */
+lapack_int LAPACKE_strtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           float* a, lapack_int lda );
+lapack_int LAPACKE_dtrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           double* a, lapack_int lda );
+lapack_int LAPACKE_ctrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztrtri( int matrix_order, char uplo, char diag, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda );
+/* ?trtrs: const a/lda, writable b/ldb. Presumably LAPACK xTRTRS. */
+lapack_int LAPACKE_strtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztrtrs( int matrix_order, char uplo, char trans, char diag,
+                           lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb );
+/* ?trttf: const a/lda, writable arf. Presumably LAPACK xTRTTF. */
+lapack_int LAPACKE_strttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* arf );
+lapack_int LAPACKE_dtrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const double* a, lapack_int lda,
+                           double* arf );
+lapack_int LAPACKE_ctrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* arf );
+lapack_int LAPACKE_ztrttf( int matrix_order, char transr, char uplo,
+                           lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, lapack_complex_double* arf );
+/* ?trttp: const a/lda, writable ap. Presumably LAPACK xTRTTP. */
+lapack_int LAPACKE_strttp( int matrix_order, char uplo, lapack_int n,
+                           const float* a, lapack_int lda, float* ap );
+lapack_int LAPACKE_dtrttp( int matrix_order, char uplo, lapack_int n,
+                           const double* a, lapack_int lda, double* ap );
+lapack_int LAPACKE_ctrttp( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* ap );
+lapack_int LAPACKE_ztrttp( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* ap );
+/* ?tzrzf: writable a/lda and tau. Presumably LAPACK xTZRZF. */
+lapack_int LAPACKE_stzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           float* a, lapack_int lda, float* tau );
+lapack_int LAPACKE_dtzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           double* a, lapack_int lda, double* tau );
+lapack_int LAPACKE_ctzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau );
+lapack_int LAPACKE_ztzrzf( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau );
+/* ?ungbr (c/z only): writable a/lda, const tau. Presumably LAPACK xUNGBR. */
+lapack_int LAPACKE_cungbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungbr( int matrix_order, char vect, lapack_int m,
+                           lapack_int n, lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+/* ?unghr (c/z only): writable a/lda, const tau. Presumably LAPACK xUNGHR. */
+lapack_int LAPACKE_cunghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zunghr( int matrix_order, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+/* ?unglq (c/z only): writable a/lda, const tau. Presumably LAPACK xUNGLQ. */
+lapack_int LAPACKE_cunglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zunglq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+/* ?ungql (c/z only): writable a/lda, const tau. Presumably LAPACK xUNGQL. */
+lapack_int LAPACKE_cungql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungql( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+/* ?ungqr (c/z only): writable a/lda, const tau. Presumably LAPACK xUNGQR. */
+lapack_int LAPACKE_cungqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungqr( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+/* ?ungrq (c/z only): writable a/lda, const tau. Presumably LAPACK xUNGRQ. */
+lapack_int LAPACKE_cungrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau );
+lapack_int LAPACKE_zungrq( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int k, lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau );
+/* {c,z}ungtr: takes a char uplo flag; a is non-const, tau const. Presumably LAPACK xUNGTR - confirm. */
+lapack_int LAPACKE_cungtr( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau );
+lapack_int LAPACKE_zungtr( int matrix_order, char uplo, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau );
+/* {c,z}unmbr: a and tau are const; c is the only non-const pointer. Presumably LAPACK xUNMBR - confirm. */
+lapack_int LAPACKE_cunmbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmbr( int matrix_order, char vect, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}unmhr: takes ilo/ihi; a and tau const, c non-const. Presumably LAPACK xUNMHR - confirm. */
+lapack_int LAPACKE_cunmhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmhr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int ilo,
+                           lapack_int ihi, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}unmlq: a and tau const; c is the only non-const pointer. Presumably LAPACK xUNMLQ - confirm. */
+lapack_int LAPACKE_cunmlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmlq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}unmql: same parameter list as LAPACKE_{c,z}unmlq. Presumably LAPACK xUNMQL - confirm. */
+lapack_int LAPACKE_cunmql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmql( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}unmqr: same parameter list as LAPACKE_{c,z}unmlq. Presumably LAPACK xUNMQR - confirm. */
+lapack_int LAPACKE_cunmqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmqr( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}unmrq: same parameter list as LAPACKE_{c,z}unmlq. Presumably LAPACK xUNMRQ - confirm. */
+lapack_int LAPACKE_cunmrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmrq( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}unmrz: like LAPACKE_{c,z}unmrq plus an extra lapack_int l after k. Presumably LAPACK xUNMRZ - confirm. */
+lapack_int LAPACKE_cunmrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmrz( int matrix_order, char side, char trans,
+                           lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}unmtr: takes side, uplo and trans flags; only c is non-const. Presumably LAPACK xUNMTR - confirm. */
+lapack_int LAPACKE_cunmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zunmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {c,z}upgtr: ap has no leading-dimension argument (packed storage? confirm); q is the only non-const pointer. */
+lapack_int LAPACKE_cupgtr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* q, lapack_int ldq );
+lapack_int LAPACKE_zupgtr( int matrix_order, char uplo, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* q, lapack_int ldq );
+/* {c,z}upmtr: const ap (no leading dimension) and const tau; c is non-const. Presumably LAPACK xUPMTR - confirm. */
+lapack_int LAPACKE_cupmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_float* ap,
+                           const lapack_complex_float* tau,
+                           lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zupmtr( int matrix_order, char side, char uplo, char trans,
+                           lapack_int m, lapack_int n,
+                           const lapack_complex_double* ap,
+                           const lapack_complex_double* tau,
+                           lapack_complex_double* c, lapack_int ldc );
+/* {s,d}bdsdc_work: real-only; explicit work and iwork pointers; no const args. Presumably LAPACK xBDSDC - confirm. */
+lapack_int LAPACKE_sbdsdc_work( int matrix_order, char uplo, char compq,
+                                lapack_int n, float* d, float* e, float* u,
+                                lapack_int ldu, float* vt, lapack_int ldvt,
+                                float* q, lapack_int* iq, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dbdsdc_work( int matrix_order, char uplo, char compq,
+                                lapack_int n, double* d, double* e, double* u,
+                                lapack_int ldu, double* vt, lapack_int ldvt,
+                                double* q, lapack_int* iq, double* work,
+                                lapack_int* iwork );
+/* {s,d,c,z}bdsqr_work: in c/z, d, e and work stay real while vt, u, c are complex. Presumably xBDSQR - confirm. */
+lapack_int LAPACKE_sbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                float* d, float* e, float* vt, lapack_int ldvt,
+                                float* u, lapack_int ldu, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                double* d, double* e, double* vt,
+                                lapack_int ldvt, double* u, lapack_int ldu,
+                                double* c, lapack_int ldc, double* work );
+lapack_int LAPACKE_cbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                float* d, float* e, lapack_complex_float* vt,
+                                lapack_int ldvt, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_zbdsqr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                                double* d, double* e, lapack_complex_double* vt,
+                                lapack_int ldvt, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* c,
+                                lapack_int ldc, double* work );
+/* {s,d}disna_work: no matrix_order and no work argument; d is const, sep is not. Presumably xDISNA - confirm. */
+lapack_int LAPACKE_sdisna_work( char job, lapack_int m, lapack_int n,
+                                const float* d, float* sep );
+lapack_int LAPACKE_ddisna_work( char job, lapack_int m, lapack_int n,
+                                const double* d, double* sep );
+/* {s,d,c,z}gbbrd_work: c/z keep d and e real and add a real rwork after work. Presumably LAPACK xGBBRD - confirm. */
+lapack_int LAPACKE_sgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, float* ab, lapack_int ldab,
+                                float* d, float* e, float* q, lapack_int ldq,
+                                float* pt, lapack_int ldpt, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, double* ab, lapack_int ldab,
+                                double* d, double* e, double* q, lapack_int ldq,
+                                double* pt, lapack_int ldpt, double* c,
+                                lapack_int ldc, double* work );
+lapack_int LAPACKE_cgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, lapack_complex_float* ab,
+                                lapack_int ldab, float* d, float* e,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* pt, lapack_int ldpt,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgbbrd_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int ncc, lapack_int kl,
+                                lapack_int ku, lapack_complex_double* ab,
+                                lapack_int ldab, double* d, double* e,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* pt, lapack_int ldpt,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, double* rwork );
+/* {s,d,c,z}gbcon_work: anorm by value, rcond by pointer; s/d take work+iwork, c/z work+rwork. xGBCON? confirm. */
+lapack_int LAPACKE_sgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgbcon_work( int matrix_order, char norm, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+/* {s,d,c,z}gbequ_work: only ab is complex in c/z; r, c, rowcnd, colcnd, amax stay real; no work. xGBEQU? confirm. */
+lapack_int LAPACKE_sgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* ab,
+                                lapack_int ldab, float* r, float* c,
+                                float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* ab,
+                                lapack_int ldab, double* r, double* c,
+                                double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                float* r, float* c, float* rowcnd,
+                                float* colcnd, float* amax );
+lapack_int LAPACKE_zgbequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, double* r, double* c,
+                                double* rowcnd, double* colcnd, double* amax );
+/* {s,d,c,z}gbequb_work: parameter lists match LAPACKE_?gbequ_work. Presumably LAPACK xGBEQUB - confirm. */
+lapack_int LAPACKE_sgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku, const float* ab,
+                                 lapack_int ldab, float* r, float* c,
+                                 float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_dgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku, const double* ab,
+                                 lapack_int ldab, double* r, double* c,
+                                 double* rowcnd, double* colcnd, double* amax );
+lapack_int LAPACKE_cgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku,
+                                 const lapack_complex_float* ab,
+                                 lapack_int ldab, float* r, float* c,
+                                 float* rowcnd, float* colcnd, float* amax );
+lapack_int LAPACKE_zgbequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_int kl, lapack_int ku,
+                                 const lapack_complex_double* ab,
+                                 lapack_int ldab, double* r, double* c,
+                                 double* rowcnd, double* colcnd, double* amax );
+/* {s,d,c,z}gbrfs_work: ab, afb, ipiv, b const; x, ferr, berr not; s/d use iwork, c/z rwork. xGBRFS? confirm. */
+lapack_int LAPACKE_sgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const float* ab, lapack_int ldab,
+                                const float* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const double* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* afb,
+                                lapack_int ldafb, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgbrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab,
+                                const lapack_complex_double* afb,
+                                lapack_int ldafb, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* {s,d,c,z}gbrfsx_work: equed by value, r and c const; adds rcond, err_bnds_* and params. xGBRFSX? confirm. */
+lapack_int LAPACKE_sgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, const float* ab,
+                                 lapack_int ldab, const float* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const float* r, const float* c, const float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, const double* ab,
+                                 lapack_int ldab, const double* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs,
+                                 const lapack_complex_float* ab,
+                                 lapack_int ldab,
+                                 const lapack_complex_float* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const float* r, const float* c,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgbrfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs,
+                                 const lapack_complex_double* ab,
+                                 lapack_int ldab,
+                                 const lapack_complex_double* afb,
+                                 lapack_int ldafb, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+/* {s,d,c,z}gbsv_work: no work argument; ab, ipiv and b are all non-const. Presumably LAPACK xGBSV - confirm. */
+lapack_int LAPACKE_sgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs, float* ab,
+                               lapack_int ldab, lapack_int* ipiv, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs, double* ab,
+                               lapack_int ldab, lapack_int* ipiv, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zgbsv_work( int matrix_order, lapack_int n, lapack_int kl,
+                               lapack_int ku, lapack_int nrhs,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+/* {s,d,c,z}gbsvx_work: equed is char*, no const pointers; s/d use iwork, c/z rwork. Presumably xGBSVX - confirm. */
+lapack_int LAPACKE_sgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, float* ab, lapack_int ldab,
+                                float* afb, lapack_int ldafb, lapack_int* ipiv,
+                                char* equed, float* r, float* c, float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, double* ab, lapack_int ldab,
+                                double* afb, lapack_int ldafb, lapack_int* ipiv,
+                                char* equed, double* r, double* c, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed,
+                                float* r, float* c, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgbsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed,
+                                double* r, double* c, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+/* {s,d,c,z}gbsvxx_work: vs ?gbsvx_work, has rpvgrw instead of ferr, plus err_bnds_* and params. xGBSVXX? confirm. */
+lapack_int LAPACKE_sgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, float* ab, lapack_int ldab,
+                                 float* afb, lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, float* r, float* c, float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, double* ab, lapack_int ldab,
+                                 double* afb, lapack_int ldafb,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, double* b, lapack_int ldb,
+                                 double* x, lapack_int ldx, double* rcond,
+                                 double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, lapack_complex_float* ab,
+                                 lapack_int ldab, lapack_complex_float* afb,
+                                 lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, float* r, float* c,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgbsvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int kl, lapack_int ku,
+                                 lapack_int nrhs, lapack_complex_double* ab,
+                                 lapack_int ldab, lapack_complex_double* afb,
+                                 lapack_int ldafb, lapack_int* ipiv,
+                                 char* equed, double* r, double* c,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+/* {s,d,c,z}gbtrf_work: no work argument; ab and ipiv are both non-const. Presumably LAPACK xGBTRF - confirm. */
+lapack_int LAPACKE_sgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, float* ab,
+                                lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_dgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, double* ab,
+                                lapack_int ldab, lapack_int* ipiv );
+lapack_int LAPACKE_cgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgbtrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_int* ipiv );
+/* {s,d,c,z}gbtrs_work: ab and ipiv are const, b is not; no work argument. Presumably LAPACK xGBTRS - confirm. */
+lapack_int LAPACKE_sgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const lapack_int* ipiv, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zgbtrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int kl, lapack_int ku, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+/* ?gebak_work (s/d/c/z): scale is const and real even for c/z; v is non-const. Presumably LAPACK ?GEBAK; confirm. */
+lapack_int LAPACKE_sgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* scale, lapack_int m, float* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_dgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* scale, lapack_int m, double* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_cgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* scale, lapack_int m,
+                                lapack_complex_float* v, lapack_int ldv );
+lapack_int LAPACKE_zgebak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* scale, lapack_int m,
+                                lapack_complex_double* v, lapack_int ldv );
+/* ?gebal_work (s/d/c/z): ilo, ihi, scale are non-const pointers; scale is real for c/z. Presumably LAPACK ?GEBAL; confirm. */
+lapack_int LAPACKE_sgebal_work( int matrix_order, char job, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ilo,
+                                lapack_int* ihi, float* scale );
+lapack_int LAPACKE_dgebal_work( int matrix_order, char job, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ilo,
+                                lapack_int* ihi, double* scale );
+lapack_int LAPACKE_cgebal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ilo, lapack_int* ihi,
+                                float* scale );
+lapack_int LAPACKE_zgebal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* scale );
+/* ?gebrd_work (s/d/c/z): d and e are real for c/z; takes work plus lwork. Presumably LAPACK ?GEBRD; confirm. */
+lapack_int LAPACKE_sgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* d, float* e,
+                                float* tauq, float* taup, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* d, double* e,
+                                double* tauq, double* taup, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* d, float* e, lapack_complex_float* tauq,
+                                lapack_complex_float* taup,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgebrd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* d, double* e,
+                                lapack_complex_double* tauq,
+                                lapack_complex_double* taup,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?gecon_work (s/d/c/z): a is const; s/d take iwork, c/z take rwork instead. Presumably LAPACK ?GECON; confirm. */
+lapack_int LAPACKE_sgecon_work( int matrix_order, char norm, lapack_int n,
+                                const float* a, lapack_int lda, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgecon_work( int matrix_order, char norm, lapack_int n,
+                                const double* a, lapack_int lda, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgecon_work( int matrix_order, char norm, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgecon_work( int matrix_order, char norm, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+/* ?geequ_work (s/d/c/z): a is const; r, c, rowcnd, colcnd, amax are real pointers. Presumably LAPACK ?GEEQU; confirm. */
+lapack_int LAPACKE_sgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda, float* r,
+                                float* c, float* rowcnd, float* colcnd,
+                                float* amax );
+lapack_int LAPACKE_dgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, double* r,
+                                double* c, double* rowcnd, double* colcnd,
+                                double* amax );
+lapack_int LAPACKE_cgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* r, float* c, float* rowcnd,
+                                float* colcnd, float* amax );
+lapack_int LAPACKE_zgeequ_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* r, double* c, double* rowcnd,
+                                double* colcnd, double* amax );
+/* ?geequb_work (s/d/c/z): a is const; r, c, rowcnd, colcnd, amax are real pointers. Presumably LAPACK ?GEEQUB; confirm. */
+lapack_int LAPACKE_sgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const float* a, lapack_int lda, float* r,
+                                 float* c, float* rowcnd, float* colcnd,
+                                 float* amax );
+lapack_int LAPACKE_dgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const double* a, lapack_int lda, double* r,
+                                 double* c, double* rowcnd, double* colcnd,
+                                 double* amax );
+lapack_int LAPACKE_cgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* r, float* c, float* rowcnd,
+                                 float* colcnd, float* amax );
+lapack_int LAPACKE_zgeequb_work( int matrix_order, lapack_int m, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* r, double* c, double* rowcnd,
+                                 double* colcnd, double* amax );
+/* ?gees_work (s/d/c/z): s/d take a SELECT2 callback and wr/wi; c/z take SELECT1, w and rwork. Presumably LAPACK ?GEES; confirm. */
+lapack_int LAPACKE_sgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                               lapack_int lda, lapack_int* sdim, float* wr,
+                               float* wi, float* vs, lapack_int ldvs,
+                               float* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_dgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                               lapack_int lda, lapack_int* sdim, double* wr,
+                               double* wi, double* vs, lapack_int ldvs,
+                               double* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_cgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_C_SELECT1 select, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_int* sdim, lapack_complex_float* w,
+                               lapack_complex_float* vs, lapack_int ldvs,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgees_work( int matrix_order, char jobvs, char sort,
+                               LAPACK_Z_SELECT1 select, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_int* sdim, lapack_complex_double* w,
+                               lapack_complex_double* vs, lapack_int ldvs,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_logical* bwork );
+/* ?geesx_work (s/d/c/z): takes sense, rconde, rcondv; s/d add iwork/liwork, c/z add rwork. Presumably LAPACK ?GEESX; confirm. */
+lapack_int LAPACKE_sgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_S_SELECT2 select, char sense,
+                                lapack_int n, float* a, lapack_int lda,
+                                lapack_int* sdim, float* wr, float* wi,
+                                float* vs, lapack_int ldvs, float* rconde,
+                                float* rcondv, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_dgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_D_SELECT2 select, char sense,
+                                lapack_int n, double* a, lapack_int lda,
+                                lapack_int* sdim, double* wr, double* wi,
+                                double* vs, lapack_int ldvs, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_cgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_C_SELECT1 select, char sense,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* sdim,
+                                lapack_complex_float* w,
+                                lapack_complex_float* vs, lapack_int ldvs,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgeesx_work( int matrix_order, char jobvs, char sort,
+                                LAPACK_Z_SELECT1 select, char sense,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* sdim,
+                                lapack_complex_double* w,
+                                lapack_complex_double* vs, lapack_int ldvs,
+                                double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_logical* bwork );
+/* ?geev_work (s/d/c/z): s/d take separate wr/wi arrays; c/z take one complex w plus rwork. Presumably LAPACK ?GEEV; confirm. */
+lapack_int LAPACKE_sgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, float* a, lapack_int lda,
+                               float* wr, float* wi, float* vl, lapack_int ldvl,
+                               float* vr, lapack_int ldvr, float* work,
+                               lapack_int lwork );
+lapack_int LAPACKE_dgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* wr, double* wi, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* w,
+                               lapack_complex_float* vl, lapack_int ldvl,
+                               lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zgeev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* w,
+                               lapack_complex_double* vl, lapack_int ldvl,
+                               lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+/* ?geevx_work (s/d/c/z): takes balanc, sense, ilo/ihi, scale, abnrm, rconde, rcondv; s/d add iwork, c/z rwork. Presumably LAPACK ?GEEVX; confirm. */
+lapack_int LAPACKE_sgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, float* a,
+                                lapack_int lda, float* wr, float* wi, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* scale,
+                                float* abnrm, float* rconde, float* rcondv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, double* a,
+                                lapack_int lda, double* wr, double* wi,
+                                double* vl, lapack_int ldvl, double* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, double* scale, double* abnrm,
+                                double* rconde, double* rcondv, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* w,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* scale,
+                                float* abnrm, float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zgeevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* w,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, double* scale,
+                                double* abnrm, double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+/* ?gehrd_work (s/d/c/z): ilo/ihi by value; a and tau non-const; takes work plus lwork. Presumably LAPACK ?GEHRD; confirm. */
+lapack_int LAPACKE_sgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* a, lapack_int lda,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* a, lapack_int lda,
+                                double* tau, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgehrd_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?gejsv_work: only the s and d variants are declared in this group. Presumably LAPACK ?GEJSV (an SVD driver); confirm. */
+lapack_int LAPACKE_sgejsv_work( int matrix_order, char joba, char jobu,
+                                char jobv, char jobr, char jobt, char jobp,
+                                lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* sva, float* u,
+                                lapack_int ldu, float* v, lapack_int ldv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgejsv_work( int matrix_order, char joba, char jobu,
+                                char jobv, char jobr, char jobt, char jobp,
+                                lapack_int m, lapack_int n, double* a,
+                                lapack_int lda, double* sva, double* u,
+                                lapack_int ldu, double* v, lapack_int ldv,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+/* ?gelq2_work (s/d/c/z): work is passed without an lwork length. Presumably LAPACK ?GELQ2 (LQ); confirm. */
+lapack_int LAPACKE_sgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work );
+lapack_int LAPACKE_dgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work );
+lapack_int LAPACKE_cgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zgelq2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+/* ?gelqf_work (s/d/c/z): a and tau non-const; takes work plus lwork. Presumably LAPACK ?GELQF (LQ); confirm. */
+lapack_int LAPACKE_sgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgelqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?gels_work (s/d/c/z): a and b are non-const; takes work plus lwork. Presumably LAPACK ?GELS (least squares); confirm. */
+lapack_int LAPACKE_sgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs, float* a,
+                               lapack_int lda, float* b, lapack_int ldb,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs, double* a,
+                               lapack_int lda, double* b, lapack_int ldb,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgels_work( int matrix_order, char trans, lapack_int m,
+                               lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+/* ?gelsd_work (s/d/c/z): s is real, rcond by value, rank a pointer; all take iwork, c/z add rwork. Presumably LAPACK ?GELSD; confirm. */
+lapack_int LAPACKE_sgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* s,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_zgelsd_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double* s, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork );
+/* ?gelss_work (s/d/c/z): s is real, rcond by value, rank a pointer; no iwork, c/z add rwork. Presumably LAPACK ?GELSS; confirm. */
+lapack_int LAPACKE_sgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* s,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float* s, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgelss_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double* s, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+/* ?gelsy_work (s/d/c/z): jpvt is non-const, rcond by value, rank a pointer; c/z add rwork. Presumably LAPACK ?GELSY; confirm. */
+lapack_int LAPACKE_sgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, lapack_int* jpvt,
+                                float rcond, lapack_int* rank, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, lapack_int* jpvt,
+                                double rcond, lapack_int* rank, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_cgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_int* jpvt, float rcond,
+                                lapack_int* rank, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgelsy_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_int* jpvt, double rcond,
+                                lapack_int* rank, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+/* ?geqlf_work (s/d/c/z): a and tau non-const; takes work plus lwork. Presumably LAPACK ?GEQLF (QL); confirm. */
+lapack_int LAPACKE_sgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqlf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?geqp3_work (s/d/c/z): jpvt is non-const; takes work plus lwork; c/z add rwork. Presumably LAPACK ?GEQP3; confirm. */
+lapack_int LAPACKE_sgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* jpvt,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* jpvt,
+                                double* tau, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork );
+lapack_int LAPACKE_zgeqp3_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+/* ?geqpf_work (s/d/c/z): jpvt is non-const; work has no lwork length; c/z add rwork. Presumably LAPACK ?GEQPF; confirm. */
+lapack_int LAPACKE_sgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* jpvt,
+                                float* tau, float* work );
+lapack_int LAPACKE_dgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* jpvt,
+                                double* tau, double* work );
+lapack_int LAPACKE_cgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_float* tau,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgeqpf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* jpvt, lapack_complex_double* tau,
+                                lapack_complex_double* work, double* rwork );
+/* ?geqr2_work (s/d/c/z): work is passed without an lwork length. Presumably LAPACK ?GEQR2 (QR); confirm. */
+lapack_int LAPACKE_sgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work );
+lapack_int LAPACKE_dgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work );
+lapack_int LAPACKE_cgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zgeqr2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+/* ?geqrf_work (s/d/c/z): a and tau non-const; takes work plus lwork. Presumably LAPACK ?GEQRF (QR); confirm. */
+lapack_int LAPACKE_sgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+/* geqrfp _work variants (s/d/c/z); presumably QR, R diag >= 0 -- confirm. */
+lapack_int LAPACKE_sgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* tau,
+                                 float* work, lapack_int lwork );
+lapack_int LAPACKE_dgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* tau,
+                                 double* work, lapack_int lwork );
+lapack_int LAPACKE_cgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* tau,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgeqrfp_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* tau,
+                                 lapack_complex_double* work,
+                                 lapack_int lwork );
+/* gerfs _work (s/d/c/z): a, af, ipiv, b are const; x, ferr, berr are not. */
+lapack_int LAPACKE_sgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgerfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* gerfsx _work (s/d/c/z): also takes equed, r, c, rcond, err_bnds_*, params. */
+lapack_int LAPACKE_sgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* r, const float* c, const float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* r, const float* c,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zgerfsx_work( int matrix_order, char trans, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* r, const double* c,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+/* gerqf _work variants (s/d/c/z); presumably RQ factorization -- confirm. */
+lapack_int LAPACKE_sgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgerqf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+/* gesdd _work (s/d/c/z): s is real for c/z, which also take rwork; all: iwork */
+lapack_int LAPACKE_sgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* s, float* u, lapack_int ldu, float* vt,
+                                lapack_int ldvt, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* s, double* u, lapack_int ldu,
+                                double* vt, lapack_int ldvt, double* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_cgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, float* s,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* vt, lapack_int ldvt,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork );
+lapack_int LAPACKE_zgesdd_work( int matrix_order, char jobz, lapack_int m,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, double* s,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* vt, lapack_int ldvt,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork );
+/* gesv _work (s/d/c/z), then dsgesv/zcgesv which add x, swork (single), iter. */
+lapack_int LAPACKE_sgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* a, lapack_int lda, lapack_int* ipiv,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* a, lapack_int lda, lapack_int* ipiv,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dsgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                double* a, lapack_int lda, lapack_int* ipiv,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* work, float* swork,
+                                lapack_int* iter );
+lapack_int LAPACKE_zcgesv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                lapack_complex_float* swork, double* rwork,
+                                lapack_int* iter );
+/* gesvd _work (s/d/c/z): jobu/jobvt select u/vt (presumably); c/z add rwork. */
+lapack_int LAPACKE_sgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* s, float* u,
+                                lapack_int ldu, float* vt, lapack_int ldvt,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n, double* a,
+                                lapack_int lda, double* s, double* u,
+                                lapack_int ldu, double* vt, lapack_int ldvt,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* s, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* vt,
+                                lapack_int ldvt, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zgesvd_work( int matrix_order, char jobu, char jobvt,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* s, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* vt,
+                                lapack_int ldvt, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork );
+/* gesvj _work: only the s and d variants are declared in this group. */
+lapack_int LAPACKE_sgesvj_work( int matrix_order, char joba, char jobu,
+                                char jobv, lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* sva, lapack_int mv,
+                                float* v, lapack_int ldv, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dgesvj_work( int matrix_order, char joba, char jobu,
+                                char jobv, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* sva,
+                                lapack_int mv, double* v, lapack_int ldv,
+                                double* work, lapack_int lwork );
+/* gesvx _work (s/d/c/z): equed is char* (in/out, presumably); c/z use rwork. */
+lapack_int LAPACKE_sgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* r,
+                                float* c, float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* r,
+                                double* c, double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* r,
+                                float* c, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zgesvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* r,
+                                double* c, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+/* gesvxx _work (s/d/c/z): gesvx-style args plus rpvgrw, err_bnds_*, params. */
+lapack_int LAPACKE_sgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* r,
+                                 float* c, float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, double* b, lapack_int ldb,
+                                 double* x, lapack_int ldx, double* rcond,
+                                 double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* r,
+                                 float* c, lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params,
+                                 lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgesvxx_work( int matrix_order, char fact, char trans,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* r,
+                                 double* c, lapack_complex_double* b,
+                                 lapack_int ldb, lapack_complex_double* x,
+                                 lapack_int ldx, double* rcond, double* rpvgrw,
+                                 double* berr, lapack_int n_err_bnds,
+                                 double* err_bnds_norm, double* err_bnds_comp,
+                                 lapack_int nparams, double* params,
+                                 lapack_complex_double* work, double* rwork );
+/* getf2 _work (s/d/c/z): no work buffer; presumably unblocked LU -- confirm. */
+lapack_int LAPACKE_sgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgetf2_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv );
+/* getrf _work (s/d/c/z): same parameter list as getf2; presumably LU -- confirm */
+lapack_int LAPACKE_sgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_dgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv );
+lapack_int LAPACKE_cgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv );
+lapack_int LAPACKE_zgetrf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv );
+/* getri _work (s/d/c/z): ipiv is const; a is mutable; takes work and lwork. */
+lapack_int LAPACKE_sgetri_work( int matrix_order, lapack_int n, float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dgetri_work( int matrix_order, lapack_int n, double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_cgetri_work( int matrix_order, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgetri_work( int matrix_order, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work, lapack_int lwork );
+/* getrs _work (s/d/c/z): a and ipiv are const; b is the only mutable array. */
+lapack_int LAPACKE_sgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgetrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+/* ggbak _work (s/d/c/z): lscale/rscale stay real (float/double) for c/z. */
+lapack_int LAPACKE_sggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* lscale, const float* rscale,
+                                lapack_int m, float* v, lapack_int ldv );
+lapack_int LAPACKE_dggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* lscale, const double* rscale,
+                                lapack_int m, double* v, lapack_int ldv );
+lapack_int LAPACKE_cggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const float* lscale, const float* rscale,
+                                lapack_int m, lapack_complex_float* v,
+                                lapack_int ldv );
+lapack_int LAPACKE_zggbak_work( int matrix_order, char job, char side,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                const double* lscale, const double* rscale,
+                                lapack_int m, lapack_complex_double* v,
+                                lapack_int ldv );
+/* ggbal _work (s/d/c/z): lscale, rscale and work stay real-typed for c/z. */
+lapack_int LAPACKE_sggbal_work( int matrix_order, char job, lapack_int n,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, lapack_int* ilo,
+                                lapack_int* ihi, float* lscale, float* rscale,
+                                float* work );
+lapack_int LAPACKE_dggbal_work( int matrix_order, char job, lapack_int n,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, lapack_int* ilo,
+                                lapack_int* ihi, double* lscale, double* rscale,
+                                double* work );
+lapack_int LAPACKE_cggbal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_int* ilo, lapack_int* ihi, float* lscale,
+                                float* rscale, float* work );
+lapack_int LAPACKE_zggbal_work( int matrix_order, char job, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* lscale, double* rscale, double* work );
+/* gges _work: s/d use SELECT3 + alphar/alphai; c/z use SELECT2, alpha, rwork. */
+lapack_int LAPACKE_sgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_S_SELECT3 selctg, lapack_int n,
+                               float* a, lapack_int lda, float* b,
+                               lapack_int ldb, lapack_int* sdim, float* alphar,
+                               float* alphai, float* beta, float* vsl,
+                               lapack_int ldvsl, float* vsr, lapack_int ldvsr,
+                               float* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_dgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_D_SELECT3 selctg, lapack_int n,
+                               double* a, lapack_int lda, double* b,
+                               lapack_int ldb, lapack_int* sdim, double* alphar,
+                               double* alphai, double* beta, double* vsl,
+                               lapack_int ldvsl, double* vsr, lapack_int ldvsr,
+                               double* work, lapack_int lwork,
+                               lapack_logical* bwork );
+lapack_int LAPACKE_cgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_C_SELECT2 selctg, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_int* sdim, lapack_complex_float* alpha,
+                               lapack_complex_float* beta,
+                               lapack_complex_float* vsl, lapack_int ldvsl,
+                               lapack_complex_float* vsr, lapack_int ldvsr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_logical* bwork );
+lapack_int LAPACKE_zgges_work( int matrix_order, char jobvsl, char jobvsr,
+                               char sort, LAPACK_Z_SELECT2 selctg, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_int* sdim, lapack_complex_double* alpha,
+                               lapack_complex_double* beta,
+                               lapack_complex_double* vsl, lapack_int ldvsl,
+                               lapack_complex_double* vsr, lapack_int ldvsr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_logical* bwork );
+/* ggesx _work: gges-style args plus sense, rconde, rcondv, iwork, liwork. */
+lapack_int LAPACKE_sggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_S_SELECT3 selctg, char sense,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, lapack_int* sdim,
+                                float* alphar, float* alphai, float* beta,
+                                float* vsl, lapack_int ldvsl, float* vsr,
+                                lapack_int ldvsr, float* rconde, float* rcondv,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_dggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_D_SELECT3 selctg, char sense,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, lapack_int* sdim,
+                                double* alphar, double* alphai, double* beta,
+                                double* vsl, lapack_int ldvsl, double* vsr,
+                                lapack_int ldvsr, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_cggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_C_SELECT2 selctg, char sense,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_int* sdim,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* vsl, lapack_int ldvsl,
+                                lapack_complex_float* vsr, lapack_int ldvsr,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork,
+                                lapack_int liwork, lapack_logical* bwork );
+lapack_int LAPACKE_zggesx_work( int matrix_order, char jobvsl, char jobvsr,
+                                char sort, LAPACK_Z_SELECT2 selctg, char sense,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_int* sdim,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* vsl, lapack_int ldvsl,
+                                lapack_complex_double* vsr, lapack_int ldvsr,
+                                double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork,
+                                lapack_int liwork, lapack_logical* bwork );
+/* ?ggev_work: s/d/c/z use float/double/lapack_complex_float/lapack_complex_double data; s/d take alphar+alphai, c/z take one complex alpha plus rwork. Presumably LAPACK ?GGEV -- confirm. */
+lapack_int LAPACKE_sggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, float* a, lapack_int lda, float* b,
+                               lapack_int ldb, float* alphar, float* alphai,
+                               float* beta, float* vl, lapack_int ldvl,
+                               float* vr, lapack_int ldvr, float* work,
+                               lapack_int lwork );
+lapack_int LAPACKE_dggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* alphar,
+                               double* alphai, double* beta, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_cggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb, lapack_complex_float* alpha,
+                               lapack_complex_float* beta,
+                               lapack_complex_float* vl, lapack_int ldvl,
+                               lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork );
+lapack_int LAPACKE_zggev_work( int matrix_order, char jobvl, char jobvr,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb, lapack_complex_double* alpha,
+                               lapack_complex_double* beta,
+                               lapack_complex_double* vl, lapack_int ldvl,
+                               lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork );
+/* ?ggevx_work: takes balanc/sense selectors, ilo/ihi, lscale/rscale, abnrm/bbnrm, rconde/rcondv, plus work/lwork, iwork, bwork (c/z add rwork). Presumably LAPACK ?GGEVX -- confirm. */
+lapack_int LAPACKE_sggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* alphar, float* alphai, float* beta,
+                                float* vl, lapack_int ldvl, float* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, float* lscale, float* rscale,
+                                float* abnrm, float* bbnrm, float* rconde,
+                                float* rcondv, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_logical* bwork );
+lapack_int LAPACKE_dggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* alphar, double* alphai, double* beta,
+                                double* vl, lapack_int ldvl, double* vr,
+                                lapack_int ldvr, lapack_int* ilo,
+                                lapack_int* ihi, double* lscale, double* rscale,
+                                double* abnrm, double* bbnrm, double* rconde,
+                                double* rcondv, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_logical* bwork );
+lapack_int LAPACKE_cggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi, float* lscale,
+                                float* rscale, float* abnrm, float* bbnrm,
+                                float* rconde, float* rcondv,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int* iwork,
+                                lapack_logical* bwork );
+lapack_int LAPACKE_zggevx_work( int matrix_order, char balanc, char jobvl,
+                                char jobvr, char sense, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int* ilo, lapack_int* ihi,
+                                double* lscale, double* rscale, double* abnrm,
+                                double* bbnrm, double* rconde, double* rcondv,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int* iwork,
+                                lapack_logical* bwork );
+/* ?ggglm_work: dimensions are passed in the order n, m, p; takes a, b, d, x, y and caller-provided work/lwork. Presumably LAPACK ?GGGLM -- confirm. */
+lapack_int LAPACKE_sggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* d, float* x,
+                                float* y, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* d, double* x,
+                                double* y, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* d,
+                                lapack_complex_float* x,
+                                lapack_complex_float* y,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggglm_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* d,
+                                lapack_complex_double* x,
+                                lapack_complex_double* y,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?gghrd_work: takes compq/compz selectors, n, ilo, ihi and the a, b, q, z pointers with their leading dimensions; no workspace arguments. Presumably LAPACK ?GGHRD -- confirm. */
+lapack_int LAPACKE_sgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* q, lapack_int ldq,
+                                float* z, lapack_int ldz );
+lapack_int LAPACKE_dgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* q, lapack_int ldq,
+                                double* z, lapack_int ldz );
+lapack_int LAPACKE_cgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz );
+lapack_int LAPACKE_zgghrd_work( int matrix_order, char compq, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz );
+/* ?gglse_work: dimensions are passed in the order m, n, p; takes a, b, c, d, x and caller-provided work/lwork. Presumably LAPACK ?GGLSE -- confirm. */
+lapack_int LAPACKE_sgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float* c, float* d,
+                                float* x, float* work, lapack_int lwork );
+lapack_int LAPACKE_dgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* c, double* d,
+                                double* x, double* work, lapack_int lwork );
+lapack_int LAPACKE_cgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* c,
+                                lapack_complex_float* d,
+                                lapack_complex_float* x,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgglse_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* c,
+                                lapack_complex_double* d,
+                                lapack_complex_double* x,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?ggqrf_work: dimensions are passed in the order n, m, p; takes a/taua and b/taub plus caller-provided work/lwork. Presumably LAPACK ?GGQRF -- confirm. */
+lapack_int LAPACKE_sggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, float* a, lapack_int lda,
+                                float* taua, float* b, lapack_int ldb,
+                                float* taub, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, double* a, lapack_int lda,
+                                double* taua, double* b, lapack_int ldb,
+                                double* taub, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* taua,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* taub,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggqrf_work( int matrix_order, lapack_int n, lapack_int m,
+                                lapack_int p, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* taua,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* taub,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?ggrqf_work: dimensions are passed in the order m, p, n (unlike ?ggqrf_work, which uses n, m, p); takes a/taua, b/taub and work/lwork. Presumably LAPACK ?GGRQF -- confirm. */
+lapack_int LAPACKE_sggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* taua, float* b, lapack_int ldb,
+                                float* taub, float* work, lapack_int lwork );
+lapack_int LAPACKE_dggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* taua, double* b, lapack_int ldb,
+                                double* taub, double* work, lapack_int lwork );
+lapack_int LAPACKE_cggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* taua,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* taub,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zggrqf_work( int matrix_order, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* taua,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* taub,
+                                lapack_complex_double* work, lapack_int lwork );
+/* ?ggsvd_work: k and l are pointer arguments; alpha/beta are float* or double* even in the c/z variants; work has no lwork companion; c/z add rwork. Presumably LAPACK ?GGSVD -- confirm. */
+lapack_int LAPACKE_sggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* alpha, float* beta,
+                                float* u, lapack_int ldu, float* v,
+                                lapack_int ldv, float* q, lapack_int ldq,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* alpha, double* beta,
+                                double* u, lapack_int ldu, double* v,
+                                lapack_int ldv, double* q, lapack_int ldq,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float* alpha, float* beta,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* v, lapack_int ldv,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_zggsvd_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int n,
+                                lapack_int p, lapack_int* k, lapack_int* l,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double* alpha, double* beta,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* v, lapack_int ldv,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork );
+/* ?ggsvp_work: dimensions are passed as m, p, n; tola/tolb are passed by value; k and l are pointers; takes iwork, tau, work (c/z add rwork). Presumably LAPACK ?GGSVP -- confirm. */
+lapack_int LAPACKE_sggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, float tola,
+                                float tolb, lapack_int* k, lapack_int* l,
+                                float* u, lapack_int ldu, float* v,
+                                lapack_int ldv, float* q, lapack_int ldq,
+                                lapack_int* iwork, float* tau, float* work );
+lapack_int LAPACKE_dggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double tola,
+                                double tolb, lapack_int* k, lapack_int* l,
+                                double* u, lapack_int ldu, double* v,
+                                lapack_int ldv, double* q, lapack_int ldq,
+                                lapack_int* iwork, double* tau, double* work );
+lapack_int LAPACKE_cggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb, float tola, float tolb,
+                                lapack_int* k, lapack_int* l,
+                                lapack_complex_float* u, lapack_int ldu,
+                                lapack_complex_float* v, lapack_int ldv,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_int* iwork, float* rwork,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zggsvp_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, double tola, double tolb,
+                                lapack_int* k, lapack_int* l,
+                                lapack_complex_double* u, lapack_int ldu,
+                                lapack_complex_double* v, lapack_int ldv,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_int* iwork, double* rwork,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work );
+/* ?gtcon_work: no matrix_order argument; dl, d, du, du2, ipiv are const; rcond is a non-const pointer; s/d take work+iwork, c/z take work only. Presumably LAPACK ?GTCON -- confirm. */
+lapack_int LAPACKE_sgtcon_work( char norm, lapack_int n, const float* dl,
+                                const float* d, const float* du,
+                                const float* du2, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgtcon_work( char norm, lapack_int n, const double* dl,
+                                const double* d, const double* du,
+                                const double* du2, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgtcon_work( char norm, lapack_int n,
+                                const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zgtcon_work( char norm, lapack_int n,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+/* ?gtrfs_work: dl, d, du, dlf, df, duf, du2, ipiv and b are const; x, ferr, berr are non-const; s/d take work+iwork, c/z take work+rwork. Presumably LAPACK ?GTRFS -- confirm. */
+lapack_int LAPACKE_sgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* dl,
+                                const float* d, const float* du,
+                                const float* dlf, const float* df,
+                                const float* duf, const float* du2,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* dl,
+                                const double* d, const double* du,
+                                const double* dlf, const double* df,
+                                const double* duf, const double* du2,
+                                const lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* dlf,
+                                const lapack_complex_float* df,
+                                const lapack_complex_float* duf,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgtrfs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* dlf,
+                                const lapack_complex_double* df,
+                                const lapack_complex_double* duf,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?gtsv_work: takes n, nrhs and non-const dl, d, du, b pointers plus ldb; no workspace arguments. Presumably LAPACK ?GTSV -- confirm. */
+lapack_int LAPACKE_sgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* dl, float* d, float* du, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* dl, double* d, double* du, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* dl,
+                               lapack_complex_float* d,
+                               lapack_complex_float* du,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zgtsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* dl,
+                               lapack_complex_double* d,
+                               lapack_complex_double* du,
+                               lapack_complex_double* b, lapack_int ldb );
+/* ?gtsvx_work: dl, d, du and b are const; dlf, df, duf, du2, ipiv are non-const; rcond, ferr, berr are pointers; s/d take work+iwork, c/z take work+rwork. Presumably LAPACK ?GTSVX -- confirm. */
+lapack_int LAPACKE_sgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, const float* dl,
+                                const float* d, const float* du, float* dlf,
+                                float* df, float* duf, float* du2,
+                                lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs, const double* dl,
+                                const double* d, const double* du, double* dlf,
+                                double* df, double* duf, double* du2,
+                                lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                lapack_complex_float* dlf,
+                                lapack_complex_float* df,
+                                lapack_complex_float* duf,
+                                lapack_complex_float* du2, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zgtsvx_work( int matrix_order, char fact, char trans,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                lapack_complex_double* dlf,
+                                lapack_complex_double* df,
+                                lapack_complex_double* duf,
+                                lapack_complex_double* du2, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?gttrf_work: no matrix_order argument and no workspace; takes n and non-const dl, d, du, du2, ipiv pointers. Presumably LAPACK ?GTTRF -- confirm. */
+lapack_int LAPACKE_sgttrf_work( lapack_int n, float* dl, float* d, float* du,
+                                float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_dgttrf_work( lapack_int n, double* dl, double* d, double* du,
+                                double* du2, lapack_int* ipiv );
+lapack_int LAPACKE_cgttrf_work( lapack_int n, lapack_complex_float* dl,
+                                lapack_complex_float* d,
+                                lapack_complex_float* du,
+                                lapack_complex_float* du2, lapack_int* ipiv );
+lapack_int LAPACKE_zgttrf_work( lapack_int n, lapack_complex_double* dl,
+                                lapack_complex_double* d,
+                                lapack_complex_double* du,
+                                lapack_complex_double* du2, lapack_int* ipiv );
+/* ?gttrs_work: dl, d, du, du2 and ipiv are const; b is the only non-const pointer; no workspace arguments. Presumably LAPACK ?GTTRS -- confirm. */
+lapack_int LAPACKE_sgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const float* dl,
+                                const float* d, const float* du,
+                                const float* du2, const lapack_int* ipiv,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const double* dl,
+                                const double* d, const double* du,
+                                const double* du2, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* dl,
+                                const lapack_complex_float* d,
+                                const lapack_complex_float* du,
+                                const lapack_complex_float* du2,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zgttrs_work( int matrix_order, char trans, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* dl,
+                                const lapack_complex_double* d,
+                                const lapack_complex_double* du,
+                                const lapack_complex_double* du2,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+/* ?hbev_work (only c and z are declared here): ab, z, work are complex while w and rwork are float* or double*; work has no lwork companion. Presumably LAPACK ?HBEV -- confirm. */
+lapack_int LAPACKE_chbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work,
+                               float* rwork );
+lapack_int LAPACKE_zhbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork );
+/* ?hbevd_work (only c and z are declared here): three workspaces, each followed by a lapack_int argument: work/lwork, rwork/lrwork, iwork/liwork. Presumably LAPACK ?HBEVD -- confirm. */
+lapack_int LAPACKE_chbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zhbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_chbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* q, lapack_int ldq,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                float* rwork, lapack_int* iwork,
+                                lapack_int* ifail ); /* c (complex float): presumably LAPACK CHBEVX, Hermitian band eigenproblem for a selected range; TODO confirm */
+lapack_int LAPACKE_zhbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* q, lapack_int ldq,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                double* rwork, lapack_int* iwork,
+                                lapack_int* ifail ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* bb, lapack_int ldbb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                lapack_complex_float* work, float* rwork ); /* c (complex float): presumably LAPACK CHBGST, banded generalized Hermitian eigenproblem reduced to standard form; TODO confirm */
+lapack_int LAPACKE_zhbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                const lapack_complex_double* bb,
+                                lapack_int ldbb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_complex_float* bb, lapack_int ldbb,
+                               float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work,
+                               float* rwork ); /* c (complex float): presumably LAPACK CHBGV, banded generalized Hermitian eigenproblem; TODO confirm */
+lapack_int LAPACKE_zhbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_complex_double* bb, lapack_int ldbb,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* bb, lapack_int ldbb,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork ); /* c (complex float): presumably LAPACK CHBGVD, banded generalized Hermitian eigenproblem (divide-and-conquer); TODO confirm */
+lapack_int LAPACKE_zhbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* bb, lapack_int ldbb,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* bb,
+                                lapack_int ldbb, lapack_complex_float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* c (complex float): presumably LAPACK CHBGVX, banded generalized Hermitian eigenproblem for a selected range; TODO confirm */
+lapack_int LAPACKE_zhbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* bb,
+                                lapack_int ldbb, lapack_complex_double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                float* d, float* e, lapack_complex_float* q,
+                                lapack_int ldq, lapack_complex_float* work ); /* c (complex float): presumably LAPACK CHBTRD, Hermitian band to tridiagonal reduction; TODO confirm */
+lapack_int LAPACKE_zhbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                double* d, double* e, lapack_complex_double* q,
+                                lapack_int ldq, lapack_complex_double* work ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_checon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work ); /* c (complex float): presumably LAPACK CHECON, condition estimate for a factored Hermitian matrix; TODO confirm */
+lapack_int LAPACKE_zhecon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_cheequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax,
+                                 lapack_complex_float* work ); /* c (complex float): presumably LAPACK CHEEQUB, equilibration scalings for a Hermitian matrix; TODO confirm */
+lapack_int LAPACKE_zheequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax,
+                                 lapack_complex_double* work ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_cheev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, float* w,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork ); /* c (complex float): presumably LAPACK CHEEV, Hermitian eigenproblem; TODO confirm */
+lapack_int LAPACKE_zheev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, double* w,
+                               lapack_complex_double* work, lapack_int lwork,
+                               double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_cheevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, float* w,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork ); /* c (complex float): presumably LAPACK CHEEVD, Hermitian eigenproblem (divide-and-conquer); TODO confirm */
+lapack_int LAPACKE_zheevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, double* w,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_cheevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork ); /* c (complex float): presumably LAPACK CHEEVR, Hermitian eigenproblem for a selected range (RRR-based); TODO confirm */
+lapack_int LAPACKE_zheevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_cheevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* c (complex float): presumably LAPACK CHEEVX, Hermitian eigenproblem for a selected range; TODO confirm */
+lapack_int LAPACKE_zheevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chegst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* b,
+                                lapack_int ldb ); /* c (complex float): presumably LAPACK CHEGST, generalized Hermitian eigenproblem reduced to standard form; TODO confirm */
+lapack_int LAPACKE_zhegst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* b,
+                                lapack_int ldb ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chegv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb, float* w,
+                               lapack_complex_float* work, lapack_int lwork,
+                               float* rwork ); /* c (complex float): presumably LAPACK CHEGV, generalized Hermitian eigenproblem; TODO confirm */
+lapack_int LAPACKE_zhegv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb,
+                               double* w, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chegvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float* w, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork ); /* c (complex float): presumably LAPACK CHEGVD, generalized Hermitian eigenproblem (divide-and-conquer); TODO confirm */
+lapack_int LAPACKE_zhegvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double* w, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chegvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* c (complex float): presumably LAPACK CHEGVX, generalized Hermitian eigenproblem for a selected range; TODO confirm */
+lapack_int LAPACKE_zhegvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_cherfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork ); /* c (complex float): presumably LAPACK CHERFS, refinement and error bounds for a Hermitian solve; TODO confirm */
+lapack_int LAPACKE_zherfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_cherfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork ); /* c (complex float): presumably LAPACK CHERFSX, extended refinement and error bounds for a Hermitian solve; TODO confirm */
+lapack_int LAPACKE_zherfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chesv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork ); /* c (complex float): presumably LAPACK CHESV, Hermitian linear solve; TODO confirm */
+lapack_int LAPACKE_zhesv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chesvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork ); /* c (complex float): presumably LAPACK CHESVX, expert Hermitian linear solve; TODO confirm */
+lapack_int LAPACKE_zhesvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chesvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork ); /* c (complex float): presumably LAPACK CHESVXX, expert Hermitian linear solve with extended error bounds; TODO confirm */
+lapack_int LAPACKE_zhesvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chetrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* d, float* e, lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork ); /* c (complex float): presumably LAPACK CHETRD, Hermitian to tridiagonal reduction; TODO confirm */
+lapack_int LAPACKE_zhetrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* d, double* e,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chetrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_float* work,
+                                lapack_int lwork ); /* c (complex float): presumably LAPACK CHETRF, pivoted factorization of a Hermitian matrix; TODO confirm */
+lapack_int LAPACKE_zhetrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* work,
+                                lapack_int lwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chetri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work ); /* c (complex float): presumably LAPACK CHETRI, inverse from an existing Hermitian factorization; TODO confirm */
+lapack_int LAPACKE_zhetri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chetrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb ); /* c (complex float): presumably LAPACK CHETRS, solve from an existing Hermitian factorization; TODO confirm */
+lapack_int LAPACKE_zhetrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               float alpha, const lapack_complex_float* a,
+                               lapack_int lda, float beta,
+                               lapack_complex_float* c ); /* c (complex float, real alpha/beta): presumably LAPACK CHFRK, Hermitian rank-k update in RFP storage; TODO confirm */
+lapack_int LAPACKE_zhfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               double alpha, const lapack_complex_double* a,
+                               lapack_int lda, double beta,
+                               lapack_complex_double* c ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_shgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* h, lapack_int ldh,
+                                float* t, lapack_int ldt, float* alphar,
+                                float* alphai, float* beta, float* q,
+                                lapack_int ldq, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork ); /* s (real float): presumably LAPACK SHGEQZ, QZ iteration on a Hessenberg-triangular pair; TODO confirm */
+lapack_int LAPACKE_dhgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* h, lapack_int ldh,
+                                double* t, lapack_int ldt, double* alphar,
+                                double* alphai, double* beta, double* q,
+                                lapack_int ldq, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork ); /* d (real double): same parameter list as the s variant */
+lapack_int LAPACKE_chgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* h,
+                                lapack_int ldh, lapack_complex_float* t,
+                                lapack_int ldt, lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork ); /* c (complex float): one complex alpha replaces alphar/alphai, and a real rwork array is added */
+lapack_int LAPACKE_zhgeqz_work( int matrix_order, char job, char compq,
+                                char compz, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* h,
+                                lapack_int ldh, lapack_complex_double* t,
+                                lapack_int ldt, lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chpcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work ); /* c (complex float): presumably LAPACK CHPCON, condition estimate for a factored packed Hermitian matrix; TODO confirm */
+lapack_int LAPACKE_zhpcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chpev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_float* ap, float* w,
+                               lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, float* rwork ); /* c (complex float): presumably LAPACK CHPEV, packed Hermitian eigenproblem; TODO confirm */
+lapack_int LAPACKE_zhpev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_complex_double* ap,
+                               double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work,
+                               double* rwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chpevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_float* ap,
+                                float* w, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork ); /* c (complex float): presumably LAPACK CHPEVD, packed Hermitian eigenproblem (divide-and-conquer); TODO confirm */
+lapack_int LAPACKE_zhpevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_complex_double* ap,
+                                double* w, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chpevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* ap, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* c (complex float): presumably LAPACK CHPEVX, packed Hermitian eigenproblem for a selected range; TODO confirm */
+lapack_int LAPACKE_zhpevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* ap, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail ); /* z (complex double): same parameter list as the c variant */
+
+lapack_int LAPACKE_chpgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_float* ap,
+                                const lapack_complex_float* bp );
+lapack_int LAPACKE_zhpgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, lapack_complex_double* ap,
+                                const lapack_complex_double* bp );
+
+lapack_int LAPACKE_chpgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_float* ap,
+                               lapack_complex_float* bp, float* w,
+                               lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n,
+                               lapack_complex_double* ap,
+                               lapack_complex_double* bp, double* w,
+                               lapack_complex_double* z, lapack_int ldz,
+                               lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chpgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* bp, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_zhpgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* bp, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_chpgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* bp, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_zhpgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* bp, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* iwork, lapack_int* ifail );
+
+lapack_int LAPACKE_chprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chpsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zhpsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+
+lapack_int LAPACKE_chpsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* afp, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zhpsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* afp, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+
+lapack_int LAPACKE_chptrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, float* d, float* e,
+                                lapack_complex_float* tau );
+lapack_int LAPACKE_zhptrd_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, double* d, double* e,
+                                lapack_complex_double* tau );
+
+lapack_int LAPACKE_chptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zhptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, lapack_int* ipiv );
+
+lapack_int LAPACKE_chptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zhptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_chptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zhptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_shsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, lapack_logical* select,
+                                lapack_int n, const float* h, lapack_int ldh,
+                                float* wr, const float* wi, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_dhsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, lapack_logical* select,
+                                lapack_int n, const double* h, lapack_int ldh,
+                                double* wr, const double* wi, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_chsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, const lapack_logical* select,
+                                lapack_int n, const lapack_complex_float* h,
+                                lapack_int ldh, lapack_complex_float* w,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork,
+                                lapack_int* ifaill, lapack_int* ifailr );
+lapack_int LAPACKE_zhsein_work( int matrix_order, char job, char eigsrc,
+                                char initv, const lapack_logical* select,
+                                lapack_int n, const lapack_complex_double* h,
+                                lapack_int ldh, lapack_complex_double* w,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork,
+                                lapack_int* ifaill, lapack_int* ifailr );
+
+lapack_int LAPACKE_shseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                float* h, lapack_int ldh, float* wr, float* wi,
+                                float* z, lapack_int ldz, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dhseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                double* h, lapack_int ldh, double* wr,
+                                double* wi, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_chseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_float* h, lapack_int ldh,
+                                lapack_complex_float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zhseqr_work( int matrix_order, char job, char compz,
+                                lapack_int n, lapack_int ilo, lapack_int ihi,
+                                lapack_complex_double* h, lapack_int ldh,
+                                lapack_complex_double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_clacgv_work( lapack_int n, lapack_complex_float* x,
+                                lapack_int incx );
+lapack_int LAPACKE_zlacgv_work( lapack_int n, lapack_complex_double* x,
+                                lapack_int incx );
+
+lapack_int LAPACKE_slacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dlacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_clacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zlacpy_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb );
+
+lapack_int LAPACKE_zlag2c_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_float* sa, lapack_int ldsa );
+
+lapack_int LAPACKE_slag2d_work( int matrix_order, lapack_int m, lapack_int n,
+                                const float* sa, lapack_int ldsa, double* a,
+                                lapack_int lda );
+
+lapack_int LAPACKE_dlag2s_work( int matrix_order, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda, float* sa,
+                                lapack_int ldsa );
+
+lapack_int LAPACKE_clag2z_work( int matrix_order, lapack_int m, lapack_int n,
+                                const lapack_complex_float* sa, lapack_int ldsa,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* d,
+                                float* a, lapack_int lda, lapack_int* iseed,
+                                float* work );
+lapack_int LAPACKE_dlagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* d,
+                                double* a, lapack_int lda, lapack_int* iseed,
+                                double* work );
+lapack_int LAPACKE_clagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const float* d,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* iseed, lapack_complex_float* work );
+lapack_int LAPACKE_zlagge_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int kl, lapack_int ku, const double* d,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* iseed,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_claghe_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlaghe_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, float* a, lapack_int lda,
+                                lapack_int* iseed, float* work );
+lapack_int LAPACKE_dlagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, double* a, lapack_int lda,
+                                lapack_int* iseed, double* work );
+lapack_int LAPACKE_clagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const float* d, lapack_complex_float* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlagsy_work( int matrix_order, lapack_int n, lapack_int k,
+                                const double* d, lapack_complex_double* a,
+                                lapack_int lda, lapack_int* iseed,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n, float* x,
+                                lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_dlapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n, double* x,
+                                lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_clapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* x, lapack_int ldx,
+                                lapack_int* k );
+lapack_int LAPACKE_zlapmr_work( int matrix_order, lapack_logical forwrd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* x, lapack_int ldx,
+                                lapack_int* k );
+
+lapack_int LAPACKE_slartgp_work( float f, float g, float* cs, float* sn,
+                                 float* r );
+lapack_int LAPACKE_dlartgp_work( double f, double g, double* cs, double* sn,
+                                 double* r );
+
+lapack_int LAPACKE_slartgs_work( float x, float y, float sigma, float* cs,
+                                 float* sn );
+lapack_int LAPACKE_dlartgs_work( double x, double y, double sigma, double* cs,
+                                 double* sn );
+
+float LAPACKE_slapy2_work( float x, float y );
+double LAPACKE_dlapy2_work( double x, double y );
+
+float LAPACKE_slapy3_work( float x, float y, float z );
+double LAPACKE_dlapy3_work( double x, double y, double z );
+
+float LAPACKE_slamch_work( char cmach );
+double LAPACKE_dlamch_work( char cmach );
+
+float LAPACKE_slange_work( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* work );
+double LAPACKE_dlange_work( int matrix_order, char norm, lapack_int m,
+                            lapack_int n, const double* a, lapack_int lda,
+                            double* work );
+float LAPACKE_clange_work( int matrix_order, char norm, lapack_int m,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, float* work );
+double LAPACKE_zlange_work( int matrix_order, char norm, lapack_int m,
+                            lapack_int n, const lapack_complex_double* a,
+                            lapack_int lda, double* work );
+
+float LAPACKE_clanhe_work( int matrix_order, char norm, char uplo,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, float* work );
+double LAPACKE_zlanhe_work( int matrix_order, char norm, char uplo,
+                            lapack_int n, const lapack_complex_double* a,
+                            lapack_int lda, double* work );
+
+float LAPACKE_slansy_work( int matrix_order, char norm, char uplo,
+                           lapack_int n, const float* a, lapack_int lda,
+                           float* work );
+double LAPACKE_dlansy_work( int matrix_order, char norm, char uplo,
+                            lapack_int n, const double* a, lapack_int lda,
+                            double* work );
+float LAPACKE_clansy_work( int matrix_order, char norm, char uplo,
+                           lapack_int n, const lapack_complex_float* a,
+                           lapack_int lda, float* work );
+double LAPACKE_zlansy_work( int matrix_order, char norm, char uplo,
+                            lapack_int n, const lapack_complex_double* a,
+                            lapack_int lda, double* work );
+
+float LAPACKE_slantr_work( int matrix_order, char norm, char uplo,
+                           char diag, lapack_int m, lapack_int n,
+                           const float* a, lapack_int lda, float* work );
+double LAPACKE_dlantr_work( int matrix_order, char norm, char uplo,
+                            char diag, lapack_int m, lapack_int n,
+                            const double* a, lapack_int lda, double* work );
+float LAPACKE_clantr_work( int matrix_order, char norm, char uplo,
+                           char diag, lapack_int m, lapack_int n,
+                           const lapack_complex_float* a, lapack_int lda,
+                           float* work );
+double LAPACKE_zlantr_work( int matrix_order, char norm, char uplo,
+                            char diag, lapack_int m, lapack_int n,
+                            const lapack_complex_double* a, lapack_int lda,
+                            double* work );
+
+lapack_int LAPACKE_slarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, const float* v,
+                                lapack_int ldv, const float* t, lapack_int ldt,
+                                float* c, lapack_int ldc, float* work,
+                                lapack_int ldwork );
+lapack_int LAPACKE_dlarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, const double* v,
+                                lapack_int ldv, const double* t, lapack_int ldt,
+                                double* c, lapack_int ldc, double* work,
+                                lapack_int ldwork );
+lapack_int LAPACKE_clarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int ldwork );
+lapack_int LAPACKE_zlarfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work,
+                                lapack_int ldwork );
+
+lapack_int LAPACKE_slarfg_work( lapack_int n, float* alpha, float* x,
+                                lapack_int incx, float* tau );
+lapack_int LAPACKE_dlarfg_work( lapack_int n, double* alpha, double* x,
+                                lapack_int incx, double* tau );
+lapack_int LAPACKE_clarfg_work( lapack_int n, lapack_complex_float* alpha,
+                                lapack_complex_float* x, lapack_int incx,
+                                lapack_complex_float* tau );
+lapack_int LAPACKE_zlarfg_work( lapack_int n, lapack_complex_double* alpha,
+                                lapack_complex_double* x, lapack_int incx,
+                                lapack_complex_double* tau );
+
+lapack_int LAPACKE_slarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k, const float* v,
+                                lapack_int ldv, const float* tau, float* t,
+                                lapack_int ldt );
+lapack_int LAPACKE_dlarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k, const double* v,
+                                lapack_int ldv, const double* tau, double* t,
+                                lapack_int ldt );
+lapack_int LAPACKE_clarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zlarft_work( int matrix_order, char direct, char storev,
+                                lapack_int n, lapack_int k,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_slarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const float* v, float tau,
+                                float* c, lapack_int ldc, float* work );
+lapack_int LAPACKE_dlarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const double* v, double tau,
+                                double* c, lapack_int ldc, double* work );
+lapack_int LAPACKE_clarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const lapack_complex_float* v,
+                                lapack_complex_float tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zlarfx_work( int matrix_order, char side, lapack_int m,
+                                lapack_int n, const lapack_complex_double* v,
+                                lapack_complex_double tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_slarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, float* x );
+lapack_int LAPACKE_dlarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, double* x );
+lapack_int LAPACKE_clarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, lapack_complex_float* x );
+lapack_int LAPACKE_zlarnv_work( lapack_int idist, lapack_int* iseed,
+                                lapack_int n, lapack_complex_double* x );
+
+lapack_int LAPACKE_slaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, float alpha, float beta, float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_dlaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, double alpha, double beta,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_claset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, lapack_complex_float alpha,
+                                lapack_complex_float beta,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlaset_work( int matrix_order, char uplo, lapack_int m,
+                                lapack_int n, lapack_complex_double alpha,
+                                lapack_complex_double beta,
+                                lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_slasrt_work( char id, lapack_int n, float* d );
+lapack_int LAPACKE_dlasrt_work( char id, lapack_int n, double* d );
+
+lapack_int LAPACKE_slaswp_work( int matrix_order, lapack_int n, float* a,
+                                lapack_int lda, lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_dlaswp_work( int matrix_order, lapack_int n, double* a,
+                                lapack_int lda, lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_claswp_work( int matrix_order, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+lapack_int LAPACKE_zlaswp_work( int matrix_order, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int k1, lapack_int k2,
+                                const lapack_int* ipiv, lapack_int incx );
+/* ?latms_work (s/d/c/z): d, cond, dmax stay real even when a/work are complex. */
+lapack_int LAPACKE_slatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                float* d, lapack_int mode, float cond,
+                                float dmax, lapack_int kl, lapack_int ku,
+                                char pack, float* a, lapack_int lda,
+                                float* work );
+lapack_int LAPACKE_dlatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                double* d, lapack_int mode, double cond,
+                                double dmax, lapack_int kl, lapack_int ku,
+                                char pack, double* a, lapack_int lda,
+                                double* work );
+lapack_int LAPACKE_clatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                float* d, lapack_int mode, float cond,
+                                float dmax, lapack_int kl, lapack_int ku,
+                                char pack, lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* work );
+lapack_int LAPACKE_zlatms_work( int matrix_order, lapack_int m, lapack_int n,
+                                char dist, lapack_int* iseed, char sym,
+                                double* d, lapack_int mode, double cond,
+                                double dmax, lapack_int kl, lapack_int ku,
+                                char pack, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* work );
+/* ?lauum_work (s/d/c/z): uplo flag and non-const a (lda); no workspace arg. */
+lapack_int LAPACKE_slauum_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dlauum_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_clauum_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zlauum_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+/* ?opgtr_work (s/d only): ap and tau are const; q (ldq) and work are not. */
+lapack_int LAPACKE_sopgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, const float* tau, float* q,
+                                lapack_int ldq, float* work );
+lapack_int LAPACKE_dopgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, const double* tau, double* q,
+                                lapack_int ldq, double* work );
+/* ?opmtr_work (s/d only): ap and tau are const; c (ldc) and work are not. */
+lapack_int LAPACKE_sopmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const float* ap, const float* tau, float* c,
+                                lapack_int ldc, float* work );
+lapack_int LAPACKE_dopmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const double* ap, const double* tau, double* c,
+                                lapack_int ldc, double* work );
+/* ?orgbr_work (s/d only): non-const a (lda), const tau, work/lwork pair. */
+lapack_int LAPACKE_sorgbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k, float* a,
+                                lapack_int lda, const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k, double* a,
+                                lapack_int lda, const double* tau, double* work,
+                                lapack_int lwork );
+/* ?orghr_work (s/d only): takes ilo/ihi; non-const a, const tau, work/lwork. */
+lapack_int LAPACKE_sorghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+/* ?orglq_work (s/d only): same parameter list as ?orgql/?orgqr/?orgrq_work. */
+lapack_int LAPACKE_sorglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+/* ?orgql_work (s/d only): same parameter list as ?orglq/?orgqr/?orgrq_work. */
+lapack_int LAPACKE_sorgql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+/* ?orgqr_work (s/d only): same parameter list as ?orglq/?orgql/?orgrq_work. */
+lapack_int LAPACKE_sorgqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+/* ?orgrq_work (s/d only): same parameter list as ?orglq/?orgql/?orgqr_work. */
+lapack_int LAPACKE_sorgrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, float* a, lapack_int lda,
+                                const float* tau, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dorgrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, double* a, lapack_int lda,
+                                const double* tau, double* work,
+                                lapack_int lwork );
+/* ?orgtr_work (s/d only): uplo flag; non-const a (lda), const tau, work/lwork. */
+lapack_int LAPACKE_sorgtr_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, const float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dorgtr_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, const double* tau,
+                                double* work, lapack_int lwork );
+/* ?ormbr_work (s/d only): a and tau are const; c (ldc) is non-const; work/lwork. */
+lapack_int LAPACKE_sormbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?ormhr_work (s/d only): takes ilo/ihi; const a and tau; non-const c; work/lwork. */
+lapack_int LAPACKE_sormhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?ormlq_work (s/d only): same parameter list as ?ormql/?ormqr/?ormrq_work. */
+lapack_int LAPACKE_sormlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?ormql_work (s/d only): same parameter list as ?ormlq/?ormqr/?ormrq_work. */
+lapack_int LAPACKE_sormql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?ormqr_work (s/d only): same parameter list as ?ormlq/?ormql/?ormrq_work. */
+lapack_int LAPACKE_sormqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?ormrq_work (s/d only): same parameter list as ?ormlq/?ormql/?ormqr_work. */
+lapack_int LAPACKE_sormrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?ormrz_work (s/d only): like ?ormrq_work plus an extra lapack_int l after k. */
+lapack_int LAPACKE_sormrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?ormtr_work (s/d only): side, uplo, trans flags; const a and tau; non-const c. */
+lapack_int LAPACKE_sormtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda,
+                                const float* tau, float* c, lapack_int ldc,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dormtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* tau, double* c, lapack_int ldc,
+                                double* work, lapack_int lwork );
+/* ?pbcon_work: s/d use work + lapack_int iwork; c/z use complex work + real rwork. */
+lapack_int LAPACKE_spbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const float* ab, lapack_int ldab,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const double* ab,
+                                lapack_int ldab, double anorm, double* rcond,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_float* ab,
+                                lapack_int ldab, float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpbcon_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_double* ab,
+                                lapack_int ldab, double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+/* ?pbequ_work (s/d/c/z): const ab; s, scond, amax are real pointers for all four. */
+lapack_int LAPACKE_spbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const float* ab, lapack_int ldab,
+                                float* s, float* scond, float* amax );
+lapack_int LAPACKE_dpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const double* ab,
+                                lapack_int ldab, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_float* ab,
+                                lapack_int ldab, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_zpbequ_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, const lapack_complex_double* ab,
+                                lapack_int ldab, double* s, double* scond,
+                                double* amax );
+/* ?pbrfs_work: ab, afb, b are const; x, ferr, berr are not; iwork (s/d), rwork (c/z). */
+lapack_int LAPACKE_spbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs, const float* ab,
+                                lapack_int ldab, const float* afb,
+                                lapack_int ldafb, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const double* ab, lapack_int ldab,
+                                const double* afb, lapack_int ldafb,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* afb,
+                                lapack_int ldafb, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpbrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab,
+                                const lapack_complex_double* afb,
+                                lapack_int ldafb,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?pbstf_work (s/d/c/z): kb and non-const bb (ldbb); no workspace argument. */
+lapack_int LAPACKE_spbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, float* bb, lapack_int ldbb );
+lapack_int LAPACKE_dpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, double* bb, lapack_int ldbb );
+lapack_int LAPACKE_cpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, lapack_complex_float* bb,
+                                lapack_int ldbb );
+lapack_int LAPACKE_zpbstf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kb, lapack_complex_double* bb,
+                                lapack_int ldbb );
+/* ?pbsv_work (s/d/c/z): ab and b are both non-const; no workspace argument. */
+lapack_int LAPACKE_spbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs, float* ab,
+                               lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs, double* ab,
+                               lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs,
+                               lapack_complex_float* ab, lapack_int ldab,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int kd, lapack_int nrhs,
+                               lapack_complex_double* ab, lapack_int ldab,
+                               lapack_complex_double* b, lapack_int ldb );
+/* ?pbsvx_work: equed is passed as char*; iwork for s/d, real rwork for c/z. */
+lapack_int LAPACKE_spbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                float* ab, lapack_int ldab, float* afb,
+                                lapack_int ldafb, char* equed, float* s,
+                                float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                double* ab, lapack_int ldab, double* afb,
+                                lapack_int ldafb, char* equed, double* s,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_cpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* afb, lapack_int ldafb,
+                                char* equed, float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zpbsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int kd, lapack_int nrhs,
+                                lapack_complex_double* ab, lapack_int ldab,
+                                lapack_complex_double* afb, lapack_int ldafb,
+                                char* equed, double* s,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?pbtrf_work (s/d/c/z): kd and non-const ab (ldab); no workspace argument. */
+lapack_int LAPACKE_spbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, float* ab, lapack_int ldab );
+lapack_int LAPACKE_dpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, double* ab, lapack_int ldab );
+lapack_int LAPACKE_cpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_complex_float* ab,
+                                lapack_int ldab );
+lapack_int LAPACKE_zpbtrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_complex_double* ab,
+                                lapack_int ldab );
+/* ?pbtrs_work (s/d/c/z): ab is const here (unlike ?pbsv_work); b is non-const. */
+lapack_int LAPACKE_spbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs, const float* ab,
+                                lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const double* ab, lapack_int ldab, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpbtrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int kd, lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* b,
+                                lapack_int ldb );
+/* ?pftrf_work (s/d/c/z): transr and uplo flags; a has no leading-dimension arg. */
+lapack_int LAPACKE_spftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, float* a );
+lapack_int LAPACKE_dpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, double* a );
+lapack_int LAPACKE_cpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftrf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_double* a );
+/* ?pftri_work (s/d/c/z): same parameter list as ?pftrf_work. */
+lapack_int LAPACKE_spftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, float* a );
+lapack_int LAPACKE_dpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, double* a );
+lapack_int LAPACKE_cpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_float* a );
+lapack_int LAPACKE_zpftri_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_complex_double* a );
+/* ?pftrs_work (s/d/c/z): const a with no lda; non-const b with ldb. */
+lapack_int LAPACKE_spftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* a,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* a,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_cpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpftrs_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a,
+                                lapack_complex_double* b, lapack_int ldb );
+/* ?pocon_work: s/d use work + lapack_int iwork; c/z use complex work + real rwork. */
+lapack_int LAPACKE_spocon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float anorm, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpocon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double anorm, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+/* ?poequ_work (s/d/c/z): no uplo arg; s, scond, amax are real for all four. */
+lapack_int LAPACKE_spoequ_work( int matrix_order, lapack_int n, const float* a,
+                                lapack_int lda, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_dpoequ_work( int matrix_order, lapack_int n, const double* a,
+                                lapack_int lda, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cpoequ_work( int matrix_order, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequ_work( int matrix_order, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* s, double* scond, double* amax );
+/* ?poequb_work (s/d/c/z): same parameter list as ?poequ_work. */
+lapack_int LAPACKE_spoequb_work( int matrix_order, lapack_int n, const float* a,
+                                 lapack_int lda, float* s, float* scond,
+                                 float* amax );
+lapack_int LAPACKE_dpoequb_work( int matrix_order, lapack_int n,
+                                 const double* a, lapack_int lda, double* s,
+                                 double* scond, double* amax );
+lapack_int LAPACKE_cpoequb_work( int matrix_order, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax );
+lapack_int LAPACKE_zpoequb_work( int matrix_order, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax );
+/* ?porfs_work: a, af, b are const; x, ferr, berr are not; iwork (s/d), rwork (c/z). */
+lapack_int LAPACKE_sporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zporfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?porfsx_work: equed by value, const s; rcond, berr, err_bnds_*, params non-const. */
+lapack_int LAPACKE_sporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const float* s,
+                                 const float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const double* s,
+                                 const double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const float* s,
+                                 const lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zporfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+/* LAPACKE_?posv_work (s/d/c/z); dsposv/zcposv add x, work, a single-precision swork and iter. */
+lapack_int LAPACKE_sposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* a, lapack_int lda,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* a, lapack_int lda,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zposv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dsposv_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, double* a, lapack_int lda,
+                                double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* work, float* swork,
+                                lapack_int* iter );
+lapack_int LAPACKE_zcposv_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* work,
+                                lapack_complex_float* swork, double* rwork,
+                                lapack_int* iter );
+/* LAPACKE_?posvx_work: s/d take lapack_int* iwork; c/z take a real-typed rwork instead. */
+lapack_int LAPACKE_sposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                char* equed, float* s, float* b, lapack_int ldb,
+                                float* x, lapack_int ldx, float* rcond,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                char* equed, double* s, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                char* equed, float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zposvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                char* equed, double* s,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* LAPACKE_?posvxx_work: like ?posvx_work but with rpvgrw, err_bnds_* and params in place of ferr. */
+lapack_int LAPACKE_sposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 char* equed, float* s, float* b,
+                                 lapack_int ldb, float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 char* equed, double* s, double* b,
+                                 lapack_int ldb, double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_cposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 char* equed, float* s, lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params,
+                                 lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zposvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+/* LAPACKE_?potrf_work (s/d/c/z): a is non-const (presumably overwritten in place -- confirm). */
+lapack_int LAPACKE_spotrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_cpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+/* LAPACKE_?potri_work (s/d/c/z): same parameter list as LAPACKE_?potrf_work. */
+lapack_int LAPACKE_spotri_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda );
+lapack_int LAPACKE_dpotri_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda );
+lapack_int LAPACKE_cpotri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zpotri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda );
+/* LAPACKE_?potrs_work (s/d/c/z): a is const here, b is non-const. */
+lapack_int LAPACKE_spotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                float* b, lapack_int ldb );
+lapack_int LAPACKE_dpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, double* b, lapack_int ldb );
+lapack_int LAPACKE_cpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zpotrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int ldb );
+/* LAPACKE_?ppcon_work: const ap, by-value anorm, rcond via pointer; iwork (s/d) vs rwork (c/z). */
+lapack_int LAPACKE_sppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float anorm, float* rcond,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double anorm, double* rcond,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap, float anorm,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zppcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap, double anorm,
+                                double* rcond, lapack_complex_double* work,
+                                double* rwork );
+/* LAPACKE_?ppequ_work: s, scond and amax are real-typed pointers even for the c/z variants. */
+lapack_int LAPACKE_sppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float* s, float* scond,
+                                float* amax );
+lapack_int LAPACKE_dppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double* s, double* scond,
+                                double* amax );
+lapack_int LAPACKE_cppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap, float* s,
+                                float* scond, float* amax );
+lapack_int LAPACKE_zppequ_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap, double* s,
+                                double* scond, double* amax );
+/* LAPACKE_?pprfs_work: ap, afp and b are const; x, ferr and berr are non-const. */
+lapack_int LAPACKE_spprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const float* afp, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const double* afp, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zpprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* LAPACKE_?ppsv_work (s/d/c/z): ap and b are both non-const; only b is followed by an ld argument. */
+lapack_int LAPACKE_sppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* ap, float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_dppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* ap, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zppsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?ppsvx_work: equed is passed as char*, and ap, afp, s and b are all non-const. */
+lapack_int LAPACKE_sppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, float* ap,
+                                float* afp, char* equed, float* s, float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, double* ap,
+                                double* afp, char* equed, double* s, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* ap,
+                                lapack_complex_float* afp, char* equed,
+                                float* s, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_zppsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* ap,
+                                lapack_complex_double* afp, char* equed,
+                                double* s, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, lapack_complex_double* work,
+                                double* rwork );
+/* LAPACKE_?pptrf_work (s/d/c/z): takes only matrix_order, uplo, n and a non-const ap. */
+lapack_int LAPACKE_spptrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap );
+lapack_int LAPACKE_dpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap );
+lapack_int LAPACKE_cpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_zpptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap );
+/* LAPACKE_?pptri_work (s/d/c/z): same parameter list as LAPACKE_?pptrf_work. */
+lapack_int LAPACKE_spptri_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap );
+lapack_int LAPACKE_dpptri_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap );
+lapack_int LAPACKE_cpptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_zpptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap );
+/* LAPACKE_?pptrs_work (s/d/c/z): ap is const, b is non-const and followed by ldb. */
+lapack_int LAPACKE_spptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?pstrf_work: piv and rank are lapack_int pointers; tol and work are real-typed for c/z too. */
+lapack_int LAPACKE_spstrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* piv,
+                                lapack_int* rank, float tol, float* work );
+lapack_int LAPACKE_dpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* piv,
+                                lapack_int* rank, double tol, double* work );
+lapack_int LAPACKE_cpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* piv, lapack_int* rank, float tol,
+                                float* work );
+lapack_int LAPACKE_zpstrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* piv, lapack_int* rank, double tol,
+                                double* work );
+/* LAPACKE_?ptcon_work: no matrix_order argument; d is real-typed, e is complex for c/z. */
+lapack_int LAPACKE_sptcon_work( lapack_int n, const float* d, const float* e,
+                                float anorm, float* rcond, float* work );
+lapack_int LAPACKE_dptcon_work( lapack_int n, const double* d, const double* e,
+                                double anorm, double* rcond, double* work );
+lapack_int LAPACKE_cptcon_work( lapack_int n, const float* d,
+                                const lapack_complex_float* e, float anorm,
+                                float* rcond, float* work );
+lapack_int LAPACKE_zptcon_work( lapack_int n, const double* d,
+                                const lapack_complex_double* e, double anorm,
+                                double* rcond, double* work );
+/* LAPACKE_?pteqr_work: d, e and work are real-typed; only z is complex in the c/z variants. */
+lapack_int LAPACKE_spteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work );
+lapack_int LAPACKE_dpteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work );
+lapack_int LAPACKE_cpteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, float* work );
+lapack_int LAPACKE_zpteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, double* work );
+/* LAPACKE_?ptrfs_work: note the c/z variants take an extra char uplo that s/d do not have. */
+lapack_int LAPACKE_sptrfs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const float* d, const float* e, const float* df,
+                                const float* ef, const float* b, lapack_int ldb,
+                                float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work );
+lapack_int LAPACKE_dptrfs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const double* d, const double* e,
+                                const double* df, const double* ef,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work );
+lapack_int LAPACKE_cptrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e, const float* df,
+                                const lapack_complex_float* ef,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zptrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e,
+                                const double* df,
+                                const lapack_complex_double* ef,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* LAPACKE_?ptsv_work: d is real-typed in all variants; e and b are complex for c/z. */
+lapack_int LAPACKE_sptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* d, float* e, float* b, lapack_int ldb );
+lapack_int LAPACKE_dptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* d, double* e, double* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_cptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               float* d, lapack_complex_float* e,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zptsv_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                               double* d, lapack_complex_double* e,
+                               lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?ptsvx_work: d, e and b are const; df and ef are non-const; c/z add a real-typed rwork. */
+lapack_int LAPACKE_sptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const float* d, const float* e,
+                                float* df, float* ef, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work );
+lapack_int LAPACKE_dptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const double* e, double* df, double* ef,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* rcond, double* ferr,
+                                double* berr, double* work );
+lapack_int LAPACKE_cptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e, float* df,
+                                lapack_complex_float* ef,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zptsvx_work( int matrix_order, char fact, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e, double* df,
+                                lapack_complex_double* ef,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* LAPACKE_?pttrf_work: takes only n, d and e; d is real-typed even for the c/z variants. */
+lapack_int LAPACKE_spttrf_work( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dpttrf_work( lapack_int n, double* d, double* e );
+lapack_int LAPACKE_cpttrf_work( lapack_int n, float* d,
+                                lapack_complex_float* e );
+lapack_int LAPACKE_zpttrf_work( lapack_int n, double* d,
+                                lapack_complex_double* e );
+/* LAPACKE_?pttrs_work: only the c/z variants take char uplo; d and e are const, b is non-const. */
+lapack_int LAPACKE_spttrs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const float* d, const float* e, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dpttrs_work( int matrix_order, lapack_int n, lapack_int nrhs,
+                                const double* d, const double* e, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_cpttrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* d,
+                                const lapack_complex_float* e,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zpttrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* d,
+                                const lapack_complex_double* e,
+                                lapack_complex_double* b, lapack_int ldb );
+/* LAPACKE_?sbev_work: only s/d variants are declared in this group; work has no length argument. */
+lapack_int LAPACKE_ssbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd, float* ab,
+                               lapack_int ldab, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dsbev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int kd, double* ab,
+                               lapack_int ldab, double* w, double* z,
+                               lapack_int ldz, double* work );
+/* LAPACKE_?sbevd_work (s/d): work and iwork are each paired with a by-value size (lwork, liwork). */
+lapack_int LAPACKE_ssbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd, float* ab,
+                                lapack_int ldab, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsbevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int kd, double* ab,
+                                lapack_int ldab, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* LAPACKE_?sbevx_work (s/d): vl/vu, il/iu and abstol are by value; m, w, z and ifail are pointers. */
+lapack_int LAPACKE_ssbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                float* ab, lapack_int ldab, float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsbevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int kd,
+                                double* ab, lapack_int ldab, double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+/* LAPACKE_?sbgst_work (s/d): ab is non-const while bb is const; ka and kb are by-value lapack_int. */
+lapack_int LAPACKE_ssbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                float* ab, lapack_int ldab, const float* bb,
+                                lapack_int ldbb, float* x, lapack_int ldx,
+                                float* work );
+lapack_int LAPACKE_dsbgst_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                double* ab, lapack_int ldab, const double* bb,
+                                lapack_int ldbb, double* x, lapack_int ldx,
+                                double* work );
+/* LAPACKE_?sbgv_work (s/d): unlike LAPACKE_?sbgst_work, bb is non-const here. */
+lapack_int LAPACKE_ssbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               float* ab, lapack_int ldab, float* bb,
+                               lapack_int ldbb, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dsbgv_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, lapack_int ka, lapack_int kb,
+                               double* ab, lapack_int ldab, double* bb,
+                               lapack_int ldbb, double* w, double* z,
+                               lapack_int ldz, double* work );
+/* LAPACKE_?sbgvd_work (s/d): the LAPACKE_?sbgv_work arguments plus lwork, iwork and liwork. */
+lapack_int LAPACKE_ssbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                float* ab, lapack_int ldab, float* bb,
+                                lapack_int ldbb, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsbgvd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, lapack_int ka, lapack_int kb,
+                                double* ab, lapack_int ldab, double* bb,
+                                lapack_int ldbb, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* LAPACKE_?sbgvx_work (s/d): takes range plus by-value vl/vu, il/iu, abstol; m and ifail are pointers. */
+lapack_int LAPACKE_ssbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, float* ab, lapack_int ldab,
+                                float* bb, lapack_int ldbb, float* q,
+                                lapack_int ldq, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsbgvx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, lapack_int ka,
+                                lapack_int kb, double* ab, lapack_int ldab,
+                                double* bb, lapack_int ldbb, double* q,
+                                lapack_int ldq, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+/* ?sbtrd_work (s/d): caller passes work; presumably wraps LAPACK ?SBTRD. */
+lapack_int LAPACKE_ssbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd, float* ab,
+                                lapack_int ldab, float* d, float* e, float* q,
+                                lapack_int ldq, float* work );
+lapack_int LAPACKE_dsbtrd_work( int matrix_order, char vect, char uplo,
+                                lapack_int n, lapack_int kd, double* ab,
+                                lapack_int ldab, double* d, double* e,
+                                double* q, lapack_int ldq, double* work );
+/* ?sfrk_work (s/d): a is const, c is non-const; presumably LAPACK ?SFRK. */
+lapack_int LAPACKE_ssfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               float alpha, const float* a, lapack_int lda,
+                               float beta, float* c );
+lapack_int LAPACKE_dsfrk_work( int matrix_order, char transr, char uplo,
+                               char trans, lapack_int n, lapack_int k,
+                               double alpha, const double* a, lapack_int lda,
+                               double beta, double* c );
+/* ?spcon_work (s/d/c/z): c/z variants take no iwork; presumably ?SPCON. */
+lapack_int LAPACKE_sspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, const lapack_int* ipiv,
+                                float anorm, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, const lapack_int* ipiv,
+                                double anorm, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_cspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zspcon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+/* ?spev_work (s/d): caller passes work; presumably wraps LAPACK ?SPEV. */
+lapack_int LAPACKE_sspev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, float* ap, float* w, float* z,
+                               lapack_int ldz, float* work );
+lapack_int LAPACKE_dspev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, double* ap, double* w, double* z,
+                               lapack_int ldz, double* work );
+/* ?spevd_work (s/d): takes work/lwork, iwork/liwork; presumably ?SPEVD. */
+lapack_int LAPACKE_sspevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, float* ap, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dspevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, double* ap, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* ?spevx_work (s/d): takes work, iwork, ifail; presumably LAPACK ?SPEVX. */
+lapack_int LAPACKE_sspevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* ap, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dspevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* ap, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, double* work,
+                                lapack_int* iwork, lapack_int* ifail );
+/* ?spgst_work (s/d): no workspace args, bp is const; presumably ?SPGST. */
+lapack_int LAPACKE_sspgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, float* ap, const float* bp );
+lapack_int LAPACKE_dspgst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, double* ap, const double* bp );
+/* ?spgv_work (s/d): caller passes work; presumably wraps LAPACK ?SPGV. */
+lapack_int LAPACKE_sspgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, float* ap, float* bp,
+                               float* w, float* z, lapack_int ldz,
+                               float* work );
+lapack_int LAPACKE_dspgv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, double* ap, double* bp,
+                               double* w, double* z, lapack_int ldz,
+                               double* work );
+/* ?spgvd_work (s/d): takes work/lwork, iwork/liwork; presumably ?SPGVD. */
+lapack_int LAPACKE_sspgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, float* ap, float* bp,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dspgvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, double* ap, double* bp,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* ?spgvx_work (s/d): takes work, iwork, ifail; presumably LAPACK ?SPGVX. */
+lapack_int LAPACKE_sspgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, float* ap,
+                                float* bp, float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_dspgvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, double* ap,
+                                double* bp, double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int* iwork,
+                                lapack_int* ifail );
+/* ?sprfs_work (s/d/c/z): s/d take iwork, c/z rwork; presumably ?SPRFS. */
+lapack_int LAPACKE_ssprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const float* afp, const lapack_int* ipiv,
+                                const float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dsprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const double* afp, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_csprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_complex_float* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zsprfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* afp,
+                                const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?spsv_work (s/d/c/z): no workspace args; presumably wraps ?SPSV. */
+lapack_int LAPACKE_sspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* ap, lapack_int* ipiv,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* ap, lapack_int* ipiv,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_cspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* ap,
+                               lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb );
+lapack_int LAPACKE_zspsv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* ap,
+                               lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb );
+/* ?spsvx_work (s/d/c/z): s/d take iwork, c/z rwork; presumably ?SPSVX. */
+lapack_int LAPACKE_sspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* ap,
+                                float* afp, lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* ap,
+                                double* afp, lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_cspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* afp, lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zspsvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* afp, lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?sptrd_work (s/d): no workspace args; presumably wraps LAPACK ?SPTRD. */
+lapack_int LAPACKE_ssptrd_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, float* d, float* e, float* tau );
+lapack_int LAPACKE_dsptrd_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, double* d, double* e, double* tau );
+/* ?sptrf_work (s/d/c/z): no workspace args; presumably wraps ?SPTRF. */
+lapack_int LAPACKE_ssptrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_dsptrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, lapack_int* ipiv );
+lapack_int LAPACKE_csptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap, lapack_int* ipiv );
+lapack_int LAPACKE_zsptrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap, lapack_int* ipiv );
+/* ?sptri_work (s/d/c/z): ipiv is const, takes work; presumably ?SPTRI. */
+lapack_int LAPACKE_ssptri_work( int matrix_order, char uplo, lapack_int n,
+                                float* ap, const lapack_int* ipiv,
+                                float* work );
+lapack_int LAPACKE_dsptri_work( int matrix_order, char uplo, lapack_int n,
+                                double* ap, const lapack_int* ipiv,
+                                double* work );
+lapack_int LAPACKE_csptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zsptri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+/* ?sptrs_work (s/d/c/z): ap and ipiv are const; presumably ?SPTRS. */
+lapack_int LAPACKE_ssptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* ap,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dsptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* ap,
+                                const lapack_int* ipiv, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_csptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* ap,
+                                const lapack_int* ipiv, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_zsptrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+/* ?stebz_work (s/d): no matrix_order argument; presumably ?STEBZ. */
+lapack_int LAPACKE_sstebz_work( char range, char order, lapack_int n, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, const float* d, const float* e,
+                                lapack_int* m, lapack_int* nsplit, float* w,
+                                lapack_int* iblock, lapack_int* isplit,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dstebz_work( char range, char order, lapack_int n, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, const double* d, const double* e,
+                                lapack_int* m, lapack_int* nsplit, double* w,
+                                lapack_int* iblock, lapack_int* isplit,
+                                double* work, lapack_int* iwork );
+/* ?stedc_work (s/d/c/z): c/z add rwork/lrwork; presumably wraps ?STEDC. */
+lapack_int LAPACKE_sstedc_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstedc_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstedc_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstedc_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* work,
+                                lapack_int lwork, double* rwork,
+                                lapack_int lrwork, lapack_int* iwork,
+                                lapack_int liwork );
+/* ?stegr_work (s/d/c/z): c/z differ only in z's type; presumably ?STEGR. */
+lapack_int LAPACKE_sstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstegr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int* isuppz, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+/* ?stein_work (s/d/c/z): c/z differ only in z's type; presumably ?STEIN. */
+lapack_int LAPACKE_sstein_work( int matrix_order, lapack_int n, const float* d,
+                                const float* e, lapack_int m, const float* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_dstein_work( int matrix_order, lapack_int n, const double* d,
+                                const double* e, lapack_int m, const double* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit, double* z,
+                                lapack_int ldz, double* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_cstein_work( int matrix_order, lapack_int n, const float* d,
+                                const float* e, lapack_int m, const float* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit,
+                                lapack_complex_float* z, lapack_int ldz,
+                                float* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+lapack_int LAPACKE_zstein_work( int matrix_order, lapack_int n, const double* d,
+                                const double* e, lapack_int m, const double* w,
+                                const lapack_int* iblock,
+                                const lapack_int* isplit,
+                                lapack_complex_double* z, lapack_int ldz,
+                                double* work, lapack_int* iwork,
+                                lapack_int* ifailv );
+/* ?stemr_work (s/d/c/z): c/z differ only in z's type; presumably ?STEMR. */
+lapack_int LAPACKE_sstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int nzc,
+                                lapack_int* isuppz, lapack_logical* tryrac,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, lapack_int nzc,
+                                lapack_int* isuppz, lapack_logical* tryrac,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_cstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, float* w,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int nzc, lapack_int* isuppz,
+                                lapack_logical* tryrac, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_zstemr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                lapack_int* m, double* w,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int nzc, lapack_int* isuppz,
+                                lapack_logical* tryrac, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+/* ?steqr_work (s/d/c/z): c/z differ only in z's type; presumably ?STEQR. */
+lapack_int LAPACKE_ssteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work );
+lapack_int LAPACKE_dsteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work );
+lapack_int LAPACKE_csteqr_work( int matrix_order, char compz, lapack_int n,
+                                float* d, float* e, lapack_complex_float* z,
+                                lapack_int ldz, float* work );
+lapack_int LAPACKE_zsteqr_work( int matrix_order, char compz, lapack_int n,
+                                double* d, double* e, lapack_complex_double* z,
+                                lapack_int ldz, double* work );
+/* ?sterf_work (s/d): takes only n, d, e; presumably wraps LAPACK ?STERF. */
+lapack_int LAPACKE_ssterf_work( lapack_int n, float* d, float* e );
+lapack_int LAPACKE_dsterf_work( lapack_int n, double* d, double* e );
+/* ?stev_work (s/d): caller passes work; presumably wraps LAPACK ?STEV. */
+lapack_int LAPACKE_sstev_work( int matrix_order, char jobz, lapack_int n,
+                               float* d, float* e, float* z, lapack_int ldz,
+                               float* work );
+lapack_int LAPACKE_dstev_work( int matrix_order, char jobz, lapack_int n,
+                               double* d, double* e, double* z, lapack_int ldz,
+                               double* work );
+/* ?stevd_work (s/d): takes work/lwork, iwork/liwork; presumably ?STEVD. */
+lapack_int LAPACKE_sstevd_work( int matrix_order, char jobz, lapack_int n,
+                                float* d, float* e, float* z, lapack_int ldz,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dstevd_work( int matrix_order, char jobz, lapack_int n,
+                                double* d, double* e, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* ?stevr_work (s/d): isuppz, work/lwork, iwork/liwork; presumably ?STEVR. */
+lapack_int LAPACKE_sstevr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dstevr_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* ?stevx_work (s/d): takes work, iwork, ifail; presumably LAPACK ?STEVX. */
+lapack_int LAPACKE_sstevx_work( int matrix_order, char jobz, char range,
+                                lapack_int n, float* d, float* e, float vl,
+                                float vu, lapack_int il, lapack_int iu,
+                                float abstol, lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dstevx_work( int matrix_order, char jobz, char range,
+                                lapack_int n, double* d, double* e, double vl,
+                                double vu, lapack_int il, lapack_int iu,
+                                double abstol, lapack_int* m, double* w,
+                                double* z, lapack_int ldz, double* work,
+                                lapack_int* iwork, lapack_int* ifail );
+/* ?sycon_work (s/d/c/z): c/z variants take no iwork; presumably ?SYCON. */
+lapack_int LAPACKE_ssycon_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dsycon_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_csycon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, float anorm,
+                                float* rcond, lapack_complex_float* work );
+lapack_int LAPACKE_zsycon_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, double anorm,
+                                double* rcond, lapack_complex_double* work );
+/* ?syequb_work (s/d/c/z): a is const, takes work; presumably ?SYEQUB. */
+lapack_int LAPACKE_ssyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const float* a, lapack_int lda, float* s,
+                                 float* scond, float* amax, float* work );
+lapack_int LAPACKE_dsyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const double* a, lapack_int lda, double* s,
+                                 double* scond, double* amax, double* work );
+lapack_int LAPACKE_csyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 float* s, float* scond, float* amax,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_zsyequb_work( int matrix_order, char uplo, lapack_int n,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 double* s, double* scond, double* amax,
+                                 lapack_complex_double* work );
+/* ?syev_work (s/d): takes work/lwork, no z parameter; presumably ?SYEV. */
+lapack_int LAPACKE_ssyev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, float* a, lapack_int lda, float* w,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dsyev_work( int matrix_order, char jobz, char uplo,
+                               lapack_int n, double* a, lapack_int lda,
+                               double* w, double* work, lapack_int lwork );
+/* ?syevd_work (s/d): takes work/lwork, iwork/liwork; presumably ?SYEVD. */
+lapack_int LAPACKE_ssyevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, float* a, lapack_int lda,
+                                float* w, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsyevd_work( int matrix_order, char jobz, char uplo,
+                                lapack_int n, double* a, lapack_int lda,
+                                double* w, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* ?syevr_work (s/d): isuppz, work/lwork, iwork/liwork; presumably ?SYEVR. */
+lapack_int LAPACKE_ssyevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, lapack_int* isuppz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dsyevr_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, lapack_int* isuppz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* ?syevx_work, s/d: ?syevr_work args, plus ifail, minus isuppz/liwork. */
+lapack_int LAPACKE_ssyevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float vl, float vu,
+                                lapack_int il, lapack_int iu, float abstol,
+                                lapack_int* m, float* w, float* z,
+                                lapack_int ldz, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+lapack_int LAPACKE_dsyevx_work( int matrix_order, char jobz, char range,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double vl, double vu,
+                                lapack_int il, lapack_int iu, double abstol,
+                                lapack_int* m, double* w, double* z,
+                                lapack_int ldz, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+/* ?sygst_work, s/d: a is writable, b is const; no workspace arguments. */
+lapack_int LAPACKE_ssygst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, float* a, lapack_int lda,
+                                const float* b, lapack_int ldb );
+lapack_int LAPACKE_dsygst_work( int matrix_order, lapack_int itype, char uplo,
+                                lapack_int n, double* a, lapack_int lda,
+                                const double* b, lapack_int ldb );
+/* ?sygv_work, s/d: a and b are both writable; takes work and lwork. */
+lapack_int LAPACKE_ssygv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, float* a,
+                               lapack_int lda, float* b, lapack_int ldb,
+                               float* w, float* work, lapack_int lwork );
+lapack_int LAPACKE_dsygv_work( int matrix_order, lapack_int itype, char jobz,
+                               char uplo, lapack_int n, double* a,
+                               lapack_int lda, double* b, lapack_int ldb,
+                               double* w, double* work, lapack_int lwork );
+/* ?sygvd_work, s/d: ?sygv_work args plus iwork and liwork. */
+lapack_int LAPACKE_ssygvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* w, float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dsygvd_work( int matrix_order, lapack_int itype, char jobz,
+                                char uplo, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* w, double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+/* ?sygvx_work, s/d: by-value itype, jobz, range, uplo; ifail is writable. */
+lapack_int LAPACKE_ssygvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float vl, float vu, lapack_int il,
+                                lapack_int iu, float abstol, lapack_int* m,
+                                float* w, float* z, lapack_int ldz, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int* ifail );
+lapack_int LAPACKE_dsygvx_work( int matrix_order, lapack_int itype, char jobz,
+                                char range, char uplo, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double vl, double vu, lapack_int il,
+                                lapack_int iu, double abstol, lapack_int* m,
+                                double* w, double* z, lapack_int ldz,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int* ifail );
+/* ?syrfs_work, s/d/c/z: a, af, ipiv, b const; iwork (s/d) or rwork (c/z). */
+lapack_int LAPACKE_ssyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const float* af, lapack_int ldaf,
+                                const lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const double* b, lapack_int ldb, double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                double* work, lapack_int* iwork );
+lapack_int LAPACKE_csyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_zsyrfs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?syrfsx_work, s/d/c/z: err_bnds_norm/err_bnds_comp and params non-const. */
+lapack_int LAPACKE_ssyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const float* a,
+                                 lapack_int lda, const float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const float* b, lapack_int ldb,
+                                 float* x, lapack_int ldx, float* rcond,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dsyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs, const double* a,
+                                 lapack_int lda, const double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s, const double* b,
+                                 lapack_int ldb, double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, double* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_csyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const float* s, const lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* x,
+                                 lapack_int ldx, float* rcond, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zsyrfsx_work( int matrix_order, char uplo, char equed,
+                                 lapack_int n, lapack_int nrhs,
+                                 const lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* af,
+                                 lapack_int ldaf, const lapack_int* ipiv,
+                                 const double* s,
+                                 const lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+/* ?sysv_work, s/d/c/z: a, ipiv, b are writable; takes work and lwork. */
+lapack_int LAPACKE_ssysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, float* a, lapack_int lda,
+                               lapack_int* ipiv, float* b, lapack_int ldb,
+                               float* work, lapack_int lwork );
+lapack_int LAPACKE_dsysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, double* a, lapack_int lda,
+                               lapack_int* ipiv, double* b, lapack_int ldb,
+                               double* work, lapack_int lwork );
+lapack_int LAPACKE_csysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zsysv_work( int matrix_order, char uplo, lapack_int n,
+                               lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* work, lapack_int lwork );
+/* ?sysvx_work, s/d/c/z: a and b are const; af, ipiv, x are writable. */
+lapack_int LAPACKE_ssysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const float* a,
+                                lapack_int lda, float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx,
+                                float* rcond, float* ferr, float* berr,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dsysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs, const double* a,
+                                lapack_int lda, double* af, lapack_int ldaf,
+                                lapack_int* ipiv, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_csysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* ferr,
+                                float* berr, lapack_complex_float* work,
+                                lapack_int lwork, float* rwork );
+lapack_int LAPACKE_zsysvx_work( int matrix_order, char fact, char uplo,
+                                lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* ferr, double* berr,
+                                lapack_complex_double* work, lapack_int lwork,
+                                double* rwork );
+/* ?sysvxx_work, s/d/c/z: a, af, ipiv, equed, s, b, x are all writable. */
+lapack_int LAPACKE_ssysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, float* a,
+                                 lapack_int lda, float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 float* b, lapack_int ldb, float* x,
+                                 lapack_int ldx, float* rcond, float* rpvgrw,
+                                 float* berr, lapack_int n_err_bnds,
+                                 float* err_bnds_norm, float* err_bnds_comp,
+                                 lapack_int nparams, float* params, float* work,
+                                 lapack_int* iwork );
+lapack_int LAPACKE_dsysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs, double* a,
+                                 lapack_int lda, double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 double* b, lapack_int ldb, double* x,
+                                 lapack_int ldx, double* rcond, double* rpvgrw,
+                                 double* berr, lapack_int n_err_bnds,
+                                 double* err_bnds_norm, double* err_bnds_comp,
+                                 lapack_int nparams, double* params,
+                                 double* work, lapack_int* iwork );
+lapack_int LAPACKE_csysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, float* s,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 float* rcond, float* rpvgrw, float* berr,
+                                 lapack_int n_err_bnds, float* err_bnds_norm,
+                                 float* err_bnds_comp, lapack_int nparams,
+                                 float* params, lapack_complex_float* work,
+                                 float* rwork );
+lapack_int LAPACKE_zsysvxx_work( int matrix_order, char fact, char uplo,
+                                 lapack_int n, lapack_int nrhs,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* af, lapack_int ldaf,
+                                 lapack_int* ipiv, char* equed, double* s,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 double* rcond, double* rpvgrw, double* berr,
+                                 lapack_int n_err_bnds, double* err_bnds_norm,
+                                 double* err_bnds_comp, lapack_int nparams,
+                                 double* params, lapack_complex_double* work,
+                                 double* rwork );
+/* ?sytrd_work, s/d: a, d, e, tau are writable; takes work and lwork. */
+lapack_int LAPACKE_ssytrd_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, float* d, float* e,
+                                float* tau, float* work, lapack_int lwork );
+lapack_int LAPACKE_dsytrd_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, double* d, double* e,
+                                double* tau, double* work, lapack_int lwork );
+/* ?sytrf_work, s/d/c/z: a and ipiv are writable; takes work and lwork. */
+lapack_int LAPACKE_ssytrf_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda, lapack_int* ipiv,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dsytrf_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda, lapack_int* ipiv,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_csytrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_zsytrf_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_int* ipiv, lapack_complex_double* work,
+                                lapack_int lwork );
+/* ?sytri_work, s/d/c/z: a is writable, ipiv is const; work has no lwork. */
+lapack_int LAPACKE_ssytri_work( int matrix_order, char uplo, lapack_int n,
+                                float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* work );
+lapack_int LAPACKE_dsytri_work( int matrix_order, char uplo, lapack_int n,
+                                double* a, lapack_int lda,
+                                const lapack_int* ipiv, double* work );
+lapack_int LAPACKE_csytri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zsytri_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv,
+                                lapack_complex_double* work );
+/* ?sytrs_work, s/d/c/z: a and ipiv are const, b is writable; no workspace. */
+lapack_int LAPACKE_ssytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dsytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                double* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_int nrhs, const lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb );
+/* ?tbcon_work, s/d/c/z: ab is const; iwork for s/d, rwork for c/z. */
+lapack_int LAPACKE_stbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const float* ab, lapack_int ldab, float* rcond,
+                                float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const double* ab, lapack_int ldab,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const lapack_complex_float* ab, lapack_int ldab,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_ztbcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, lapack_int kd,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+/* ?tbrfs_work, s/d/c/z: ab, b and x are all const; ferr, berr writable. */
+lapack_int LAPACKE_stbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const float* ab,
+                                lapack_int ldab, const float* b, lapack_int ldb,
+                                const float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const double* ab,
+                                lapack_int ldab, const double* b,
+                                lapack_int ldb, const double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const lapack_complex_float* ab,
+                                lapack_int ldab, const lapack_complex_float* b,
+                                lapack_int ldb, const lapack_complex_float* x,
+                                lapack_int ldx, float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztbrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, const lapack_complex_double* b,
+                                lapack_int ldb, const lapack_complex_double* x,
+                                lapack_int ldx, double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* ?tbtrs_work, s/d/c/z: ab is const, b is writable; no workspace args. */
+lapack_int LAPACKE_stbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const float* ab,
+                                lapack_int ldab, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const double* ab,
+                                lapack_int ldab, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs, const lapack_complex_float* ab,
+                                lapack_int ldab, lapack_complex_float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_ztbtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int kd,
+                                lapack_int nrhs,
+                                const lapack_complex_double* ab,
+                                lapack_int ldab, lapack_complex_double* b,
+                                lapack_int ldb );
+/* ?tfsm_work, s/d/c/z: alpha by value, a is const, b is writable. */
+lapack_int LAPACKE_stfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, float alpha, const float* a,
+                               float* b, lapack_int ldb );
+lapack_int LAPACKE_dtfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, double alpha, const double* a,
+                               double* b, lapack_int ldb );
+lapack_int LAPACKE_ctfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, lapack_complex_float alpha,
+                               const lapack_complex_float* a,
+                               lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztfsm_work( int matrix_order, char transr, char side,
+                               char uplo, char trans, char diag, lapack_int m,
+                               lapack_int n, lapack_complex_double alpha,
+                               const lapack_complex_double* a,
+                               lapack_complex_double* b, lapack_int ldb );
+/* ?tftri_work, s/d/c/z: a is writable; there is no lda argument. */
+lapack_int LAPACKE_stftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n, float* a );
+lapack_int LAPACKE_dtftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n, double* a );
+lapack_int LAPACKE_ctftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n,
+                                lapack_complex_float* a );
+lapack_int LAPACKE_ztftri_work( int matrix_order, char transr, char uplo,
+                                char diag, lapack_int n,
+                                lapack_complex_double* a );
+/* ?tfttp_work, s/d/c/z: arf is const, ap is writable; no workspace args. */
+lapack_int LAPACKE_stfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* arf, float* ap );
+lapack_int LAPACKE_dtfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* arf, double* ap );
+lapack_int LAPACKE_ctfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* arf,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_ztfttp_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* arf,
+                                lapack_complex_double* ap );
+/* ?tfttr_work, s/d/c/z: arf is const; a is writable and comes with lda. */
+lapack_int LAPACKE_stfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* arf, float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_dtfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* arf, double* a,
+                                lapack_int lda );
+lapack_int LAPACKE_ctfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* arf,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztfttr_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* arf,
+                                lapack_complex_double* a, lapack_int lda );
+/* ?tgevc_work, s/d/c/z: select, s, p are const; c/z variants add rwork. */
+lapack_int LAPACKE_stgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* s, lapack_int lds, const float* p,
+                                lapack_int ldp, float* vl, lapack_int ldvl,
+                                float* vr, lapack_int ldvr, lapack_int mm,
+                                lapack_int* m, float* work );
+lapack_int LAPACKE_dtgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* s, lapack_int lds,
+                                const double* p, lapack_int ldp, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work );
+lapack_int LAPACKE_ctgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* s, lapack_int lds,
+                                const lapack_complex_float* p, lapack_int ldp,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztgevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* s, lapack_int lds,
+                                const lapack_complex_double* p, lapack_int ldp,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork );
+/* ?tgexc_work: s/d take ifst/ilst by pointer plus work; c/z by value. */
+lapack_int LAPACKE_stgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n, float* a,
+                                lapack_int lda, float* b, lapack_int ldb,
+                                float* q, lapack_int ldq, float* z,
+                                lapack_int ldz, lapack_int* ifst,
+                                lapack_int* ilst, float* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_dtgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* q, lapack_int ldq, double* z,
+                                lapack_int ldz, lapack_int* ifst,
+                                lapack_int* ilst, double* work,
+                                lapack_int lwork );
+lapack_int LAPACKE_ctgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztgexc_work( int matrix_order, lapack_logical wantq,
+                                lapack_logical wantz, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int ifst, lapack_int ilst );
+/* tgsen_work (s/d/c/z): real variants take alphar/alphai, complex variants a single alpha; pl/pr/dif are real-typed; all take work/lwork and iwork/liwork. */
+lapack_int LAPACKE_stgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* alphar, float* alphai,
+                                float* beta, float* q, lapack_int ldq, float* z,
+                                lapack_int ldz, lapack_int* m, float* pl,
+                                float* pr, float* dif, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+lapack_int LAPACKE_dtgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* alphar, double* alphai,
+                                double* beta, double* q, lapack_int ldq,
+                                double* z, lapack_int ldz, lapack_int* m,
+                                double* pl, double* pr, double* dif,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ctgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* alpha,
+                                lapack_complex_float* beta,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_int* m, float* pl, float* pr, float* dif,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ztgsen_work( int matrix_order, lapack_int ijob,
+                                lapack_logical wantq, lapack_logical wantz,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* alpha,
+                                lapack_complex_double* beta,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_int* m, double* pl, double* pr,
+                                double* dif, lapack_complex_double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+/* tgsja_work (s/d/c/z): tola/tolb (by value) and alpha/beta stay float or double even in the complex variants; ncycle is passed by pointer. */
+lapack_int LAPACKE_stgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float tola, float tolb,
+                                float* alpha, float* beta, float* u,
+                                lapack_int ldu, float* v, lapack_int ldv,
+                                float* q, lapack_int ldq, float* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_dtgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double tola, double tolb,
+                                double* alpha, double* beta, double* u,
+                                lapack_int ldu, double* v, lapack_int ldv,
+                                double* q, lapack_int ldq, double* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_ctgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                float tola, float tolb, float* alpha,
+                                float* beta, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* v,
+                                lapack_int ldv, lapack_complex_float* q,
+                                lapack_int ldq, lapack_complex_float* work,
+                                lapack_int* ncycle );
+lapack_int LAPACKE_ztgsja_work( int matrix_order, char jobu, char jobv,
+                                char jobq, lapack_int m, lapack_int p,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                double tola, double tolb, double* alpha,
+                                double* beta, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* v,
+                                lapack_int ldv, lapack_complex_double* q,
+                                lapack_int ldq, lapack_complex_double* work,
+                                lapack_int* ncycle );
+/* tgsna_work (s/d/c/z): select, a, b, vl and vr are const; s and dif are writable and real-typed in all variants; all take work/lwork and iwork. */
+lapack_int LAPACKE_stgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, const float* vl,
+                                lapack_int ldvl, const float* vr,
+                                lapack_int ldvr, float* s, float* dif,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_dtgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb,
+                                const double* vl, lapack_int ldvl,
+                                const double* vr, lapack_int ldvr, double* s,
+                                double* dif, lapack_int mm, lapack_int* m,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* vl, lapack_int ldvl,
+                                const lapack_complex_float* vr, lapack_int ldvr,
+                                float* s, float* dif, lapack_int mm,
+                                lapack_int* m, lapack_complex_float* work,
+                                lapack_int lwork, lapack_int* iwork );
+lapack_int LAPACKE_ztgsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* vl,
+                                lapack_int ldvl,
+                                const lapack_complex_double* vr,
+                                lapack_int ldvr, double* s, double* dif,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork );
+/* tgsyl_work (s/d/c/z): a, b, d and e are const; c and f are non-const; scale and dif are real-typed in all variants; all take work/lwork and iwork. */
+lapack_int LAPACKE_stgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n, const float* a,
+                                lapack_int lda, const float* b, lapack_int ldb,
+                                float* c, lapack_int ldc, const float* d,
+                                lapack_int ldd, const float* e, lapack_int lde,
+                                float* f, lapack_int ldf, float* scale,
+                                float* dif, float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n, const double* a,
+                                lapack_int lda, const double* b, lapack_int ldb,
+                                double* c, lapack_int ldc, const double* d,
+                                lapack_int ldd, const double* e, lapack_int lde,
+                                double* f, lapack_int ldf, double* scale,
+                                double* dif, double* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* c, lapack_int ldc,
+                                const lapack_complex_float* d, lapack_int ldd,
+                                const lapack_complex_float* e, lapack_int lde,
+                                lapack_complex_float* f, lapack_int ldf,
+                                float* scale, float* dif,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ztgsyl_work( int matrix_order, char trans, lapack_int ijob,
+                                lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* c, lapack_int ldc,
+                                const lapack_complex_double* d, lapack_int ldd,
+                                const lapack_complex_double* e, lapack_int lde,
+                                lapack_complex_double* f, lapack_int ldf,
+                                double* scale, double* dif,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork );
+/* tpcon_work (s/d/c/z): ap is const and has no leading-dimension argument; rcond is real-typed; real variants take iwork, complex ones rwork. */
+lapack_int LAPACKE_stpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const float* ap,
+                                float* rcond, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const double* ap,
+                                double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_float* ap, float* rcond,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztpcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_double* ap, double* rcond,
+                                lapack_complex_double* work, double* rwork );
+/* tprfs_work (s/d/c/z): ap, b and x are all const; ferr/berr are real-typed; real variants take iwork, complex ones rwork. */
+lapack_int LAPACKE_stprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* ap, const float* b, lapack_int ldb,
+                                const float* x, lapack_int ldx, float* ferr,
+                                float* berr, float* work, lapack_int* iwork );
+lapack_int LAPACKE_dtprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* ap, const double* b,
+                                lapack_int ldb, const double* x, lapack_int ldx,
+                                double* ferr, double* berr, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztprfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* tptri_work (s/d/c/z): ap is the only pointer argument and is non-const; there is no workspace parameter. */
+lapack_int LAPACKE_stptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, float* ap );
+lapack_int LAPACKE_dtptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, double* ap );
+lapack_int LAPACKE_ctptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_float* ap );
+lapack_int LAPACKE_ztptri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_double* ap );
+/* tptrs_work (s/d/c/z): ap is const while b (with ldb) is non-const; there is no workspace parameter. */
+lapack_int LAPACKE_stptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* ap, float* b, lapack_int ldb );
+lapack_int LAPACKE_dtptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* ap, double* b, lapack_int ldb );
+lapack_int LAPACKE_ctptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztptrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* b, lapack_int ldb );
+/* tpttf_work (s/d/c/z): takes a const ap and a non-const arf; neither carries a leading-dimension argument. */
+lapack_int LAPACKE_stpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* ap, float* arf );
+lapack_int LAPACKE_dtpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* ap, double* arf );
+lapack_int LAPACKE_ctpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* ap,
+                                lapack_complex_float* arf );
+lapack_int LAPACKE_ztpttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* ap,
+                                lapack_complex_double* arf );
+/* tpttr_work (s/d/c/z): takes a const ap (no leading dimension) and a non-const a with leading dimension lda. */
+lapack_int LAPACKE_stpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const float* ap, float* a, lapack_int lda );
+lapack_int LAPACKE_dtpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const double* ap, double* a, lapack_int lda );
+lapack_int LAPACKE_ctpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_ztpttr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                lapack_complex_double* a, lapack_int lda );
+/* trcon_work (s/d/c/z): a is const and comes with lda; rcond is real-typed; real variants take iwork, complex ones rwork. */
+lapack_int LAPACKE_strcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const float* a,
+                                lapack_int lda, float* rcond, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n, const double* a,
+                                lapack_int lda, double* rcond, double* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                float* rcond, lapack_complex_float* work,
+                                float* rwork );
+lapack_int LAPACKE_ztrcon_work( int matrix_order, char norm, char uplo,
+                                char diag, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                double* rcond, lapack_complex_double* work,
+                                double* rwork );
+/* trevc_work (s/d/c/z): NOTE const-ness flips: real variants take non-const select and const t; complex variants take const select and non-const t (plus rwork). */
+lapack_int LAPACKE_strevc_work( int matrix_order, char side, char howmny,
+                                lapack_logical* select, lapack_int n,
+                                const float* t, lapack_int ldt, float* vl,
+                                lapack_int ldvl, float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, float* work );
+lapack_int LAPACKE_dtrevc_work( int matrix_order, char side, char howmny,
+                                lapack_logical* select, lapack_int n,
+                                const double* t, lapack_int ldt, double* vl,
+                                lapack_int ldvl, double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m, double* work );
+lapack_int LAPACKE_ctrevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* vl, lapack_int ldvl,
+                                lapack_complex_float* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztrevc_work( int matrix_order, char side, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* vl, lapack_int ldvl,
+                                lapack_complex_double* vr, lapack_int ldvr,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, double* rwork );
+/* trexc_work (s/d/c/z): real variants pass ifst/ilst by pointer and take a work buffer; complex variants pass them by value and take no work. */
+lapack_int LAPACKE_strexc_work( int matrix_order, char compq, lapack_int n,
+                                float* t, lapack_int ldt, float* q,
+                                lapack_int ldq, lapack_int* ifst,
+                                lapack_int* ilst, float* work );
+lapack_int LAPACKE_dtrexc_work( int matrix_order, char compq, lapack_int n,
+                                double* t, lapack_int ldt, double* q,
+                                lapack_int ldq, lapack_int* ifst,
+                                lapack_int* ilst, double* work );
+lapack_int LAPACKE_ctrexc_work( int matrix_order, char compq, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_int ifst, lapack_int ilst );
+lapack_int LAPACKE_ztrexc_work( int matrix_order, char compq, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_int ifst, lapack_int ilst );
+/* trrfs_work (s/d/c/z): a, b and x are all const; ferr/berr are real-typed; real variants take iwork, complex ones rwork. */
+lapack_int LAPACKE_strrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, const float* x, lapack_int ldx,
+                                float* ferr, float* berr, float* work,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dtrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb,
+                                const double* x, lapack_int ldx, double* ferr,
+                                double* berr, double* work, lapack_int* iwork );
+lapack_int LAPACKE_ctrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                const lapack_complex_float* x, lapack_int ldx,
+                                float* ferr, float* berr,
+                                lapack_complex_float* work, float* rwork );
+lapack_int LAPACKE_ztrrfs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                const lapack_complex_double* x, lapack_int ldx,
+                                double* ferr, double* berr,
+                                lapack_complex_double* work, double* rwork );
+/* trsen_work (s/d/c/z): real variants take wr/wi plus iwork/liwork; complex variants take a single w and no iwork; s and sep are real-typed. */
+lapack_int LAPACKE_strsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                float* t, lapack_int ldt, float* q,
+                                lapack_int ldq, float* wr, float* wi,
+                                lapack_int* m, float* s, float* sep,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_dtrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                double* t, lapack_int ldt, double* q,
+                                lapack_int ldq, double* wr, double* wi,
+                                lapack_int* m, double* s, double* sep,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+lapack_int LAPACKE_ctrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* w, lapack_int* m,
+                                float* s, float* sep,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ztrsen_work( int matrix_order, char job, char compq,
+                                const lapack_logical* select, lapack_int n,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* w, lapack_int* m,
+                                double* s, double* sep,
+                                lapack_complex_double* work, lapack_int lwork );
+/* trsna_work (s/d/c/z): the integer following work is named ldwork here (not lwork); real variants end with iwork, complex ones with rwork. */
+lapack_int LAPACKE_strsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const float* t, lapack_int ldt, const float* vl,
+                                lapack_int ldvl, const float* vr,
+                                lapack_int ldvr, float* s, float* sep,
+                                lapack_int mm, lapack_int* m, float* work,
+                                lapack_int ldwork, lapack_int* iwork );
+lapack_int LAPACKE_dtrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const double* t, lapack_int ldt,
+                                const double* vl, lapack_int ldvl,
+                                const double* vr, lapack_int ldvr, double* s,
+                                double* sep, lapack_int mm, lapack_int* m,
+                                double* work, lapack_int ldwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_ctrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                const lapack_complex_float* vl, lapack_int ldvl,
+                                const lapack_complex_float* vr, lapack_int ldvr,
+                                float* s, float* sep, lapack_int mm,
+                                lapack_int* m, lapack_complex_float* work,
+                                lapack_int ldwork, float* rwork );
+lapack_int LAPACKE_ztrsna_work( int matrix_order, char job, char howmny,
+                                const lapack_logical* select, lapack_int n,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                const lapack_complex_double* vl,
+                                lapack_int ldvl,
+                                const lapack_complex_double* vr,
+                                lapack_int ldvr, double* s, double* sep,
+                                lapack_int mm, lapack_int* m,
+                                lapack_complex_double* work, lapack_int ldwork,
+                                double* rwork );
+/* trsyl_work (s/d/c/z): a and b are const, c is non-const; scale is a real-typed pointer in all variants; there is no workspace parameter. */
+lapack_int LAPACKE_strsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const float* a, lapack_int lda, const float* b,
+                                lapack_int ldb, float* c, lapack_int ldc,
+                                float* scale );
+lapack_int LAPACKE_dtrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const double* a, lapack_int lda,
+                                const double* b, lapack_int ldb, double* c,
+                                lapack_int ldc, double* scale );
+lapack_int LAPACKE_ctrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* c, lapack_int ldc,
+                                float* scale );
+lapack_int LAPACKE_ztrsyl_work( int matrix_order, char trana, char tranb,
+                                lapack_int isgn, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* c, lapack_int ldc,
+                                double* scale );
+/* trtri_work (s/d/c/z): a (with lda) is the only pointer argument and is non-const; there is no workspace parameter. */
+lapack_int LAPACKE_strtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, float* a, lapack_int lda );
+lapack_int LAPACKE_dtrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, double* a, lapack_int lda );
+lapack_int LAPACKE_ctrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_float* a,
+                                lapack_int lda );
+lapack_int LAPACKE_ztrtri_work( int matrix_order, char uplo, char diag,
+                                lapack_int n, lapack_complex_double* a,
+                                lapack_int lda );
+
+lapack_int LAPACKE_strtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const float* a, lapack_int lda, float* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_dtrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const double* a, lapack_int lda, double* b,
+                                lapack_int ldb );
+lapack_int LAPACKE_ctrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztrtrs_work( int matrix_order, char uplo, char trans,
+                                char diag, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb );
+
+lapack_int LAPACKE_strttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const float* a, lapack_int lda,
+                                float* arf );
+lapack_int LAPACKE_dtrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const double* a, lapack_int lda,
+                                double* arf );
+lapack_int LAPACKE_ctrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, lapack_complex_float* arf );
+lapack_int LAPACKE_ztrttf_work( int matrix_order, char transr, char uplo,
+                                lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* arf );
+
+lapack_int LAPACKE_strttp_work( int matrix_order, char uplo, lapack_int n,
+                                const float* a, lapack_int lda, float* ap );
+lapack_int LAPACKE_dtrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const double* a, lapack_int lda, double* ap );
+lapack_int LAPACKE_ctrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* ap );
+lapack_int LAPACKE_ztrttp_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* ap );
+
+lapack_int LAPACKE_stzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork );
+lapack_int LAPACKE_dtzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork );
+lapack_int LAPACKE_ctzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_ztzrzf_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungbr_work( int matrix_order, char vect, lapack_int m,
+                                lapack_int n, lapack_int k,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunghr_work( int matrix_order, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunglq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungql_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungqr_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungrq_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int k, lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cungtr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungtr_work( int matrix_order, char uplo, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmbr_work( int matrix_order, char vect, char side,
+                                char trans, lapack_int m, lapack_int n,
+                                lapack_int k, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmhr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int ilo,
+                                lapack_int ihi, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmlq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmql_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmqr_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmrq_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const lapack_complex_float* a,
+                                lapack_int lda, const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmrz_work( int matrix_order, char side, char trans,
+                                lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, const lapack_complex_double* a,
+                                lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cunmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_float* a, lapack_int lda,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zunmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_double* a, lapack_int lda,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work, lapack_int lwork );
+
+lapack_int LAPACKE_cupgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* q, lapack_int ldq,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zupgtr_work( int matrix_order, char uplo, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* q, lapack_int ldq,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_cupmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_float* ap,
+                                const lapack_complex_float* tau,
+                                lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_zupmtr_work( int matrix_order, char side, char uplo,
+                                char trans, lapack_int m, lapack_int n,
+                                const lapack_complex_double* ap,
+                                const lapack_complex_double* tau,
+                                lapack_complex_double* c, lapack_int ldc,
+                                lapack_complex_double* work );
+
+lapack_int LAPACKE_claghe( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, lapack_complex_float* a,
+                           lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_zlaghe( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, lapack_complex_double* a,
+                           lapack_int lda, lapack_int* iseed );
+
+lapack_int LAPACKE_slagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, float* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_dlagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, double* a, lapack_int lda,
+                           lapack_int* iseed );
+lapack_int LAPACKE_clagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const float* d, lapack_complex_float* a,
+                           lapack_int lda, lapack_int* iseed );
+lapack_int LAPACKE_zlagsy( int matrix_order, lapack_int n, lapack_int k,
+                           const double* d, lapack_complex_double* a,
+                           lapack_int lda, lapack_int* iseed );
+
+lapack_int LAPACKE_slapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, float* x, lapack_int ldx,
+                           lapack_int* k );
+lapack_int LAPACKE_dlapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, double* x,
+                           lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_clapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, lapack_complex_float* x,
+                           lapack_int ldx, lapack_int* k );
+lapack_int LAPACKE_zlapmr( int matrix_order, lapack_logical forwrd,
+                           lapack_int m, lapack_int n, lapack_complex_double* x,
+                           lapack_int ldx, lapack_int* k );
+
+
+float LAPACKE_slapy2( float x, float y );
+double LAPACKE_dlapy2( double x, double y );
+
+float LAPACKE_slapy3( float x, float y, float z );
+double LAPACKE_dlapy3( double x, double y, double z );
+
+lapack_int LAPACKE_slartgp( float f, float g, float* cs, float* sn, float* r );
+lapack_int LAPACKE_dlartgp( double f, double g, double* cs, double* sn,
+                            double* r );
+
+lapack_int LAPACKE_slartgs( float x, float y, float sigma, float* cs,
+                            float* sn );
+lapack_int LAPACKE_dlartgs( double x, double y, double sigma, double* cs,
+                            double* sn );
+
+
+// LAPACK 3.3.0 routines
+lapack_int LAPACKE_cbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, float* theta, float* phi,
+                           lapack_complex_float* u1, lapack_int ldu1,
+                           lapack_complex_float* u2, lapack_int ldu2,
+                           lapack_complex_float* v1t, lapack_int ldv1t,
+                           lapack_complex_float* v2t, lapack_int ldv2t,
+                           float* b11d, float* b11e, float* b12d, float* b12e,
+                           float* b21d, float* b21e, float* b22d, float* b22e );
+lapack_int LAPACKE_cbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                float* theta, float* phi,
+                                lapack_complex_float* u1, lapack_int ldu1,
+                                lapack_complex_float* u2, lapack_int ldu2,
+                                lapack_complex_float* v1t, lapack_int ldv1t,
+                                lapack_complex_float* v2t, lapack_int ldv2t,
+                                float* b11d, float* b11e, float* b12d,
+                                float* b12e, float* b21d, float* b21e,
+                                float* b22d, float* b22e, float* rwork,
+                                lapack_int lrwork );
+lapack_int LAPACKE_cheswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_cheswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_chetri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_chetri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_chetri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_chetri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_float* work, lapack_int nb );
+lapack_int LAPACKE_chetrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_chetrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_csyconv( int matrix_order, char uplo, char way, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_csyconv_work( int matrix_order, char uplo, char way,
+                                 lapack_int n, lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_csyswapr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int i1,
+                             lapack_int i2 );
+lapack_int LAPACKE_csyswapr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int i1,
+                                  lapack_int i2 );
+lapack_int LAPACKE_csytri2( int matrix_order, char uplo, lapack_int n,
+                            lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv );
+lapack_int LAPACKE_csytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_csytri2x( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float* a, lapack_int lda,
+                             const lapack_int* ipiv, lapack_int nb );
+lapack_int LAPACKE_csytri2x_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float* a, lapack_int lda,
+                                  const lapack_int* ipiv,
+                                  lapack_complex_float* work, lapack_int nb );
+lapack_int LAPACKE_csytrs2( int matrix_order, char uplo, lapack_int n,
+                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int lda, const lapack_int* ipiv,
+                            lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_csytrs2_work( int matrix_order, char uplo, lapack_int n,
+                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int lda, const lapack_int* ipiv,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_cunbdb( int matrix_order, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_float* x11, lapack_int ldx11,
+                           lapack_complex_float* x12, lapack_int ldx12,
+                           lapack_complex_float* x21, lapack_int ldx21,
+                           lapack_complex_float* x22, lapack_int ldx22,
+                           float* theta, float* phi,
+                           lapack_complex_float* taup1,
+                           lapack_complex_float* taup2,
+                           lapack_complex_float* tauq1,
+                           lapack_complex_float* tauq2 );
+lapack_int LAPACKE_cunbdb_work( int matrix_order, char trans, char signs,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                lapack_complex_float* x11, lapack_int ldx11,
+                                lapack_complex_float* x12, lapack_int ldx12,
+                                lapack_complex_float* x21, lapack_int ldx21,
+                                lapack_complex_float* x22, lapack_int ldx22,
+                                float* theta, float* phi,
+                                lapack_complex_float* taup1,
+                                lapack_complex_float* taup2,
+                                lapack_complex_float* tauq1,
+                                lapack_complex_float* tauq2,
+                                lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_cuncsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, char signs,
+                           lapack_int m, lapack_int p, lapack_int q,
+                           lapack_complex_float* x11, lapack_int ldx11,
+                           lapack_complex_float* x12, lapack_int ldx12,
+                           lapack_complex_float* x21, lapack_int ldx21,
+                           lapack_complex_float* x22, lapack_int ldx22,
+                           float* theta, lapack_complex_float* u1,
+                           lapack_int ldu1, lapack_complex_float* u2,
+                           lapack_int ldu2, lapack_complex_float* v1t,
+                           lapack_int ldv1t, lapack_complex_float* v2t,
+                           lapack_int ldv2t );
+lapack_int LAPACKE_cuncsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                char signs, lapack_int m, lapack_int p,
+                                lapack_int q, lapack_complex_float* x11,
+                                lapack_int ldx11, lapack_complex_float* x12,
+                                lapack_int ldx12, lapack_complex_float* x21,
+                                lapack_int ldx21, lapack_complex_float* x22,
+                                lapack_int ldx22, float* theta,
+                                lapack_complex_float* u1, lapack_int ldu1,
+                                lapack_complex_float* u2, lapack_int ldu2,
+                                lapack_complex_float* v1t, lapack_int ldv1t,
+                                lapack_complex_float* v2t, lapack_int ldv2t,
+                                lapack_complex_float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork,
+                                lapack_int* iwork );
+lapack_int LAPACKE_dbbcsd( int matrix_order, char jobu1, char jobu2,
+                           char jobv1t, char jobv2t, char trans, lapack_int m,
+                           lapack_int p, lapack_int q, double* theta,
+                           double* phi, double* u1, lapack_int ldu1, double* u2,
+                           lapack_int ldu2, double* v1t, lapack_int ldv1t,
+                           double* v2t, lapack_int ldv2t, double* b11d,
+                           double* b11e, double* b12d, double* b12e,
+                           double* b21d, double* b21e, double* b22d,
+                           double* b22e );
+lapack_int LAPACKE_dbbcsd_work( int matrix_order, char jobu1, char jobu2,
+                                char jobv1t, char jobv2t, char trans,
+                                lapack_int m, lapack_int p, lapack_int q,
+                                double* theta, double* phi, double* u1,
+                                lapack_int ldu1, double* u2, lapack_int ldu2,
+                                double* v1t, lapack_int ldv1t, double* v2t,
+                                lapack_int ldv2t, double* b11d, double* b11e,
+                                double* b12d, double* b12e, double* b21d,
+                                double* b21e, double* b22d, double* b22e,
+                                double* work, lapack_int lwork );
+/* dorbdb: array arguments are double*. LAPACKE_dorbdb_work takes the same
+ * list followed by an explicit (work, lwork) pair. */
+lapack_int LAPACKE_dorbdb(
+    int matrix_order, char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    double* x11, lapack_int ldx11,
+    double* x12, lapack_int ldx12,
+    double* x21, lapack_int ldx21,
+    double* x22, lapack_int ldx22,
+    double* theta, double* phi,
+    double* taup1, double* taup2,
+    double* tauq1, double* tauq2 );
+lapack_int LAPACKE_dorbdb_work(
+    int matrix_order, char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    double* x11, lapack_int ldx11,
+    double* x12, lapack_int ldx12,
+    double* x21, lapack_int ldx21,
+    double* x22, lapack_int ldx22,
+    double* theta, double* phi,
+    double* taup1, double* taup2,
+    double* tauq1, double* tauq2,
+    double* work, lapack_int lwork );
+/* dorcsd: array arguments are double*. LAPACKE_dorcsd_work takes the same
+ * list followed by (work, lwork) and an integer buffer iwork. */
+lapack_int LAPACKE_dorcsd(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t,
+    char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    double* x11, lapack_int ldx11,
+    double* x12, lapack_int ldx12,
+    double* x21, lapack_int ldx21,
+    double* x22, lapack_int ldx22,
+    double* theta,
+    double* u1, lapack_int ldu1,
+    double* u2, lapack_int ldu2,
+    double* v1t, lapack_int ldv1t,
+    double* v2t, lapack_int ldv2t );
+lapack_int LAPACKE_dorcsd_work(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t,
+    char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    double* x11, lapack_int ldx11,
+    double* x12, lapack_int ldx12,
+    double* x21, lapack_int ldx21,
+    double* x22, lapack_int ldx22,
+    double* theta,
+    double* u1, lapack_int ldu1,
+    double* u2, lapack_int ldu2,
+    double* v1t, lapack_int ldv1t,
+    double* v2t, lapack_int ldv2t,
+    double* work, lapack_int lwork,
+    lapack_int* iwork );
+/* dsyconv: operates on a double* matrix `a` with a const pivot array `ipiv`.
+ * LAPACKE_dsyconv_work appends a double* work buffer. */
+lapack_int LAPACKE_dsyconv(
+    int matrix_order, char uplo, char way,
+    lapack_int n,
+    double* a, lapack_int lda,
+    const lapack_int* ipiv );
+lapack_int LAPACKE_dsyconv_work(
+    int matrix_order, char uplo, char way,
+    lapack_int n,
+    double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    double* work );
+/* dsyswapr: double* matrix `a` plus two indices (i1, i2). The _work form is
+ * declared with an identical parameter list. */
+lapack_int LAPACKE_dsyswapr(
+    int matrix_order, char uplo,
+    lapack_int n,
+    double* a,
+    lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_dsyswapr_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    double* a,
+    lapack_int i1, lapack_int i2 );
+/* dsytri2: double* matrix `a` with a const pivot array `ipiv`.
+ * LAPACKE_dsytri2_work additionally takes an explicit (work, lwork) pair; the
+ * work buffer is real (double*), matching the element type of `a` and the
+ * sibling LAPACKE_dsytri2x_work / LAPACKE_dsytrs2_work declarations. */
+lapack_int LAPACKE_dsytri2( int matrix_order, char uplo, lapack_int n,
+                            double* a, lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_dsytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 double* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 double* work, lapack_int lwork );
+/* dsytri2x: double* matrix `a`, const pivot array `ipiv`, and a trailing
+ * lapack_int nb. LAPACKE_dsytri2x_work inserts a double* work buffer before nb. */
+lapack_int LAPACKE_dsytri2x(
+    int matrix_order, char uplo,
+    lapack_int n,
+    double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_int nb );
+lapack_int LAPACKE_dsytri2x_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    double* work,
+    lapack_int nb );
+/* dsytrs2: `a` and `ipiv` are const-qualified; `b` is a writable double*.
+ * LAPACKE_dsytrs2_work appends a double* work buffer. */
+lapack_int LAPACKE_dsytrs2(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    double* b, lapack_int ldb );
+lapack_int LAPACKE_dsytrs2_work(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    double* b, lapack_int ldb,
+    double* work );
+/* sbbcsd: every array argument is float*. LAPACKE_sbbcsd_work takes the same
+ * list followed by an explicit (work, lwork) pair. */
+lapack_int LAPACKE_sbbcsd(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+    lapack_int m, lapack_int p, lapack_int q,
+    float* theta, float* phi,
+    float* u1, lapack_int ldu1,
+    float* u2, lapack_int ldu2,
+    float* v1t, lapack_int ldv1t,
+    float* v2t, lapack_int ldv2t,
+    float* b11d, float* b11e,
+    float* b12d, float* b12e,
+    float* b21d, float* b21e,
+    float* b22d, float* b22e );
+lapack_int LAPACKE_sbbcsd_work(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+    lapack_int m, lapack_int p, lapack_int q,
+    float* theta, float* phi,
+    float* u1, lapack_int ldu1,
+    float* u2, lapack_int ldu2,
+    float* v1t, lapack_int ldv1t,
+    float* v2t, lapack_int ldv2t,
+    float* b11d, float* b11e,
+    float* b12d, float* b12e,
+    float* b21d, float* b21e,
+    float* b22d, float* b22e,
+    float* work, lapack_int lwork );
+/* sorbdb: array arguments are float*. LAPACKE_sorbdb_work takes the same
+ * list followed by an explicit (work, lwork) pair. */
+lapack_int LAPACKE_sorbdb(
+    int matrix_order, char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    float* x11, lapack_int ldx11,
+    float* x12, lapack_int ldx12,
+    float* x21, lapack_int ldx21,
+    float* x22, lapack_int ldx22,
+    float* theta, float* phi,
+    float* taup1, float* taup2,
+    float* tauq1, float* tauq2 );
+lapack_int LAPACKE_sorbdb_work(
+    int matrix_order, char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    float* x11, lapack_int ldx11,
+    float* x12, lapack_int ldx12,
+    float* x21, lapack_int ldx21,
+    float* x22, lapack_int ldx22,
+    float* theta, float* phi,
+    float* taup1, float* taup2,
+    float* tauq1, float* tauq2,
+    float* work, lapack_int lwork );
+/* sorcsd: array arguments are float*. LAPACKE_sorcsd_work takes the same
+ * list followed by (work, lwork) and an integer buffer iwork. */
+lapack_int LAPACKE_sorcsd(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t,
+    char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    float* x11, lapack_int ldx11,
+    float* x12, lapack_int ldx12,
+    float* x21, lapack_int ldx21,
+    float* x22, lapack_int ldx22,
+    float* theta,
+    float* u1, lapack_int ldu1,
+    float* u2, lapack_int ldu2,
+    float* v1t, lapack_int ldv1t,
+    float* v2t, lapack_int ldv2t );
+lapack_int LAPACKE_sorcsd_work(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t,
+    char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    float* x11, lapack_int ldx11,
+    float* x12, lapack_int ldx12,
+    float* x21, lapack_int ldx21,
+    float* x22, lapack_int ldx22,
+    float* theta,
+    float* u1, lapack_int ldu1,
+    float* u2, lapack_int ldu2,
+    float* v1t, lapack_int ldv1t,
+    float* v2t, lapack_int ldv2t,
+    float* work, lapack_int lwork,
+    lapack_int* iwork );
+/* ssyconv: operates on a float* matrix `a` with a const pivot array `ipiv`.
+ * LAPACKE_ssyconv_work appends a float* work buffer. */
+lapack_int LAPACKE_ssyconv(
+    int matrix_order, char uplo, char way,
+    lapack_int n,
+    float* a, lapack_int lda,
+    const lapack_int* ipiv );
+lapack_int LAPACKE_ssyconv_work(
+    int matrix_order, char uplo, char way,
+    lapack_int n,
+    float* a, lapack_int lda,
+    const lapack_int* ipiv,
+    float* work );
+/* ssyswapr: float* matrix `a` plus two indices (i1, i2). The _work form is
+ * declared with an identical parameter list. */
+lapack_int LAPACKE_ssyswapr(
+    int matrix_order, char uplo,
+    lapack_int n,
+    float* a,
+    lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_ssyswapr_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    float* a,
+    lapack_int i1, lapack_int i2 );
+/* ssytri2: float* matrix `a` with a const pivot array `ipiv`.
+ * LAPACKE_ssytri2_work additionally takes an explicit (work, lwork) pair; the
+ * work buffer is real (float*), matching the element type of `a` and the
+ * sibling LAPACKE_ssytri2x_work / LAPACKE_ssytrs2_work declarations. */
+lapack_int LAPACKE_ssytri2( int matrix_order, char uplo, lapack_int n, float* a,
+                            lapack_int lda, const lapack_int* ipiv );
+lapack_int LAPACKE_ssytri2_work( int matrix_order, char uplo, lapack_int n,
+                                 float* a, lapack_int lda,
+                                 const lapack_int* ipiv,
+                                 float* work, lapack_int lwork );
+/* ssytri2x: float* matrix `a`, const pivot array `ipiv`, and a trailing
+ * lapack_int nb. LAPACKE_ssytri2x_work inserts a float* work buffer before nb. */
+lapack_int LAPACKE_ssytri2x(
+    int matrix_order, char uplo,
+    lapack_int n,
+    float* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_int nb );
+lapack_int LAPACKE_ssytri2x_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    float* a, lapack_int lda,
+    const lapack_int* ipiv,
+    float* work,
+    lapack_int nb );
+/* ssytrs2: `a` and `ipiv` are const-qualified; `b` is a writable float*.
+ * LAPACKE_ssytrs2_work appends a float* work buffer. */
+lapack_int LAPACKE_ssytrs2(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const float* a, lapack_int lda,
+    const lapack_int* ipiv,
+    float* b, lapack_int ldb );
+lapack_int LAPACKE_ssytrs2_work(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const float* a, lapack_int lda,
+    const lapack_int* ipiv,
+    float* b, lapack_int ldb,
+    float* work );
+/* zbbcsd: u1/u2/v1t/v2t are lapack_complex_double*, while theta, phi and the
+ * b11d..b22e arrays are double*. LAPACKE_zbbcsd_work takes the same list
+ * followed by a real (rwork, lrwork) pair. */
+lapack_int LAPACKE_zbbcsd(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+    lapack_int m, lapack_int p, lapack_int q,
+    double* theta, double* phi,
+    lapack_complex_double* u1, lapack_int ldu1,
+    lapack_complex_double* u2, lapack_int ldu2,
+    lapack_complex_double* v1t, lapack_int ldv1t,
+    lapack_complex_double* v2t, lapack_int ldv2t,
+    double* b11d, double* b11e,
+    double* b12d, double* b12e,
+    double* b21d, double* b21e,
+    double* b22d, double* b22e );
+lapack_int LAPACKE_zbbcsd_work(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+    lapack_int m, lapack_int p, lapack_int q,
+    double* theta, double* phi,
+    lapack_complex_double* u1, lapack_int ldu1,
+    lapack_complex_double* u2, lapack_int ldu2,
+    lapack_complex_double* v1t, lapack_int ldv1t,
+    lapack_complex_double* v2t, lapack_int ldv2t,
+    double* b11d, double* b11e,
+    double* b12d, double* b12e,
+    double* b21d, double* b21e,
+    double* b22d, double* b22e,
+    double* rwork, lapack_int lrwork );
+/* zheswapr: lapack_complex_double* matrix `a` plus two indices (i1, i2). The
+ * _work form is declared with an identical parameter list. */
+lapack_int LAPACKE_zheswapr(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a,
+    lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_zheswapr_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a,
+    lapack_int i1, lapack_int i2 );
+/* zhetri2: lapack_complex_double* matrix `a` with a const pivot array `ipiv`.
+ * LAPACKE_zhetri2_work appends a complex (work, lwork) pair. */
+lapack_int LAPACKE_zhetri2(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv );
+lapack_int LAPACKE_zhetri2_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int lwork );
+/* zhetri2x: lapack_complex_double* matrix `a`, const pivot array `ipiv`, and a
+ * trailing lapack_int nb. LAPACKE_zhetri2x_work inserts a complex work buffer
+ * before nb. */
+lapack_int LAPACKE_zhetri2x(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_int nb );
+lapack_int LAPACKE_zhetri2x_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* work,
+    lapack_int nb );
+/* zhetrs2: `a` and `ipiv` are const-qualified; `b` is a writable
+ * lapack_complex_double*. LAPACKE_zhetrs2_work appends a complex work buffer. */
+lapack_int LAPACKE_zhetrs2(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* b, lapack_int ldb );
+lapack_int LAPACKE_zhetrs2_work(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* b, lapack_int ldb,
+    lapack_complex_double* work );
+/* zsyconv: lapack_complex_double* matrix `a` with a const pivot array `ipiv`.
+ * LAPACKE_zsyconv_work appends a complex work buffer. */
+lapack_int LAPACKE_zsyconv(
+    int matrix_order, char uplo, char way,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv );
+lapack_int LAPACKE_zsyconv_work(
+    int matrix_order, char uplo, char way,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* work );
+/* zsyswapr: lapack_complex_double* matrix `a` plus two indices (i1, i2). The
+ * _work form is declared with an identical parameter list. */
+lapack_int LAPACKE_zsyswapr(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a,
+    lapack_int i1, lapack_int i2 );
+lapack_int LAPACKE_zsyswapr_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a,
+    lapack_int i1, lapack_int i2 );
+/* zsytri2: lapack_complex_double* matrix `a` with a const pivot array `ipiv`.
+ * LAPACKE_zsytri2_work appends a complex (work, lwork) pair. */
+lapack_int LAPACKE_zsytri2(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv );
+lapack_int LAPACKE_zsytri2_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int lwork );
+/* zsytri2x: lapack_complex_double* matrix `a`, const pivot array `ipiv`, and a
+ * trailing lapack_int nb. LAPACKE_zsytri2x_work inserts a complex work buffer
+ * before nb. */
+lapack_int LAPACKE_zsytri2x(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_int nb );
+lapack_int LAPACKE_zsytri2x_work(
+    int matrix_order, char uplo,
+    lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* work,
+    lapack_int nb );
+/* zsytrs2: `a` and `ipiv` are const-qualified; `b` is a writable
+ * lapack_complex_double*. LAPACKE_zsytrs2_work appends a complex work buffer. */
+lapack_int LAPACKE_zsytrs2(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* b, lapack_int ldb );
+lapack_int LAPACKE_zsytrs2_work(
+    int matrix_order, char uplo,
+    lapack_int n, lapack_int nrhs,
+    const lapack_complex_double* a, lapack_int lda,
+    const lapack_int* ipiv,
+    lapack_complex_double* b, lapack_int ldb,
+    lapack_complex_double* work );
+/* zunbdb: x11..x22 and the taup/tauq arrays are lapack_complex_double*, while
+ * theta and phi are double*. LAPACKE_zunbdb_work takes the same list followed
+ * by a complex (work, lwork) pair. */
+lapack_int LAPACKE_zunbdb(
+    int matrix_order, char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    lapack_complex_double* x11, lapack_int ldx11,
+    lapack_complex_double* x12, lapack_int ldx12,
+    lapack_complex_double* x21, lapack_int ldx21,
+    lapack_complex_double* x22, lapack_int ldx22,
+    double* theta, double* phi,
+    lapack_complex_double* taup1, lapack_complex_double* taup2,
+    lapack_complex_double* tauq1, lapack_complex_double* tauq2 );
+lapack_int LAPACKE_zunbdb_work(
+    int matrix_order, char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    lapack_complex_double* x11, lapack_int ldx11,
+    lapack_complex_double* x12, lapack_int ldx12,
+    lapack_complex_double* x21, lapack_int ldx21,
+    lapack_complex_double* x22, lapack_int ldx22,
+    double* theta, double* phi,
+    lapack_complex_double* taup1, lapack_complex_double* taup2,
+    lapack_complex_double* tauq1, lapack_complex_double* tauq2,
+    lapack_complex_double* work, lapack_int lwork );
+/* zuncsd: x11..x22 and u1/u2/v1t/v2t are lapack_complex_double*, theta is
+ * double*. LAPACKE_zuncsd_work takes the same list followed by a complex
+ * (work, lwork) pair, a real (rwork, lrwork) pair and an integer buffer iwork. */
+lapack_int LAPACKE_zuncsd(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t,
+    char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    lapack_complex_double* x11, lapack_int ldx11,
+    lapack_complex_double* x12, lapack_int ldx12,
+    lapack_complex_double* x21, lapack_int ldx21,
+    lapack_complex_double* x22, lapack_int ldx22,
+    double* theta,
+    lapack_complex_double* u1, lapack_int ldu1,
+    lapack_complex_double* u2, lapack_int ldu2,
+    lapack_complex_double* v1t, lapack_int ldv1t,
+    lapack_complex_double* v2t, lapack_int ldv2t );
+lapack_int LAPACKE_zuncsd_work(
+    int matrix_order,
+    char jobu1, char jobu2, char jobv1t, char jobv2t,
+    char trans, char signs,
+    lapack_int m, lapack_int p, lapack_int q,
+    lapack_complex_double* x11, lapack_int ldx11,
+    lapack_complex_double* x12, lapack_int ldx12,
+    lapack_complex_double* x21, lapack_int ldx21,
+    lapack_complex_double* x22, lapack_int ldx22,
+    double* theta,
+    lapack_complex_double* u1, lapack_int ldu1,
+    lapack_complex_double* u2, lapack_int ldu2,
+    lapack_complex_double* v1t, lapack_int ldv1t,
+    lapack_complex_double* v2t, lapack_int ldv2t,
+    lapack_complex_double* work, lapack_int lwork,
+    double* rwork, lapack_int lrwork,
+    lapack_int* iwork );
+/* LAPACK 3.4.0 */
+/* gemqrt in four element types (s: float, d: double, c: lapack_complex_float,
+ * z: lapack_complex_double). `v` and `t` are const-qualified; `c` is writable. */
+lapack_int LAPACKE_sgemqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const float* v, lapack_int ldv,
+    const float* t, lapack_int ldt,
+    float* c, lapack_int ldc );
+lapack_int LAPACKE_dgemqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const double* v, lapack_int ldv,
+    const double* t, lapack_int ldt,
+    double* c, lapack_int ldc );
+lapack_int LAPACKE_cgemqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const lapack_complex_float* v, lapack_int ldv,
+    const lapack_complex_float* t, lapack_int ldt,
+    lapack_complex_float* c, lapack_int ldc );
+lapack_int LAPACKE_zgemqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const lapack_complex_double* v, lapack_int ldv,
+    const lapack_complex_double* t, lapack_int ldt,
+    lapack_complex_double* c, lapack_int ldc );
+
+/* geqrt in four element types (s: float, d: double, c: lapack_complex_float,
+ * z: lapack_complex_double). Takes sizes (m, n, nb) and two writable arrays
+ * `a` and `t`, each with its leading dimension. */
+lapack_int LAPACKE_sgeqrt(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    float* a, lapack_int lda,
+    float* t, lapack_int ldt );
+lapack_int LAPACKE_dgeqrt(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    double* a, lapack_int lda,
+    double* t, lapack_int ldt );
+lapack_int LAPACKE_cgeqrt(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    lapack_complex_float* a, lapack_int lda,
+    lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* t, lapack_int ldt );
+
+/* geqrt2 in four element types (s: float, d: double, c: lapack_complex_float,
+ * z: lapack_complex_double). Takes sizes (m, n) and two writable arrays `a`
+ * and `t`, each with its leading dimension. */
+lapack_int LAPACKE_sgeqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    float* a, lapack_int lda,
+    float* t, lapack_int ldt );
+lapack_int LAPACKE_dgeqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    double* a, lapack_int lda,
+    double* t, lapack_int ldt );
+lapack_int LAPACKE_cgeqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    lapack_complex_float* a, lapack_int lda,
+    lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* t, lapack_int ldt );
+
+/* geqrt3 in four element types (s: float, d: double, c: lapack_complex_float,
+ * z: lapack_complex_double). The parameter list has the same shape as the
+ * geqrt2 declarations: sizes (m, n) plus writable `a` and `t`. */
+lapack_int LAPACKE_sgeqrt3(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    float* a, lapack_int lda,
+    float* t, lapack_int ldt );
+lapack_int LAPACKE_dgeqrt3(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    double* a, lapack_int lda,
+    double* t, lapack_int ldt );
+lapack_int LAPACKE_cgeqrt3(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    lapack_complex_float* a, lapack_int lda,
+    lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt3(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* t, lapack_int ldt );
+
+/* tpmqrt in four element types (s: float, d: double, c: lapack_complex_float,
+ * z: lapack_complex_double). `v` and `t` are const-qualified; `a` and `b` are
+ * writable. */
+lapack_int LAPACKE_stpmqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k,
+    lapack_int l, lapack_int nb,
+    const float* v, lapack_int ldv,
+    const float* t, lapack_int ldt,
+    float* a, lapack_int lda,
+    float* b, lapack_int ldb );
+lapack_int LAPACKE_dtpmqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k,
+    lapack_int l, lapack_int nb,
+    const double* v, lapack_int ldv,
+    const double* t, lapack_int ldt,
+    double* a, lapack_int lda,
+    double* b, lapack_int ldb );
+lapack_int LAPACKE_ctpmqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k,
+    lapack_int l, lapack_int nb,
+    const lapack_complex_float* v, lapack_int ldv,
+    const lapack_complex_float* t, lapack_int ldt,
+    lapack_complex_float* a, lapack_int lda,
+    lapack_complex_float* b, lapack_int ldb );
+lapack_int LAPACKE_ztpmqrt(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k,
+    lapack_int l, lapack_int nb,
+    const lapack_complex_double* v, lapack_int ldv,
+    const lapack_complex_double* t, lapack_int ldt,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* b, lapack_int ldb );
+
+/* stpqrt / dtpqrt: single- and double-precision real declarations, both using
+ * the (a, lda, b, ldb, t, ldt) argument order. The single-precision prototype
+ * is declared here so the family covers the same element types as the
+ * neighbouring tpqrt2 / tpmqrt / tprfb groups. */
+lapack_int LAPACKE_stpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float* t,
+                           lapack_int ldt );
+lapack_int LAPACKE_dtpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb, double* a,
+                           lapack_int lda, double* b, lapack_int ldb, double* t,
+                           lapack_int ldt );
+/* ctpqrt: single-precision complex declaration. Arguments use the same
+ * (a, lda, b, ldb, t, ldt) order as LAPACKE_dtpqrt and LAPACKE_ztpqrt, so each
+ * array is immediately followed by its own leading dimension. */
+lapack_int LAPACKE_ctpqrt( int matrix_order, lapack_int m, lapack_int n,
+                           lapack_int l, lapack_int nb, lapack_complex_float* a,
+                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* t,
+                           lapack_int ldt );
+/* ztpqrt: double-precision complex declaration; arrays `a`, `b` and `t` are
+ * each followed by their leading dimension. */
+lapack_int LAPACKE_ztpqrt(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* b, lapack_int ldb,
+    lapack_complex_double* t, lapack_int ldt );
+
+/* tpqrt2 in four element types (s: float, d: double, c: lapack_complex_float,
+ * z: lapack_complex_double). Takes sizes (m, n) and three writable arrays
+ * `a`, `b`, `t`, each followed by its leading dimension. */
+lapack_int LAPACKE_stpqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    float* a, lapack_int lda,
+    float* b, lapack_int ldb,
+    float* t, lapack_int ldt );
+lapack_int LAPACKE_dtpqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    double* a, lapack_int lda,
+    double* b, lapack_int ldb,
+    double* t, lapack_int ldt );
+lapack_int LAPACKE_ctpqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    lapack_complex_float* a, lapack_int lda,
+    lapack_complex_float* b, lapack_int ldb,
+    lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_ztpqrt2(
+    int matrix_order,
+    lapack_int m, lapack_int n,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* b, lapack_int ldb,
+    lapack_complex_double* t, lapack_int ldt );
+
+/* tprfb in four element types (s: float, d: double, c: lapack_complex_float,
+ * z: lapack_complex_double). `v` and `t` are const-qualified; `a` and `b` are
+ * writable; the list ends with a lapack_int named myldwork. */
+lapack_int LAPACKE_stprfb(
+    int matrix_order,
+    char side, char trans, char direct, char storev,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int l,
+    const float* v, lapack_int ldv,
+    const float* t, lapack_int ldt,
+    float* a, lapack_int lda,
+    float* b, lapack_int ldb,
+    lapack_int myldwork );
+lapack_int LAPACKE_dtprfb(
+    int matrix_order,
+    char side, char trans, char direct, char storev,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int l,
+    const double* v, lapack_int ldv,
+    const double* t, lapack_int ldt,
+    double* a, lapack_int lda,
+    double* b, lapack_int ldb,
+    lapack_int myldwork );
+lapack_int LAPACKE_ctprfb(
+    int matrix_order,
+    char side, char trans, char direct, char storev,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int l,
+    const lapack_complex_float* v, lapack_int ldv,
+    const lapack_complex_float* t, lapack_int ldt,
+    lapack_complex_float* a, lapack_int lda,
+    lapack_complex_float* b, lapack_int ldb,
+    lapack_int myldwork );
+lapack_int LAPACKE_ztprfb(
+    int matrix_order,
+    char side, char trans, char direct, char storev,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int l,
+    const lapack_complex_double* v, lapack_int ldv,
+    const lapack_complex_double* t, lapack_int ldt,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* b, lapack_int ldb,
+    lapack_int myldwork );
+
+/* gemqrt_work in four element types (s: float, d: double,
+ * c: lapack_complex_float, z: lapack_complex_double). Same arguments as the
+ * gemqrt declarations plus a trailing work buffer of the matching type. */
+lapack_int LAPACKE_sgemqrt_work(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const float* v, lapack_int ldv,
+    const float* t, lapack_int ldt,
+    float* c, lapack_int ldc,
+    float* work );
+lapack_int LAPACKE_dgemqrt_work(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const double* v, lapack_int ldv,
+    const double* t, lapack_int ldt,
+    double* c, lapack_int ldc,
+    double* work );
+lapack_int LAPACKE_cgemqrt_work(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const lapack_complex_float* v, lapack_int ldv,
+    const lapack_complex_float* t, lapack_int ldt,
+    lapack_complex_float* c, lapack_int ldc,
+    lapack_complex_float* work );
+lapack_int LAPACKE_zgemqrt_work(
+    int matrix_order, char side, char trans,
+    lapack_int m, lapack_int n, lapack_int k, lapack_int nb,
+    const lapack_complex_double* v, lapack_int ldv,
+    const lapack_complex_double* t, lapack_int ldt,
+    lapack_complex_double* c, lapack_int ldc,
+    lapack_complex_double* work );
+
+/* geqrt_work in four element types (s: float, d: double,
+ * c: lapack_complex_float, z: lapack_complex_double). Same arguments as the
+ * geqrt declarations plus a trailing work buffer of the matching type. */
+lapack_int LAPACKE_sgeqrt_work(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    float* a, lapack_int lda,
+    float* t, lapack_int ldt,
+    float* work );
+lapack_int LAPACKE_dgeqrt_work(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    double* a, lapack_int lda,
+    double* t, lapack_int ldt,
+    double* work );
+lapack_int LAPACKE_cgeqrt_work(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    lapack_complex_float* a, lapack_int lda,
+    lapack_complex_float* t, lapack_int ldt,
+    lapack_complex_float* work );
+lapack_int LAPACKE_zgeqrt_work(
+    int matrix_order,
+    lapack_int m, lapack_int n, lapack_int nb,
+    lapack_complex_double* a, lapack_int lda,
+    lapack_complex_double* t, lapack_int ldt,
+    lapack_complex_double* work );
+/* ?geqrt2_work (s/d/c/z): m, n, a with lda, t with ldt; no block-size and no workspace argument. */
+lapack_int LAPACKE_sgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_dgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_cgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* t, lapack_int ldt );
+/* ?geqrt3_work (s/d/c/z): m, n, a with lda, t with ldt; a and t are both non-const, and there is no workspace argument. */
+lapack_int LAPACKE_sgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_dgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* t,
+                                 lapack_int ldt );
+lapack_int LAPACKE_cgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgeqrt3_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* t, lapack_int ldt );
+/* ?tpmqrt_work (s/d/c/z): v and t are pointer-to-const inputs; a, b and work are non-const. Argument order is identical in all four precisions. */
+lapack_int LAPACKE_stpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb, const float* v,
+                                 lapack_int ldv, const float* t, lapack_int ldt,
+                                 float* a, lapack_int lda, float* b,
+                                 lapack_int ldb, float* work );
+lapack_int LAPACKE_dtpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb, const double* v,
+                                 lapack_int ldv, const double* t,
+                                 lapack_int ldt, double* a, lapack_int lda,
+                                 double* b, lapack_int ldb, double* work );
+lapack_int LAPACKE_ctpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb,
+                                 const lapack_complex_float* v, lapack_int ldv,
+                                 const lapack_complex_float* t, lapack_int ldt,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* work );
+lapack_int LAPACKE_ztpmqrt_work( int matrix_order, char side, char trans,
+                                 lapack_int m, lapack_int n, lapack_int k,
+                                 lapack_int l, lapack_int nb,
+                                 const lapack_complex_double* v, lapack_int ldv,
+                                 const lapack_complex_double* t, lapack_int ldt,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* work );
+/* ?tpqrt_work (d/c/z): every variant takes a with lda, then b with ldb, then t with ldt, then work. NOTE(review): no LAPACKE_stpqrt_work prototype appears in this group; confirm whether that omission is intended. */
+lapack_int LAPACKE_dtpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                double* t, lapack_int ldt, double* work );
+lapack_int LAPACKE_ctpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* work );
+lapack_int LAPACKE_ztpqrt_work( int matrix_order, lapack_int m, lapack_int n,
+                                lapack_int l, lapack_int nb,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* work );
+/* ?tpqrt2_work (s/d/c/z): m, n, a with lda, b with ldb, t with ldt; no l, nb or workspace argument. */
+lapack_int LAPACKE_stpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 float* a, lapack_int lda, float* b,
+                                 lapack_int ldb, float* t, lapack_int ldt );
+lapack_int LAPACKE_dtpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 double* a, lapack_int lda, double* b,
+                                 lapack_int ldb, double* t, lapack_int ldt );
+lapack_int LAPACKE_ctpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_ztpqrt2_work( int matrix_order, lapack_int m, lapack_int n,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* t, lapack_int ldt );
+/* ?tprfb_work (s/d/c/z). NOTE(review): the c and z variants declare mywork as pointer-to-const float or double, not a complex pointer, and the workspace is const in all four; confirm against the LAPACKE library actually linked. */
+lapack_int LAPACKE_stprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const float* v, lapack_int ldv, const float* t,
+                                lapack_int ldt, float* a, lapack_int lda,
+                                float* b, lapack_int ldb, const float* mywork,
+                                lapack_int myldwork );
+lapack_int LAPACKE_dtprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const double* v, lapack_int ldv,
+                                const double* t, lapack_int ldt, double* a,
+                                lapack_int lda, double* b, lapack_int ldb,
+                                const double* mywork, lapack_int myldwork );
+lapack_int LAPACKE_ctprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt,
+                                lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb,
+                                const float* mywork, lapack_int myldwork );
+lapack_int LAPACKE_ztprfb_work( int matrix_order, char side, char trans,
+                                char direct, char storev, lapack_int m,
+                                lapack_int n, lapack_int k, lapack_int l,
+                                const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt,
+                                lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb,
+                                const double* mywork, lapack_int myldwork );
+/* LAPACK 3.X.X: csyr/zsyr wrappers (alpha is passed by value, x is const, a is non-const), followed by their _work counterparts, which take the same argument list. */
+lapack_int LAPACKE_csyr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_float alpha,
+                             const lapack_complex_float* x, lapack_int incx,
+                             lapack_complex_float* a, lapack_int lda );
+lapack_int LAPACKE_zsyr( int matrix_order, char uplo, lapack_int n,
+                             lapack_complex_double alpha,
+                             const lapack_complex_double* x, lapack_int incx,
+                             lapack_complex_double* a, lapack_int lda );
+
+lapack_int LAPACKE_csyr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_float alpha,
+                                  const lapack_complex_float* x,
+                                  lapack_int incx, lapack_complex_float* a,
+                                  lapack_int lda );
+lapack_int LAPACKE_zsyr_work( int matrix_order, char uplo, lapack_int n,
+                                  lapack_complex_double alpha,
+                                  const lapack_complex_double* x,
+                                  lapack_int incx, lapack_complex_double* a,
+                                  lapack_int lda );
+
+
+/* Each LAPACK_<name> macro forwards the lower-case and upper-case routine name to LAPACK_GLOBAL (defined outside this section; presumably it selects the Fortran symbol spelling). This run covers the routines whose names end in "trf". */
+#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF)
+#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF)
+#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF)
+#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF)
+#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf,SGBTRF)
+#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf,DGBTRF)
+#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf,CGBTRF)
+#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf,ZGBTRF)
+#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf,SGTTRF)
+#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf,DGTTRF)
+#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf,CGTTRF)
+#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf,ZGTTRF)
+#define LAPACK_spotrf LAPACK_GLOBAL(spotrf,SPOTRF)
+#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf,DPOTRF)
+#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf,CPOTRF)
+#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf,ZPOTRF)
+#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf,DPSTRF)
+#define LAPACK_spstrf LAPACK_GLOBAL(spstrf,SPSTRF)
+#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf,ZPSTRF)
+#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf,CPSTRF)
+#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf,DPFTRF)
+#define LAPACK_spftrf LAPACK_GLOBAL(spftrf,SPFTRF)
+#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf,ZPFTRF)
+#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf,CPFTRF)
+#define LAPACK_spptrf LAPACK_GLOBAL(spptrf,SPPTRF)
+#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf,DPPTRF)
+#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf,CPPTRF)
+#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf,ZPPTRF)
+#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf,SPBTRF)
+#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf,DPBTRF)
+#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf,CPBTRF)
+#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf,ZPBTRF)
+#define LAPACK_spttrf LAPACK_GLOBAL(spttrf,SPTTRF)
+#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf,DPTTRF)
+#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf,CPTTRF)
+#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf,ZPTTRF)
+#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf,SSYTRF)
+#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf,DSYTRF)
+#define LAPACK_csytrf LAPACK_GLOBAL(csytrf,CSYTRF)
+#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf,ZSYTRF)
+#define LAPACK_chetrf LAPACK_GLOBAL(chetrf,CHETRF)
+#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf,ZHETRF)
+#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf,SSPTRF)
+#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf,DSPTRF)
+#define LAPACK_csptrf LAPACK_GLOBAL(csptrf,CSPTRF)
+#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf,ZSPTRF)
+#define LAPACK_chptrf LAPACK_GLOBAL(chptrf,CHPTRF)
+#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf,ZHPTRF)
+#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs,SGETRS) /* start of the aliases for routines whose names end in "trs" */
+#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs,DGETRS)
+#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs,CGETRS)
+#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs,ZGETRS)
+#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs,SGBTRS)
+#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs,DGBTRS)
+#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs,CGBTRS)
+#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs,ZGBTRS)
+#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs,SGTTRS)
+#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs,DGTTRS)
+#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs,CGTTRS)
+#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs,ZGTTRS)
+#define LAPACK_spotrs LAPACK_GLOBAL(spotrs,SPOTRS)
+#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs,DPOTRS)
+#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs,CPOTRS)
+#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs,ZPOTRS)
+#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs,DPFTRS)
+#define LAPACK_spftrs LAPACK_GLOBAL(spftrs,SPFTRS)
+#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs,ZPFTRS)
+#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs,CPFTRS)
+#define LAPACK_spptrs LAPACK_GLOBAL(spptrs,SPPTRS)
+#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs,DPPTRS)
+#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs,CPPTRS)
+#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs,ZPPTRS)
+#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs,SPBTRS)
+#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs,DPBTRS)
+#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs,CPBTRS)
+#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs,ZPBTRS)
+#define LAPACK_spttrs LAPACK_GLOBAL(spttrs,SPTTRS)
+#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs,DPTTRS)
+#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs,CPTTRS)
+#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs,ZPTTRS)
+#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs,SSYTRS)
+#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs,DSYTRS)
+#define LAPACK_csytrs LAPACK_GLOBAL(csytrs,CSYTRS)
+#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs,ZSYTRS)
+#define LAPACK_chetrs LAPACK_GLOBAL(chetrs,CHETRS)
+#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs,ZHETRS)
+#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs,SSPTRS)
+#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs,DSPTRS)
+#define LAPACK_csptrs LAPACK_GLOBAL(csptrs,CSPTRS)
+#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs,ZSPTRS)
+#define LAPACK_chptrs LAPACK_GLOBAL(chptrs,CHPTRS)
+#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs,ZHPTRS)
+#define LAPACK_strtrs LAPACK_GLOBAL(strtrs,STRTRS)
+#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs,DTRTRS)
+#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs,CTRTRS)
+#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs,ZTRTRS)
+#define LAPACK_stptrs LAPACK_GLOBAL(stptrs,STPTRS)
+#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs,DTPTRS)
+#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs,CTPTRS)
+#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs,ZTPTRS)
+#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs,STBTRS)
+#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs,DTBTRS)
+#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs,CTBTRS)
+#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs,ZTBTRS)
+#define LAPACK_sgecon LAPACK_GLOBAL(sgecon,SGECON) /* start of the aliases for routines whose names end in "con" */
+#define LAPACK_dgecon LAPACK_GLOBAL(dgecon,DGECON)
+#define LAPACK_cgecon LAPACK_GLOBAL(cgecon,CGECON)
+#define LAPACK_zgecon LAPACK_GLOBAL(zgecon,ZGECON)
+#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon,SGBCON)
+#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon,DGBCON)
+#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon,CGBCON)
+#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon,ZGBCON)
+#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon,SGTCON)
+#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon,DGTCON)
+#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon,CGTCON)
+#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon,ZGTCON)
+#define LAPACK_spocon LAPACK_GLOBAL(spocon,SPOCON)
+#define LAPACK_dpocon LAPACK_GLOBAL(dpocon,DPOCON)
+#define LAPACK_cpocon LAPACK_GLOBAL(cpocon,CPOCON)
+#define LAPACK_zpocon LAPACK_GLOBAL(zpocon,ZPOCON)
+#define LAPACK_sppcon LAPACK_GLOBAL(sppcon,SPPCON)
+#define LAPACK_dppcon LAPACK_GLOBAL(dppcon,DPPCON)
+#define LAPACK_cppcon LAPACK_GLOBAL(cppcon,CPPCON)
+#define LAPACK_zppcon LAPACK_GLOBAL(zppcon,ZPPCON)
+#define LAPACK_spbcon LAPACK_GLOBAL(spbcon,SPBCON)
+#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon,DPBCON)
+#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon,CPBCON)
+#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon,ZPBCON)
+#define LAPACK_sptcon LAPACK_GLOBAL(sptcon,SPTCON)
+#define LAPACK_dptcon LAPACK_GLOBAL(dptcon,DPTCON)
+#define LAPACK_cptcon LAPACK_GLOBAL(cptcon,CPTCON)
+#define LAPACK_zptcon LAPACK_GLOBAL(zptcon,ZPTCON)
+#define LAPACK_ssycon LAPACK_GLOBAL(ssycon,SSYCON)
+#define LAPACK_dsycon LAPACK_GLOBAL(dsycon,DSYCON)
+#define LAPACK_csycon LAPACK_GLOBAL(csycon,CSYCON)
+#define LAPACK_zsycon LAPACK_GLOBAL(zsycon,ZSYCON)
+#define LAPACK_checon LAPACK_GLOBAL(checon,CHECON)
+#define LAPACK_zhecon LAPACK_GLOBAL(zhecon,ZHECON)
+#define LAPACK_sspcon LAPACK_GLOBAL(sspcon,SSPCON)
+#define LAPACK_dspcon LAPACK_GLOBAL(dspcon,DSPCON)
+#define LAPACK_cspcon LAPACK_GLOBAL(cspcon,CSPCON)
+#define LAPACK_zspcon LAPACK_GLOBAL(zspcon,ZSPCON)
+#define LAPACK_chpcon LAPACK_GLOBAL(chpcon,CHPCON)
+#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon,ZHPCON)
+#define LAPACK_strcon LAPACK_GLOBAL(strcon,STRCON)
+#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon,DTRCON)
+#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon,CTRCON)
+#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon,ZTRCON)
+#define LAPACK_stpcon LAPACK_GLOBAL(stpcon,STPCON)
+#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon,DTPCON)
+#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon,CTPCON)
+#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon,ZTPCON)
+#define LAPACK_stbcon LAPACK_GLOBAL(stbcon,STBCON)
+#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon,DTBCON)
+#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon,CTBCON)
+#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon,ZTBCON)
+#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs,SGERFS) /* start of the aliases for routines whose names end in "rfs" or "rfsx" */
+#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs,DGERFS)
+#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs,CGERFS)
+#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs,ZGERFS)
+#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx,DGERFSX)
+#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx,SGERFSX)
+#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx,ZGERFSX)
+#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx,CGERFSX)
+#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs,SGBRFS)
+#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs,DGBRFS)
+#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs,CGBRFS)
+#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs,ZGBRFS)
+#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx,DGBRFSX)
+#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx,SGBRFSX)
+#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx,ZGBRFSX)
+#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx,CGBRFSX)
+#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs,SGTRFS)
+#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs,DGTRFS)
+#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs,CGTRFS)
+#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs,ZGTRFS)
+#define LAPACK_sporfs LAPACK_GLOBAL(sporfs,SPORFS)
+#define LAPACK_dporfs LAPACK_GLOBAL(dporfs,DPORFS)
+#define LAPACK_cporfs LAPACK_GLOBAL(cporfs,CPORFS)
+#define LAPACK_zporfs LAPACK_GLOBAL(zporfs,ZPORFS)
+#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx,DPORFSX)
+#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx,SPORFSX)
+#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx,ZPORFSX)
+#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx,CPORFSX)
+#define LAPACK_spprfs LAPACK_GLOBAL(spprfs,SPPRFS)
+#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs,DPPRFS)
+#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs,CPPRFS)
+#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs,ZPPRFS)
+#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs,SPBRFS)
+#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs,DPBRFS)
+#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs,CPBRFS)
+#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs,ZPBRFS)
+#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs,SPTRFS)
+#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs,DPTRFS)
+#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs,CPTRFS)
+#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs,ZPTRFS)
+#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs,SSYRFS)
+#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs,DSYRFS)
+#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs,CSYRFS)
+#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs,ZSYRFS)
+#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx,DSYRFSX)
+#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx,SSYRFSX)
+#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx,ZSYRFSX)
+#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx,CSYRFSX)
+#define LAPACK_cherfs LAPACK_GLOBAL(cherfs,CHERFS)
+#define LAPACK_zherfs LAPACK_GLOBAL(zherfs,ZHERFS)
+#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx,ZHERFSX)
+#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx,CHERFSX)
+#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs,SSPRFS)
+#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs,DSPRFS)
+#define LAPACK_csprfs LAPACK_GLOBAL(csprfs,CSPRFS)
+#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs,ZSPRFS)
+#define LAPACK_chprfs LAPACK_GLOBAL(chprfs,CHPRFS)
+#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs,ZHPRFS)
+#define LAPACK_strrfs LAPACK_GLOBAL(strrfs,STRRFS)
+#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs,DTRRFS)
+#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs,CTRRFS)
+#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs,ZTRRFS)
+#define LAPACK_stprfs LAPACK_GLOBAL(stprfs,STPRFS)
+#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs,DTPRFS)
+#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs,CTPRFS)
+#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs,ZTPRFS)
+#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs,STBRFS)
+#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs,DTBRFS)
+#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs,CTBRFS)
+#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs,ZTBRFS)
+#define LAPACK_sgetri LAPACK_GLOBAL(sgetri,SGETRI) /* start of the aliases for routines whose names end in "tri" */
+#define LAPACK_dgetri LAPACK_GLOBAL(dgetri,DGETRI)
+#define LAPACK_cgetri LAPACK_GLOBAL(cgetri,CGETRI)
+#define LAPACK_zgetri LAPACK_GLOBAL(zgetri,ZGETRI)
+#define LAPACK_spotri LAPACK_GLOBAL(spotri,SPOTRI)
+#define LAPACK_dpotri LAPACK_GLOBAL(dpotri,DPOTRI)
+#define LAPACK_cpotri LAPACK_GLOBAL(cpotri,CPOTRI)
+#define LAPACK_zpotri LAPACK_GLOBAL(zpotri,ZPOTRI)
+#define LAPACK_dpftri LAPACK_GLOBAL(dpftri,DPFTRI)
+#define LAPACK_spftri LAPACK_GLOBAL(spftri,SPFTRI)
+#define LAPACK_zpftri LAPACK_GLOBAL(zpftri,ZPFTRI)
+#define LAPACK_cpftri LAPACK_GLOBAL(cpftri,CPFTRI)
+#define LAPACK_spptri LAPACK_GLOBAL(spptri,SPPTRI)
+#define LAPACK_dpptri LAPACK_GLOBAL(dpptri,DPPTRI)
+#define LAPACK_cpptri LAPACK_GLOBAL(cpptri,CPPTRI)
+#define LAPACK_zpptri LAPACK_GLOBAL(zpptri,ZPPTRI)
+#define LAPACK_ssytri LAPACK_GLOBAL(ssytri,SSYTRI)
+#define LAPACK_dsytri LAPACK_GLOBAL(dsytri,DSYTRI)
+#define LAPACK_csytri LAPACK_GLOBAL(csytri,CSYTRI)
+#define LAPACK_zsytri LAPACK_GLOBAL(zsytri,ZSYTRI)
+#define LAPACK_chetri LAPACK_GLOBAL(chetri,CHETRI)
+#define LAPACK_zhetri LAPACK_GLOBAL(zhetri,ZHETRI)
+#define LAPACK_ssptri LAPACK_GLOBAL(ssptri,SSPTRI)
+#define LAPACK_dsptri LAPACK_GLOBAL(dsptri,DSPTRI)
+#define LAPACK_csptri LAPACK_GLOBAL(csptri,CSPTRI)
+#define LAPACK_zsptri LAPACK_GLOBAL(zsptri,ZSPTRI)
+#define LAPACK_chptri LAPACK_GLOBAL(chptri,CHPTRI)
+#define LAPACK_zhptri LAPACK_GLOBAL(zhptri,ZHPTRI)
+#define LAPACK_strtri LAPACK_GLOBAL(strtri,STRTRI)
+#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri,DTRTRI)
+#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri,CTRTRI)
+#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri,ZTRTRI)
+#define LAPACK_dtftri LAPACK_GLOBAL(dtftri,DTFTRI)
+#define LAPACK_stftri LAPACK_GLOBAL(stftri,STFTRI)
+#define LAPACK_ztftri LAPACK_GLOBAL(ztftri,ZTFTRI)
+#define LAPACK_ctftri LAPACK_GLOBAL(ctftri,CTFTRI)
+#define LAPACK_stptri LAPACK_GLOBAL(stptri,STPTRI)
+#define LAPACK_dtptri LAPACK_GLOBAL(dtptri,DTPTRI)
+#define LAPACK_ctptri LAPACK_GLOBAL(ctptri,CTPTRI)
+#define LAPACK_ztptri LAPACK_GLOBAL(ztptri,ZTPTRI)
+#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ,SGEEQU) /* start of the aliases for routines whose names end in "equ" or "equb" */
+#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ,DGEEQU)
+#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ,CGEEQU)
+#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ,ZGEEQU)
+#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb,DGEEQUB)
+#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb,SGEEQUB)
+#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb,ZGEEQUB)
+#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb,CGEEQUB)
+#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ,SGBEQU)
+#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ,DGBEQU)
+#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ,CGBEQU)
+#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ,ZGBEQU)
+#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb,DGBEQUB)
+#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb,SGBEQUB)
+#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb,ZGBEQUB)
+#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb,CGBEQUB)
+#define LAPACK_spoequ LAPACK_GLOBAL(spoequ,SPOEQU)
+#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ,DPOEQU)
+#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ,CPOEQU)
+#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ,ZPOEQU)
+#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb,DPOEQUB)
+#define LAPACK_spoequb LAPACK_GLOBAL(spoequb,SPOEQUB)
+#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb,ZPOEQUB)
+#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb,CPOEQUB)
+#define LAPACK_sppequ LAPACK_GLOBAL(sppequ,SPPEQU)
+#define LAPACK_dppequ LAPACK_GLOBAL(dppequ,DPPEQU)
+#define LAPACK_cppequ LAPACK_GLOBAL(cppequ,CPPEQU)
+#define LAPACK_zppequ LAPACK_GLOBAL(zppequ,ZPPEQU)
+#define LAPACK_spbequ LAPACK_GLOBAL(spbequ,SPBEQU)
+#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ,DPBEQU)
+#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ,CPBEQU)
+#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ,ZPBEQU)
+#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb,DSYEQUB)
+#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb,SSYEQUB)
+#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb,ZSYEQUB)
+#define LAPACK_csyequb LAPACK_GLOBAL(csyequb,CSYEQUB)
+#define LAPACK_zheequb LAPACK_GLOBAL(zheequb,ZHEEQUB)
+#define LAPACK_cheequb LAPACK_GLOBAL(cheequb,CHEEQUB)
+#define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV) /* start of the aliases for routines whose names end in "sv", "svx" or "svxx" */
+#define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV)
+#define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
+#define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV)
+#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv,DSGESV)
+#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv,ZCGESV)
+#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx,SGESVX)
+#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx,DGESVX)
+#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx,CGESVX)
+#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx,ZGESVX)
+#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx,DGESVXX)
+#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx,SGESVXX)
+#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx,ZGESVXX)
+#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx,CGESVXX)
+#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv,SGBSV)
+#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv,DGBSV)
+#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv,CGBSV)
+#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv,ZGBSV)
+#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx,SGBSVX)
+#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx,DGBSVX)
+#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx,CGBSVX)
+#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx,ZGBSVX)
+#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx,DGBSVXX)
+#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx,SGBSVXX)
+#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx,ZGBSVXX)
+#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx,CGBSVXX)
+#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv,SGTSV)
+#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv,DGTSV)
+#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv,CGTSV)
+#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv,ZGTSV)
+#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx,SGTSVX)
+#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx,DGTSVX)
+#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx,CGTSVX)
+#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx,ZGTSVX)
+#define LAPACK_sposv LAPACK_GLOBAL(sposv,SPOSV)
+#define LAPACK_dposv LAPACK_GLOBAL(dposv,DPOSV)
+#define LAPACK_cposv LAPACK_GLOBAL(cposv,CPOSV)
+#define LAPACK_zposv LAPACK_GLOBAL(zposv,ZPOSV)
+#define LAPACK_dsposv LAPACK_GLOBAL(dsposv,DSPOSV)
+#define LAPACK_zcposv LAPACK_GLOBAL(zcposv,ZCPOSV)
+#define LAPACK_sposvx LAPACK_GLOBAL(sposvx,SPOSVX)
+#define LAPACK_dposvx LAPACK_GLOBAL(dposvx,DPOSVX)
+#define LAPACK_cposvx LAPACK_GLOBAL(cposvx,CPOSVX)
+#define LAPACK_zposvx LAPACK_GLOBAL(zposvx,ZPOSVX)
+#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx,DPOSVXX)
+#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx,SPOSVXX)
+#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx,ZPOSVXX)
+#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx,CPOSVXX)
+#define LAPACK_sppsv LAPACK_GLOBAL(sppsv,SPPSV)
+#define LAPACK_dppsv LAPACK_GLOBAL(dppsv,DPPSV)
+#define LAPACK_cppsv LAPACK_GLOBAL(cppsv,CPPSV)
+#define LAPACK_zppsv LAPACK_GLOBAL(zppsv,ZPPSV)
+#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx,SPPSVX)
+#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx,DPPSVX)
+#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx,CPPSVX)
+#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx,ZPPSVX)
+#define LAPACK_spbsv LAPACK_GLOBAL(spbsv,SPBSV)
+#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv,DPBSV)
+#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv,CPBSV)
+#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv,ZPBSV)
+#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx,SPBSVX)
+#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx,DPBSVX)
+#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx,CPBSVX)
+#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx,ZPBSVX)
+#define LAPACK_sptsv LAPACK_GLOBAL(sptsv,SPTSV)
+#define LAPACK_dptsv LAPACK_GLOBAL(dptsv,DPTSV)
+#define LAPACK_cptsv LAPACK_GLOBAL(cptsv,CPTSV)
+#define LAPACK_zptsv LAPACK_GLOBAL(zptsv,ZPTSV)
+#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx,SPTSVX)
+#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx,DPTSVX)
+#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx,CPTSVX)
+#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx,ZPTSVX)
+#define LAPACK_ssysv LAPACK_GLOBAL(ssysv,SSYSV)
+#define LAPACK_dsysv LAPACK_GLOBAL(dsysv,DSYSV)
+#define LAPACK_csysv LAPACK_GLOBAL(csysv,CSYSV)
+#define LAPACK_zsysv LAPACK_GLOBAL(zsysv,ZSYSV)
+#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx,SSYSVX)
+#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx,DSYSVX)
+#define LAPACK_csysvx LAPACK_GLOBAL(csysvx,CSYSVX)
+#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx,ZSYSVX)
+#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx,DSYSVXX)
+#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx,SSYSVXX)
+#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx,ZSYSVXX)
+#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx,CSYSVXX)
+#define LAPACK_chesv LAPACK_GLOBAL(chesv,CHESV)
+#define LAPACK_zhesv LAPACK_GLOBAL(zhesv,ZHESV)
+#define LAPACK_chesvx LAPACK_GLOBAL(chesvx,CHESVX)
+#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx,ZHESVX)
+#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx,ZHESVXX)
+#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx,CHESVXX)
+#define LAPACK_sspsv LAPACK_GLOBAL(sspsv,SSPSV)
+#define LAPACK_dspsv LAPACK_GLOBAL(dspsv,DSPSV)
+#define LAPACK_cspsv LAPACK_GLOBAL(cspsv,CSPSV)
+#define LAPACK_zspsv LAPACK_GLOBAL(zspsv,ZSPSV)
+#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx,SSPSVX)
+#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx,DSPSVX)
+#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx,CSPSVX)
+#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx,ZSPSVX)
+#define LAPACK_chpsv LAPACK_GLOBAL(chpsv,CHPSV)
+#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv,ZHPSV)
+#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx,CHPSVX)
+#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx,ZHPSVX)
+#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf,SGEQRF) /* from here: geqrf, geqpf, geqp3, gelqf, geqlf, gerqf, tzrzf and the accompanying org, orm, ung, unm aliases */
+#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf,DGEQRF)
+#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf,CGEQRF)
+#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf,ZGEQRF)
+#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf,SGEQPF)
+#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf,DGEQPF)
+#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf,CGEQPF)
+#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf,ZGEQPF)
+#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3,SGEQP3)
+#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3,DGEQP3)
+#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3,CGEQP3)
+#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3,ZGEQP3)
+#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr,SORGQR)
+#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr,DORGQR)
+#define LAPACK_sormqr LAPACK_GLOBAL(sormqr,SORMQR)
+#define LAPACK_dormqr LAPACK_GLOBAL(dormqr,DORMQR)
+#define LAPACK_cungqr LAPACK_GLOBAL(cungqr,CUNGQR)
+#define LAPACK_zungqr LAPACK_GLOBAL(zungqr,ZUNGQR)
+#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr,CUNMQR)
+#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr,ZUNMQR)
+#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf,SGELQF)
+#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf,DGELQF)
+#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf,CGELQF)
+#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf,ZGELQF)
+#define LAPACK_sorglq LAPACK_GLOBAL(sorglq,SORGLQ)
+#define LAPACK_dorglq LAPACK_GLOBAL(dorglq,DORGLQ)
+#define LAPACK_sormlq LAPACK_GLOBAL(sormlq,SORMLQ)
+#define LAPACK_dormlq LAPACK_GLOBAL(dormlq,DORMLQ)
+#define LAPACK_cunglq LAPACK_GLOBAL(cunglq,CUNGLQ)
+#define LAPACK_zunglq LAPACK_GLOBAL(zunglq,ZUNGLQ)
+#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq,CUNMLQ)
+#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq,ZUNMLQ)
+#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf,SGEQLF)
+#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf,DGEQLF)
+#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf,CGEQLF)
+#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf,ZGEQLF)
+#define LAPACK_sorgql LAPACK_GLOBAL(sorgql,SORGQL)
+#define LAPACK_dorgql LAPACK_GLOBAL(dorgql,DORGQL)
+#define LAPACK_cungql LAPACK_GLOBAL(cungql,CUNGQL)
+#define LAPACK_zungql LAPACK_GLOBAL(zungql,ZUNGQL)
+#define LAPACK_sormql LAPACK_GLOBAL(sormql,SORMQL)
+#define LAPACK_dormql LAPACK_GLOBAL(dormql,DORMQL)
+#define LAPACK_cunmql LAPACK_GLOBAL(cunmql,CUNMQL)
+#define LAPACK_zunmql LAPACK_GLOBAL(zunmql,ZUNMQL)
+#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf,SGERQF)
+#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf,DGERQF)
+#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf,CGERQF)
+#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf,ZGERQF)
+#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq,SORGRQ)
+#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq,DORGRQ)
+#define LAPACK_cungrq LAPACK_GLOBAL(cungrq,CUNGRQ)
+#define LAPACK_zungrq LAPACK_GLOBAL(zungrq,ZUNGRQ)
+#define LAPACK_sormrq LAPACK_GLOBAL(sormrq,SORMRQ)
+#define LAPACK_dormrq LAPACK_GLOBAL(dormrq,DORMRQ)
+#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq,CUNMRQ)
+#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq,ZUNMRQ)
+#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf,STZRZF)
+#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf,DTZRZF)
+#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf,CTZRZF)
+#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf,ZTZRZF)
+#define LAPACK_sormrz LAPACK_GLOBAL(sormrz,SORMRZ)
+#define LAPACK_dormrz LAPACK_GLOBAL(dormrz,DORMRZ)
+#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz,CUNMRZ)
+#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz,ZUNMRZ)
+#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf,SGGQRF)
+#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf,DGGQRF)
+#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf,CGGQRF)
+#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf,ZGGQRF)
+#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf,SGGRQF)
+#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf,DGGRQF)
+#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf,CGGRQF)
+#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf,ZGGRQF)
+#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd,SGEBRD)
+#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd,DGEBRD)
+#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd,CGEBRD)
+#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd,ZGEBRD)
+#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd,SGBBRD)
+#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd,DGBBRD)
+#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd,CGBBRD)
+#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd,ZGBBRD)
+#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr,SORGBR)
+#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr,DORGBR)
+#define LAPACK_sormbr LAPACK_GLOBAL(sormbr,SORMBR)
+#define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
+#define LAPACK_cungbr LAPACK_GLOBAL(cungbr,CUNGBR)
+#define LAPACK_zungbr LAPACK_GLOBAL(zungbr,ZUNGBR)
+#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
+#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr,ZUNMBR)
+#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr,SBDSQR)
+#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr,DBDSQR)
+#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr,CBDSQR)
+#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr,ZBDSQR)
+#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc,SBDSDC)
+#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc,DBDSDC)
+#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd,SSYTRD)
+#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd,DSYTRD)
+#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr,SORGTR)
+#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr,DORGTR)
+#define LAPACK_sormtr LAPACK_GLOBAL(sormtr,SORMTR)
+#define LAPACK_dormtr LAPACK_GLOBAL(dormtr,DORMTR)
+#define LAPACK_chetrd LAPACK_GLOBAL(chetrd,CHETRD)
+#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd,ZHETRD)
+#define LAPACK_cungtr LAPACK_GLOBAL(cungtr,CUNGTR)
+#define LAPACK_zungtr LAPACK_GLOBAL(zungtr,ZUNGTR)
+#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr,CUNMTR)
+#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr,ZUNMTR)
+#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd,SSPTRD)
+#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd,DSPTRD)
+#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr,SOPGTR)
+#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr,DOPGTR)
+#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr,SOPMTR)
+#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr,DOPMTR)
+#define LAPACK_chptrd LAPACK_GLOBAL(chptrd,CHPTRD)
+#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd,ZHPTRD)
+#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr,CUPGTR)
+#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr,ZUPGTR)
+#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr,CUPMTR)
+#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr,ZUPMTR)
+#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd,SSBTRD)
+#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd,DSBTRD)
+#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd,CHBTRD)
+#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd,ZHBTRD)
+#define LAPACK_ssterf LAPACK_GLOBAL(ssterf,SSTERF)
+#define LAPACK_dsterf LAPACK_GLOBAL(dsterf,DSTERF)
+#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr,SSTEQR)
+#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr,DSTEQR)
+#define LAPACK_csteqr LAPACK_GLOBAL(csteqr,CSTEQR)
+#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr,ZSTEQR)
+#define LAPACK_sstemr LAPACK_GLOBAL(sstemr,SSTEMR)
+#define LAPACK_dstemr LAPACK_GLOBAL(dstemr,DSTEMR)
+#define LAPACK_cstemr LAPACK_GLOBAL(cstemr,CSTEMR)
+#define LAPACK_zstemr LAPACK_GLOBAL(zstemr,ZSTEMR)
+#define LAPACK_sstedc LAPACK_GLOBAL(sstedc,SSTEDC)
+#define LAPACK_dstedc LAPACK_GLOBAL(dstedc,DSTEDC)
+#define LAPACK_cstedc LAPACK_GLOBAL(cstedc,CSTEDC)
+#define LAPACK_zstedc LAPACK_GLOBAL(zstedc,ZSTEDC)
+#define LAPACK_sstegr LAPACK_GLOBAL(sstegr,SSTEGR)
+#define LAPACK_dstegr LAPACK_GLOBAL(dstegr,DSTEGR)
+#define LAPACK_cstegr LAPACK_GLOBAL(cstegr,CSTEGR)
+#define LAPACK_zstegr LAPACK_GLOBAL(zstegr,ZSTEGR)
+#define LAPACK_spteqr LAPACK_GLOBAL(spteqr,SPTEQR)
+#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr,DPTEQR)
+#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr,CPTEQR)
+#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr,ZPTEQR)
+#define LAPACK_sstebz LAPACK_GLOBAL(sstebz,SSTEBZ)
+#define LAPACK_dstebz LAPACK_GLOBAL(dstebz,DSTEBZ)
+#define LAPACK_sstein LAPACK_GLOBAL(sstein,SSTEIN)
+#define LAPACK_dstein LAPACK_GLOBAL(dstein,DSTEIN)
+#define LAPACK_cstein LAPACK_GLOBAL(cstein,CSTEIN)
+#define LAPACK_zstein LAPACK_GLOBAL(zstein,ZSTEIN)
+#define LAPACK_sdisna LAPACK_GLOBAL(sdisna,SDISNA)
+#define LAPACK_ddisna LAPACK_GLOBAL(ddisna,DDISNA)
+#define LAPACK_ssygst LAPACK_GLOBAL(ssygst,SSYGST)
+#define LAPACK_dsygst LAPACK_GLOBAL(dsygst,DSYGST)
+#define LAPACK_chegst LAPACK_GLOBAL(chegst,CHEGST)
+#define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
+#define LAPACK_sspgst LAPACK_GLOBAL(sspgst,SSPGST)
+#define LAPACK_dspgst LAPACK_GLOBAL(dspgst,DSPGST)
+#define LAPACK_chpgst LAPACK_GLOBAL(chpgst,CHPGST)
+#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst,ZHPGST)
+#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst,SSBGST)
+#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst,DSBGST)
+#define LAPACK_chbgst LAPACK_GLOBAL(chbgst,CHBGST)
+#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst,ZHBGST)
+#define LAPACK_spbstf LAPACK_GLOBAL(spbstf,SPBSTF)
+#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf,DPBSTF)
+#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf,CPBSTF)
+#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf,ZPBSTF)
+#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd,SGEHRD)
+#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd,DGEHRD)
+#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd,CGEHRD)
+#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd,ZGEHRD)
+#define LAPACK_sorghr LAPACK_GLOBAL(sorghr,SORGHR)
+#define LAPACK_dorghr LAPACK_GLOBAL(dorghr,DORGHR)
+#define LAPACK_sormhr LAPACK_GLOBAL(sormhr,SORMHR)
+#define LAPACK_dormhr LAPACK_GLOBAL(dormhr,DORMHR)
+#define LAPACK_cunghr LAPACK_GLOBAL(cunghr,CUNGHR)
+#define LAPACK_zunghr LAPACK_GLOBAL(zunghr,ZUNGHR)
+#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr,CUNMHR)
+#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr,ZUNMHR)
+#define LAPACK_sgebal LAPACK_GLOBAL(sgebal,SGEBAL)
+#define LAPACK_dgebal LAPACK_GLOBAL(dgebal,DGEBAL)
+#define LAPACK_cgebal LAPACK_GLOBAL(cgebal,CGEBAL)
+#define LAPACK_zgebal LAPACK_GLOBAL(zgebal,ZGEBAL)
+#define LAPACK_sgebak LAPACK_GLOBAL(sgebak,SGEBAK)
+#define LAPACK_dgebak LAPACK_GLOBAL(dgebak,DGEBAK)
+#define LAPACK_cgebak LAPACK_GLOBAL(cgebak,CGEBAK)
+#define LAPACK_zgebak LAPACK_GLOBAL(zgebak,ZGEBAK)
+#define LAPACK_shseqr LAPACK_GLOBAL(shseqr,SHSEQR)
+#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr,DHSEQR)
+#define LAPACK_chseqr LAPACK_GLOBAL(chseqr,CHSEQR)
+#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr,ZHSEQR)
+#define LAPACK_shsein LAPACK_GLOBAL(shsein,SHSEIN)
+#define LAPACK_dhsein LAPACK_GLOBAL(dhsein,DHSEIN)
+#define LAPACK_chsein LAPACK_GLOBAL(chsein,CHSEIN)
+#define LAPACK_zhsein LAPACK_GLOBAL(zhsein,ZHSEIN)
+#define LAPACK_strevc LAPACK_GLOBAL(strevc,STREVC)
+#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc,DTREVC)
+#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc,CTREVC)
+#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc,ZTREVC)
+#define LAPACK_strsna LAPACK_GLOBAL(strsna,STRSNA)
+#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna,DTRSNA)
+#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna,CTRSNA)
+#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna,ZTRSNA)
+#define LAPACK_strexc LAPACK_GLOBAL(strexc,STREXC)
+#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc,DTREXC)
+#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc,CTREXC)
+#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc,ZTREXC)
+#define LAPACK_strsen LAPACK_GLOBAL(strsen,STRSEN)
+#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen,DTRSEN)
+#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen,CTRSEN)
+#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen,ZTRSEN)
+#define LAPACK_strsyl LAPACK_GLOBAL(strsyl,STRSYL)
+#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl,DTRSYL)
+#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl,CTRSYL)
+#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl,ZTRSYL)
+#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd,SGGHRD)
+#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd,DGGHRD)
+#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd,CGGHRD)
+#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd,ZGGHRD)
+#define LAPACK_sggbal LAPACK_GLOBAL(sggbal,SGGBAL)
+#define LAPACK_dggbal LAPACK_GLOBAL(dggbal,DGGBAL)
+#define LAPACK_cggbal LAPACK_GLOBAL(cggbal,CGGBAL)
+#define LAPACK_zggbal LAPACK_GLOBAL(zggbal,ZGGBAL)
+#define LAPACK_sggbak LAPACK_GLOBAL(sggbak,SGGBAK)
+#define LAPACK_dggbak LAPACK_GLOBAL(dggbak,DGGBAK)
+#define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
+#define LAPACK_zggbak LAPACK_GLOBAL(zggbak,ZGGBAK)
+#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz,SHGEQZ)
+#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz,DHGEQZ)
+#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz,CHGEQZ)
+#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz,ZHGEQZ)
+#define LAPACK_stgevc LAPACK_GLOBAL(stgevc,STGEVC)
+#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc,DTGEVC)
+#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc,CTGEVC)
+#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc,ZTGEVC)
+#define LAPACK_stgexc LAPACK_GLOBAL(stgexc,STGEXC)
+#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc,DTGEXC)
+#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc,CTGEXC)
+#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc,ZTGEXC)
+#define LAPACK_stgsen LAPACK_GLOBAL(stgsen,STGSEN)
+#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen,DTGSEN)
+#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen,CTGSEN)
+#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen,ZTGSEN)
+#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl,STGSYL)
+#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl,DTGSYL)
+#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl,CTGSYL)
+#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl,ZTGSYL)
+#define LAPACK_stgsna LAPACK_GLOBAL(stgsna,STGSNA)
+#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna,DTGSNA)
+#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna,CTGSNA)
+#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna,ZTGSNA)
+#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
+#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
+#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
+#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
+#define LAPACK_stgsja LAPACK_GLOBAL(stgsja,STGSJA)
+#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja,DTGSJA)
+#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja,CTGSJA)
+#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja,ZTGSJA)
+#define LAPACK_sgels LAPACK_GLOBAL(sgels,SGELS)
+#define LAPACK_dgels LAPACK_GLOBAL(dgels,DGELS)
+#define LAPACK_cgels LAPACK_GLOBAL(cgels,CGELS)
+#define LAPACK_zgels LAPACK_GLOBAL(zgels,ZGELS)
+#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy,SGELSY)
+#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy,DGELSY)
+#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy,CGELSY)
+#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy,ZGELSY)
+#define LAPACK_sgelss LAPACK_GLOBAL(sgelss,SGELSS)
+#define LAPACK_dgelss LAPACK_GLOBAL(dgelss,DGELSS)
+#define LAPACK_cgelss LAPACK_GLOBAL(cgelss,CGELSS)
+#define LAPACK_zgelss LAPACK_GLOBAL(zgelss,ZGELSS)
+#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd,SGELSD)
+#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd,DGELSD)
+#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd,CGELSD)
+#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd,ZGELSD)
+#define LAPACK_sgglse LAPACK_GLOBAL(sgglse,SGGLSE)
+#define LAPACK_dgglse LAPACK_GLOBAL(dgglse,DGGLSE)
+#define LAPACK_cgglse LAPACK_GLOBAL(cgglse,CGGLSE)
+#define LAPACK_zgglse LAPACK_GLOBAL(zgglse,ZGGLSE)
+#define LAPACK_sggglm LAPACK_GLOBAL(sggglm,SGGGLM)
+#define LAPACK_dggglm LAPACK_GLOBAL(dggglm,DGGGLM)
+#define LAPACK_cggglm LAPACK_GLOBAL(cggglm,CGGGLM)
+#define LAPACK_zggglm LAPACK_GLOBAL(zggglm,ZGGGLM)
+#define LAPACK_ssyev LAPACK_GLOBAL(ssyev,SSYEV)
+#define LAPACK_dsyev LAPACK_GLOBAL(dsyev,DSYEV)
+#define LAPACK_cheev LAPACK_GLOBAL(cheev,CHEEV)
+#define LAPACK_zheev LAPACK_GLOBAL(zheev,ZHEEV)
+#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd,SSYEVD)
+#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd,DSYEVD)
+#define LAPACK_cheevd LAPACK_GLOBAL(cheevd,CHEEVD)
+#define LAPACK_zheevd LAPACK_GLOBAL(zheevd,ZHEEVD)
+#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx,SSYEVX)
+#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx,DSYEVX)
+#define LAPACK_cheevx LAPACK_GLOBAL(cheevx,CHEEVX)
+#define LAPACK_zheevx LAPACK_GLOBAL(zheevx,ZHEEVX)
+#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr,SSYEVR)
+#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr,DSYEVR)
+#define LAPACK_cheevr LAPACK_GLOBAL(cheevr,CHEEVR)
+#define LAPACK_zheevr LAPACK_GLOBAL(zheevr,ZHEEVR)
+#define LAPACK_sspev LAPACK_GLOBAL(sspev,SSPEV)
+#define LAPACK_dspev LAPACK_GLOBAL(dspev,DSPEV)
+#define LAPACK_chpev LAPACK_GLOBAL(chpev,CHPEV)
+#define LAPACK_zhpev LAPACK_GLOBAL(zhpev,ZHPEV)
+#define LAPACK_sspevd LAPACK_GLOBAL(sspevd,SSPEVD)
+#define LAPACK_dspevd LAPACK_GLOBAL(dspevd,DSPEVD)
+#define LAPACK_chpevd LAPACK_GLOBAL(chpevd,CHPEVD)
+#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd,ZHPEVD)
+#define LAPACK_sspevx LAPACK_GLOBAL(sspevx,SSPEVX)
+#define LAPACK_dspevx LAPACK_GLOBAL(dspevx,DSPEVX)
+#define LAPACK_chpevx LAPACK_GLOBAL(chpevx,CHPEVX)
+#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx,ZHPEVX)
+#define LAPACK_ssbev LAPACK_GLOBAL(ssbev,SSBEV)
+#define LAPACK_dsbev LAPACK_GLOBAL(dsbev,DSBEV)
+#define LAPACK_chbev LAPACK_GLOBAL(chbev,CHBEV)
+#define LAPACK_zhbev LAPACK_GLOBAL(zhbev,ZHBEV)
+#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd,SSBEVD)
+#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd,DSBEVD)
+#define LAPACK_chbevd LAPACK_GLOBAL(chbevd,CHBEVD)
+#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd,ZHBEVD)
+#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx,SSBEVX)
+#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx,DSBEVX)
+#define LAPACK_chbevx LAPACK_GLOBAL(chbevx,CHBEVX)
+#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx,ZHBEVX)
+#define LAPACK_sstev LAPACK_GLOBAL(sstev,SSTEV)
+#define LAPACK_dstev LAPACK_GLOBAL(dstev,DSTEV)
+#define LAPACK_sstevd LAPACK_GLOBAL(sstevd,SSTEVD)
+#define LAPACK_dstevd LAPACK_GLOBAL(dstevd,DSTEVD)
+#define LAPACK_sstevx LAPACK_GLOBAL(sstevx,SSTEVX)
+#define LAPACK_dstevx LAPACK_GLOBAL(dstevx,DSTEVX)
+#define LAPACK_sstevr LAPACK_GLOBAL(sstevr,SSTEVR)
+#define LAPACK_dstevr LAPACK_GLOBAL(dstevr,DSTEVR)
+#define LAPACK_sgees LAPACK_GLOBAL(sgees,SGEES)
+#define LAPACK_dgees LAPACK_GLOBAL(dgees,DGEES)
+#define LAPACK_cgees LAPACK_GLOBAL(cgees,CGEES)
+#define LAPACK_zgees LAPACK_GLOBAL(zgees,ZGEES)
+#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx,SGEESX)
+#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx,DGEESX)
+#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx,CGEESX)
+#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx,ZGEESX)
+#define LAPACK_sgeev LAPACK_GLOBAL(sgeev,SGEEV)
+#define LAPACK_dgeev LAPACK_GLOBAL(dgeev,DGEEV)
+#define LAPACK_cgeev LAPACK_GLOBAL(cgeev,CGEEV)
+#define LAPACK_zgeev LAPACK_GLOBAL(zgeev,ZGEEV)
+#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx,SGEEVX)
+#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx,DGEEVX)
+#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx,CGEEVX)
+#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx,ZGEEVX)
+#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd,SGESVD)
+#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd,DGESVD)
+#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd,CGESVD)
+#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd,ZGESVD)
+#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd,SGESDD)
+#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd,DGESDD)
+#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd,CGESDD)
+#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd,ZGESDD)
+#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv,DGEJSV)
+#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv,SGEJSV)
+#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj,DGESVJ)
+#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj,SGESVJ)
+#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
+#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
+#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
+#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
+#define LAPACK_ssygv LAPACK_GLOBAL(ssygv,SSYGV)
+#define LAPACK_dsygv LAPACK_GLOBAL(dsygv,DSYGV)
+#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
+#define LAPACK_zhegv LAPACK_GLOBAL(zhegv,ZHEGV)
+#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd,SSYGVD)
+#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd,DSYGVD)
+#define LAPACK_chegvd LAPACK_GLOBAL(chegvd,CHEGVD)
+#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd,ZHEGVD)
+#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx,SSYGVX)
+#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx,DSYGVX)
+#define LAPACK_chegvx LAPACK_GLOBAL(chegvx,CHEGVX)
+#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx,ZHEGVX)
+#define LAPACK_sspgv LAPACK_GLOBAL(sspgv,SSPGV)
+#define LAPACK_dspgv LAPACK_GLOBAL(dspgv,DSPGV)
+#define LAPACK_chpgv LAPACK_GLOBAL(chpgv,CHPGV)
+#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv,ZHPGV)
+#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd,SSPGVD)
+#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd,DSPGVD)
+#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd,CHPGVD)
+#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd,ZHPGVD)
+#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx,SSPGVX)
+#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx,DSPGVX)
+#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx,CHPGVX)
+#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx,ZHPGVX)
+#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv,SSBGV)
+#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv,DSBGV)
+#define LAPACK_chbgv LAPACK_GLOBAL(chbgv,CHBGV)
+#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv,ZHBGV)
+#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd,SSBGVD)
+#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd,DSBGVD)
+#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd,CHBGVD)
+#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd,ZHBGVD)
+#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx,SSBGVX)
+#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx,DSBGVX)
+#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx,CHBGVX)
+#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx,ZHBGVX)
+#define LAPACK_sgges LAPACK_GLOBAL(sgges,SGGES)
+#define LAPACK_dgges LAPACK_GLOBAL(dgges,DGGES)
+#define LAPACK_cgges LAPACK_GLOBAL(cgges,CGGES)
+#define LAPACK_zgges LAPACK_GLOBAL(zgges,ZGGES)
+#define LAPACK_sggesx LAPACK_GLOBAL(sggesx,SGGESX)
+#define LAPACK_dggesx LAPACK_GLOBAL(dggesx,DGGESX)
+#define LAPACK_cggesx LAPACK_GLOBAL(cggesx,CGGESX)
+#define LAPACK_zggesx LAPACK_GLOBAL(zggesx,ZGGESX)
+#define LAPACK_sggev LAPACK_GLOBAL(sggev,SGGEV)
+#define LAPACK_dggev LAPACK_GLOBAL(dggev,DGGEV)
+#define LAPACK_cggev LAPACK_GLOBAL(cggev,CGGEV)
+#define LAPACK_zggev LAPACK_GLOBAL(zggev,ZGGEV)
+#define LAPACK_sggevx LAPACK_GLOBAL(sggevx,SGGEVX)
+#define LAPACK_dggevx LAPACK_GLOBAL(dggevx,DGGEVX)
+#define LAPACK_cggevx LAPACK_GLOBAL(cggevx,CGGEVX)
+#define LAPACK_zggevx LAPACK_GLOBAL(zggevx,ZGGEVX)
+#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk,DSFRK)
+#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk,SSFRK)
+#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk,ZHFRK)
+#define LAPACK_chfrk LAPACK_GLOBAL(chfrk,CHFRK)
+#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm,DTFSM)
+#define LAPACK_stfsm LAPACK_GLOBAL(stfsm,STFSM)
+#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm,ZTFSM)
+#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm,CTFSM)
+#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp,DTFTTP)
+#define LAPACK_stfttp LAPACK_GLOBAL(stfttp,STFTTP)
+#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp,ZTFTTP)
+#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp,CTFTTP)
+#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr,DTFTTR)
+#define LAPACK_stfttr LAPACK_GLOBAL(stfttr,STFTTR)
+#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr,ZTFTTR)
+#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr,CTFTTR)
+#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf,DTPTTF)
+#define LAPACK_stpttf LAPACK_GLOBAL(stpttf,STPTTF)
+#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf,ZTPTTF)
+#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf,CTPTTF)
+#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr,DTPTTR)
+#define LAPACK_stpttr LAPACK_GLOBAL(stpttr,STPTTR)
+#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr,ZTPTTR)
+#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr,CTPTTR)
+#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf,DTRTTF)
+#define LAPACK_strttf LAPACK_GLOBAL(strttf,STRTTF)
+#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf,ZTRTTF)
+#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf,CTRTTF)
+#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp,DTRTTP)
+#define LAPACK_strttp LAPACK_GLOBAL(strttp,STRTTP)
+#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp,ZTRTTP)
+#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp,CTRTTP)
+#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp,SGEQRFP)
+#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp,DGEQRFP)
+#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp,CGEQRFP)
+#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp,ZGEQRFP)
+#define LAPACK_clacgv LAPACK_GLOBAL(clacgv,CLACGV)
+#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv,ZLACGV)
+#define LAPACK_slarnv LAPACK_GLOBAL(slarnv,SLARNV)
+#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv,DLARNV)
+#define LAPACK_clarnv LAPACK_GLOBAL(clarnv,CLARNV)
+#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv,ZLARNV)
+#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2,SGEQR2)
+#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2,DGEQR2)
+#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2,CGEQR2)
+#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2,ZGEQR2)
+#define LAPACK_slacpy LAPACK_GLOBAL(slacpy,SLACPY)
+#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy,DLACPY)
+#define LAPACK_clacpy LAPACK_GLOBAL(clacpy,CLACPY)
+#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy,ZLACPY)
+#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2)
+#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2)
+#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2)
+#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2)
+#define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP)
+#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP)
+#define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP)
+#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP)
+#define LAPACK_slange LAPACK_GLOBAL(slange,SLANGE)
+#define LAPACK_dlange LAPACK_GLOBAL(dlange,DLANGE)
+#define LAPACK_clange LAPACK_GLOBAL(clange,CLANGE)
+#define LAPACK_zlange LAPACK_GLOBAL(zlange,ZLANGE)
+#define LAPACK_clanhe LAPACK_GLOBAL(clanhe,CLANHE)
+#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe,ZLANHE)
+#define LAPACK_slansy LAPACK_GLOBAL(slansy,SLANSY)
+#define LAPACK_dlansy LAPACK_GLOBAL(dlansy,DLANSY)
+#define LAPACK_clansy LAPACK_GLOBAL(clansy,CLANSY)
+#define LAPACK_zlansy LAPACK_GLOBAL(zlansy,ZLANSY)
+#define LAPACK_slantr LAPACK_GLOBAL(slantr,SLANTR)
+#define LAPACK_dlantr LAPACK_GLOBAL(dlantr,DLANTR)
+#define LAPACK_clantr LAPACK_GLOBAL(clantr,CLANTR)
+#define LAPACK_zlantr LAPACK_GLOBAL(zlantr,ZLANTR)
+#define LAPACK_slamch LAPACK_GLOBAL(slamch,SLAMCH)
+#define LAPACK_dlamch LAPACK_GLOBAL(dlamch,DLAMCH)
+#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2,SGELQ2)
+#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2,DGELQ2)
+#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2,CGELQ2)
+#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2,ZGELQ2)
+#define LAPACK_slarfb LAPACK_GLOBAL(slarfb,SLARFB)
+#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb,DLARFB)
+#define LAPACK_clarfb LAPACK_GLOBAL(clarfb,CLARFB)
+#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb,ZLARFB)
+#define LAPACK_slarfg LAPACK_GLOBAL(slarfg,SLARFG)
+#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg,DLARFG)
+#define LAPACK_clarfg LAPACK_GLOBAL(clarfg,CLARFG)
+#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg,ZLARFG)
+#define LAPACK_slarft LAPACK_GLOBAL(slarft,SLARFT)
+#define LAPACK_dlarft LAPACK_GLOBAL(dlarft,DLARFT)
+#define LAPACK_clarft LAPACK_GLOBAL(clarft,CLARFT)
+#define LAPACK_zlarft LAPACK_GLOBAL(zlarft,ZLARFT)
+#define LAPACK_slarfx LAPACK_GLOBAL(slarfx,SLARFX)
+#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx,DLARFX)
+#define LAPACK_clarfx LAPACK_GLOBAL(clarfx,CLARFX)
+#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx,ZLARFX)
+#define LAPACK_slatms LAPACK_GLOBAL(slatms,SLATMS)
+#define LAPACK_dlatms LAPACK_GLOBAL(dlatms,DLATMS)
+#define LAPACK_clatms LAPACK_GLOBAL(clatms,CLATMS)
+#define LAPACK_zlatms LAPACK_GLOBAL(zlatms,ZLATMS)
+#define LAPACK_slag2d LAPACK_GLOBAL(slag2d,SLAG2D)
+#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s,DLAG2S)
+#define LAPACK_clag2z LAPACK_GLOBAL(clag2z,CLAG2Z)
+#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c,ZLAG2C)
+#define LAPACK_slauum LAPACK_GLOBAL(slauum,SLAUUM)
+#define LAPACK_dlauum LAPACK_GLOBAL(dlauum,DLAUUM)
+#define LAPACK_clauum LAPACK_GLOBAL(clauum,CLAUUM)
+#define LAPACK_zlauum LAPACK_GLOBAL(zlauum,ZLAUUM)
+#define LAPACK_slagge LAPACK_GLOBAL(slagge,SLAGGE)
+#define LAPACK_dlagge LAPACK_GLOBAL(dlagge,DLAGGE)
+#define LAPACK_clagge LAPACK_GLOBAL(clagge,CLAGGE)
+#define LAPACK_zlagge LAPACK_GLOBAL(zlagge,ZLAGGE)
+#define LAPACK_slaset LAPACK_GLOBAL(slaset,SLASET)
+#define LAPACK_dlaset LAPACK_GLOBAL(dlaset,DLASET)
+#define LAPACK_claset LAPACK_GLOBAL(claset,CLASET)
+#define LAPACK_zlaset LAPACK_GLOBAL(zlaset,ZLASET)
+#define LAPACK_slasrt LAPACK_GLOBAL(slasrt,SLASRT)
+#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt,DLASRT)
+#define LAPACK_slagsy LAPACK_GLOBAL(slagsy,SLAGSY)
+#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy,DLAGSY)
+#define LAPACK_clagsy LAPACK_GLOBAL(clagsy,CLAGSY)
+#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy,ZLAGSY)
+#define LAPACK_claghe LAPACK_GLOBAL(claghe,CLAGHE)
+#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe,ZLAGHE)
+#define LAPACK_slapmr LAPACK_GLOBAL(slapmr,SLAPMR)
+#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr,DLAPMR)
+#define LAPACK_clapmr LAPACK_GLOBAL(clapmr,CLAPMR)
+#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr,ZLAPMR)
+#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2,SLAPY2)
+#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2,DLAPY2)
+#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3,SLAPY3)
+#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3,DLAPY3)
+#define LAPACK_slartgp LAPACK_GLOBAL(slartgp,SLARTGP)
+#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp,DLARTGP)
+#define LAPACK_slartgs LAPACK_GLOBAL(slartgs,SLARTGS)
+#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs,DLARTGS)
+// LAPACK 3.3.0 -- grouped by routine, prefixes in s, d, c, z order
+#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd,SBBCSD)
+#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd,DBBCSD)
+#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd,CBBCSD)
+#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd,ZBBCSD)
+#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb,SORBDB)
+#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb,DORBDB)
+#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb,CUNBDB)
+#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb,ZUNBDB)
+#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd,SORCSD)
+#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd,DORCSD)
+#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd,CUNCSD)
+#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd,ZUNCSD)
+#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv,SSYCONV)
+#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv,DSYCONV)
+#define LAPACK_csyconv LAPACK_GLOBAL(csyconv,CSYCONV)
+#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv,ZSYCONV)
+#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr,SSYSWAPR)
+#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr,DSYSWAPR)
+#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr,CSYSWAPR)
+#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr,ZSYSWAPR)
+#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr,CHESWAPR)
+#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr,ZHESWAPR)
+#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2,SSYTRI2)
+#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2,DSYTRI2)
+#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2,CSYTRI2)
+#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2,ZSYTRI2)
+#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2,CHETRI2)
+#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2,ZHETRI2)
+#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x,SSYTRI2X)
+#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x,DSYTRI2X)
+#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x,CSYTRI2X)
+#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x,ZSYTRI2X)
+#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x,CHETRI2X)
+#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x,ZHETRI2X)
+#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2,SSYTRS2)
+#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2,DSYTRS2)
+#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2,CSYTRS2)
+#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2,ZSYTRS2)
+#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2,CHETRS2)
+#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2,ZHETRS2)
+// LAPACK 3.4.0 -- every routine below has all four prefixes: s, d, c, z
+#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt,SGEMQRT)
+#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt,DGEMQRT)
+#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt,CGEMQRT)
+#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt,ZGEMQRT)
+#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt,SGEQRT)
+#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt,DGEQRT)
+#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt,CGEQRT)
+#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt,ZGEQRT)
+#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2,SGEQRT2)
+#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2,DGEQRT2)
+#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2,CGEQRT2)
+#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2,ZGEQRT2)
+#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3,SGEQRT3)
+#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3,DGEQRT3)
+#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3,CGEQRT3)
+#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3,ZGEQRT3)
+#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt,STPMQRT)
+#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt,DTPMQRT)
+#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt,CTPMQRT)
+#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt,ZTPMQRT)
+#define LAPACK_stpqrt LAPACK_GLOBAL(stpqrt,STPQRT)
+#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt,DTPQRT)
+#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt,CTPQRT)
+#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt,ZTPQRT)
+#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2,STPQRT2)
+#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2,DTPQRT2)
+#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2,CTPQRT2)
+#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2,ZTPQRT2)
+#define LAPACK_stprfb LAPACK_GLOBAL(stprfb,STPRFB)
+#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb,DTPRFB)
+#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb,CTPRFB)
+#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb,ZTPRFB)
+// LAPACK 3.X.X
+#define LAPACK_csyr LAPACK_GLOBAL(csyr,CSYR)
+#define LAPACK_zsyr LAPACK_GLOBAL(zsyr,ZSYR)
+
+void LAPACK_sgetrf(lapack_int *m, lapack_int *n, float *a, lapack_int *lda,
+                   lapack_int *ipiv, lapack_int *info);
+void LAPACK_dgetrf(lapack_int *m, lapack_int *n, double *a, lapack_int *lda,
+                   lapack_int *ipiv, lapack_int *info);
+void LAPACK_cgetrf(lapack_int *m, lapack_int *n, lapack_complex_float *a,
+                   lapack_int *lda, lapack_int *ipiv, lapack_int *info);
+void LAPACK_zgetrf(lapack_int *m, lapack_int *n, lapack_complex_double *a,
+                   lapack_int *lda, lapack_int *ipiv, lapack_int *info);
+void LAPACK_sgbtrf(lapack_int *m, lapack_int *n, lapack_int *kl,
+                   lapack_int *ku, float *ab, lapack_int *ldab,
+                   lapack_int *ipiv, lapack_int *info);
+void LAPACK_dgbtrf(lapack_int *m, lapack_int *n, lapack_int *kl,
+                   lapack_int *ku, double *ab, lapack_int *ldab,
+                   lapack_int *ipiv, lapack_int *info);
+void LAPACK_cgbtrf(lapack_int *m, lapack_int *n, lapack_int *kl,
+                   lapack_int *ku, lapack_complex_float *ab, lapack_int *ldab,
+                   lapack_int *ipiv, lapack_int *info);
+void LAPACK_zgbtrf(lapack_int *m, lapack_int *n, lapack_int *kl,
+                   lapack_int *ku, lapack_complex_double *ab, lapack_int *ldab,
+                   lapack_int *ipiv, lapack_int *info);
+void LAPACK_sgttrf( lapack_int* n, float* dl, float* d, float* du, float* du2,
+                    lapack_int* ipiv, lapack_int *info ); /* SGTTRF: LU factorization, general tridiagonal matrix (float) */
+void LAPACK_dgttrf( lapack_int* n, double* dl, double* d, double* du,
+                    double* du2, lapack_int* ipiv, lapack_int *info ); /* DGTTRF: same, double */
+void LAPACK_cgttrf( lapack_int* n, lapack_complex_float* dl,
+                    lapack_complex_float* d, lapack_complex_float* du,
+                    lapack_complex_float* du2, lapack_int* ipiv,
+                    lapack_int *info ); /* CGTTRF: same, complex float */
+void LAPACK_zgttrf( lapack_int* n, lapack_complex_double* dl,
+                    lapack_complex_double* d, lapack_complex_double* du,
+                    lapack_complex_double* du2, lapack_int* ipiv,
+                    lapack_int *info ); /* ZGTTRF: same, complex double */
+void LAPACK_spotrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info ); /* SPOTRF: Cholesky factorization, symmetric positive definite matrix (float) */
+void LAPACK_dpotrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info ); /* DPOTRF: same, double */
+void LAPACK_cpotrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info ); /* CPOTRF: Cholesky, Hermitian positive definite (complex float) */
+void LAPACK_zpotrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info ); /* ZPOTRF: same, complex double */
+void LAPACK_dpstrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* piv, lapack_int* rank, double* tol,
+                    double* work, lapack_int *info ); /* DPSTRF: pivoted Cholesky, positive semidefinite matrix (double) */
+void LAPACK_spstrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* piv, lapack_int* rank, float* tol, float* work,
+                    lapack_int *info ); /* SPSTRF: same, float */
+void LAPACK_zpstrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
+                    double* tol, double* work, lapack_int *info ); /* ZPSTRF: same, complex double (tol/work are real) */
+void LAPACK_cpstrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
+                    float* tol, float* work, lapack_int *info ); /* CPSTRF: same, complex float (tol/work are real) */
+void LAPACK_dpftrf( char* transr, char* uplo, lapack_int* n, double* a,
+                    lapack_int *info ); /* DPFTRF: Cholesky factorization, rectangular full packed (RFP) storage (double) */
+void LAPACK_spftrf( char* transr, char* uplo, lapack_int* n, float* a,
+                    lapack_int *info ); /* SPFTRF: same, float */
+void LAPACK_zpftrf( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info ); /* ZPFTRF: same, complex double */
+void LAPACK_cpftrf( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info ); /* CPFTRF: same, complex float */
+void LAPACK_spptrf( char* uplo, lapack_int* n, float* ap, lapack_int *info ); /* SPPTRF: Cholesky, packed storage (float) */
+void LAPACK_dpptrf( char* uplo, lapack_int* n, double* ap, lapack_int *info ); /* DPPTRF: same, double */
+void LAPACK_cpptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int *info ); /* CPPTRF: same, complex float */
+void LAPACK_zpptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int *info ); /* ZPPTRF: same, complex double */
+void LAPACK_spbtrf( char* uplo, lapack_int* n, lapack_int* kd, float* ab,
+                    lapack_int* ldab, lapack_int *info ); /* SPBTRF: Cholesky factorization, positive definite band matrix (float) */
+void LAPACK_dpbtrf( char* uplo, lapack_int* n, lapack_int* kd, double* ab,
+                    lapack_int* ldab, lapack_int *info ); /* DPBTRF: same, double */
+void LAPACK_cpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_int *info ); /* CPBTRF: same, complex float */
+void LAPACK_zpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_int *info ); /* ZPBTRF: same, complex double */
+void LAPACK_spttrf( lapack_int* n, float* d, float* e, lapack_int *info ); /* SPTTRF: L*D*L**T factorization, positive definite tridiagonal (float) */
+void LAPACK_dpttrf( lapack_int* n, double* d, double* e, lapack_int *info ); /* DPTTRF: same, double */
+void LAPACK_cpttrf( lapack_int* n, float* d, lapack_complex_float* e,
+                    lapack_int *info ); /* CPTTRF: Hermitian case; d is real, e is complex float */
+void LAPACK_zpttrf( lapack_int* n, double* d, lapack_complex_double* e,
+                    lapack_int *info ); /* ZPTTRF: Hermitian case; d is real, e is complex double */
+void LAPACK_ssytrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* SSYTRF: Bunch-Kaufman factorization, symmetric indefinite matrix (float) */
+void LAPACK_dsytrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* DSYTRF: same, double */
+void LAPACK_csytrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* CSYTRF: same, complex symmetric (complex float) */
+void LAPACK_zsytrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* ZSYTRF: same, complex symmetric (complex double) */
+void LAPACK_chetrf( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* CHETRF: Bunch-Kaufman factorization, Hermitian indefinite matrix (complex float) */
+void LAPACK_zhetrf( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* ZHETRF: same, complex double */
+void LAPACK_ssptrf( char* uplo, lapack_int* n, float* ap, lapack_int* ipiv,
+                    lapack_int *info ); /* SSPTRF: Bunch-Kaufman factorization, symmetric indefinite, packed storage (float) */
+void LAPACK_dsptrf( char* uplo, lapack_int* n, double* ap, lapack_int* ipiv,
+                    lapack_int *info ); /* DSPTRF: same, double */
+void LAPACK_csptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int* ipiv, lapack_int *info ); /* CSPTRF: same, complex symmetric (complex float) */
+void LAPACK_zsptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int* ipiv, lapack_int *info ); /* ZSPTRF: same, complex symmetric (complex double) */
+void LAPACK_chptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int* ipiv, lapack_int *info ); /* CHPTRF: Hermitian indefinite, packed storage (complex float) */
+void LAPACK_zhptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int* ipiv, lapack_int *info ); /* ZHPTRF: same, complex double */
+void LAPACK_sgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, const lapack_int* ipiv,
+                    float* b, lapack_int* ldb, lapack_int *info ); /* SGETRS: solve using the xGETRF LU factors (float); a/ipiv are inputs, b is overwritten */
+void LAPACK_dgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const lapack_int* ipiv,
+                    double* b, lapack_int* ldb, lapack_int *info ); /* DGETRS: same, double */
+void LAPACK_cgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info ); /* CGETRS: same, complex float */
+void LAPACK_zgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info ); /* ZGETRS: same, complex double */
+void LAPACK_sgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, float* b, lapack_int* ldb,
+                    lapack_int *info ); /* SGBTRS: solve using the xGBTRF band LU factors (float) */
+void LAPACK_dgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, double* b, lapack_int* ldb,
+                    lapack_int *info ); /* DGBTRS: same, double */
+void LAPACK_cgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_float* ab,
+                    lapack_int* ldab, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CGBTRS: same, complex float */
+void LAPACK_zgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_double* ab,
+                    lapack_int* ldab, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZGBTRS: same, complex double */
+void LAPACK_sgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    const float* du2, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info ); /* SGTTRS: solve using the xGTTRF tridiagonal LU factors (float) */
+void LAPACK_dgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    const double* du2, const lapack_int* ipiv, double* b,
+                    lapack_int* ldb, lapack_int *info ); /* DGTTRS: same, double */
+void LAPACK_cgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CGTTRS: same, complex float */
+void LAPACK_zgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZGTTRS: same, complex double */
+void LAPACK_spotrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int *info ); /* SPOTRS: solve using the xPOTRF Cholesky factor (float) */
+void LAPACK_dpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, lapack_int *info ); /* DPOTRS: same, double */
+void LAPACK_cpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CPOTRS: same, complex float */
+void LAPACK_zpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZPOTRS: same, complex double */
+void LAPACK_dpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, double* b, lapack_int* ldb,
+                    lapack_int *info ); /* DPFTRS: solve using the xPFTRF Cholesky factor in RFP storage (double) */
+void LAPACK_spftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* a, float* b, lapack_int* ldb,
+                    lapack_int *info ); /* SPFTRS: same, float */
+void LAPACK_zpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info ); /* ZPFTRS: same, complex double */
+void LAPACK_cpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info ); /* CPFTRS: same, complex float */
+void LAPACK_spptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, float* b, lapack_int* ldb,
+                    lapack_int *info ); /* SPPTRS: solve using the xPPTRF packed Cholesky factor (float) */
+void LAPACK_dpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, double* b, lapack_int* ldb,
+                    lapack_int *info ); /* DPPTRS: same, double */
+void LAPACK_cpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info ); /* CPPTRS: same, complex float */
+void LAPACK_zpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info ); /* ZPPTRS: same, complex double */
+void LAPACK_spbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const float* ab, lapack_int* ldab, float* b,
+                    lapack_int* ldb, lapack_int *info ); /* SPBTRS: solve using the xPBTRF band Cholesky factor (float) */
+void LAPACK_dpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const double* ab, lapack_int* ldab, double* b,
+                    lapack_int* ldb, lapack_int *info ); /* DPBTRS: same, double */
+void LAPACK_cpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CPBTRS: same, complex float */
+void LAPACK_zpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZPBTRS: same, complex double */
+void LAPACK_spttrs( lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, float* b, lapack_int* ldb,
+                    lapack_int *info ); /* SPTTRS: solve using the xPTTRF tridiagonal factors (float); no uplo argument */
+void LAPACK_dpttrs( lapack_int* n, lapack_int* nrhs, const double* d,
+                    const double* e, double* b, lapack_int* ldb,
+                    lapack_int *info ); /* DPTTRS: same, double */
+void LAPACK_cpttrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info ); /* CPTTRS: complex float; unlike s/d, takes a leading uplo; d is real */
+void LAPACK_zpttrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZPTTRS: complex double; unlike s/d, takes a leading uplo; d is real */
+void LAPACK_ssytrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info ); /* SSYTRS: solve using the xSYTRF symmetric indefinite factors (float) */
+void LAPACK_dsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const lapack_int* ipiv,
+                    double* b, lapack_int* ldb, lapack_int *info ); /* DSYTRS: same, double */
+void LAPACK_csytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info ); /* CSYTRS: same, complex symmetric (complex float) */
+void LAPACK_zsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info ); /* ZSYTRS: same, complex symmetric (complex double) */
+void LAPACK_chetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_int *info ); /* CHETRS: solve using the xHETRF Hermitian indefinite factors (complex float) */
+void LAPACK_zhetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_int *info ); /* ZHETRS: same, complex double */
+void LAPACK_ssptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const lapack_int* ipiv, float* b,
+                    lapack_int* ldb, lapack_int *info ); /* SSPTRS: solve using the xSPTRF packed symmetric indefinite factors (float) */
+void LAPACK_dsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const lapack_int* ipiv, double* b,
+                    lapack_int* ldb, lapack_int *info ); /* DSPTRS: same, double */
+void LAPACK_csptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CSPTRS: same, complex symmetric (complex float) */
+void LAPACK_zsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZSPTRS: same, complex symmetric (complex double) */
+void LAPACK_chptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, const lapack_int* ipiv,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CHPTRS: solve using the xHPTRF packed Hermitian indefinite factors (complex float) */
+void LAPACK_zhptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, const lapack_int* ipiv,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZHPTRS: same, complex double */
+void LAPACK_strtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, lapack_int *info ); /* STRTRS: solve a triangular system (float) */
+void LAPACK_dtrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, lapack_int *info ); /* DTRTRS: same, double */
+void LAPACK_ctrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CTRTRS: same, complex float */
+void LAPACK_ztrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZTRTRS: same, complex double */
+void LAPACK_stptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* ap, float* b,
+                    lapack_int* ldb, lapack_int *info ); /* STPTRS: solve a triangular system, packed storage (float) */
+void LAPACK_dtptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* ap, double* b,
+                    lapack_int* ldb, lapack_int *info ); /* DTPTRS: same, double */
+void LAPACK_ctptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* ap,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CTPTRS: same, complex float */
+void LAPACK_ztptrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* ap,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZTPTRS: same, complex double */
+void LAPACK_stbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const float* ab,
+                    lapack_int* ldab, float* b, lapack_int* ldb,
+                    lapack_int *info ); /* STBTRS: solve a triangular band system (float) */
+void LAPACK_dtbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const double* ab,
+                    lapack_int* ldab, double* b, lapack_int* ldb,
+                    lapack_int *info ); /* DTBTRS: same, double */
+void LAPACK_ctbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info ); /* CTBTRS: same, complex float */
+void LAPACK_ztbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info ); /* ZTBTRS: same, complex double */
+void LAPACK_sgecon( char* norm, lapack_int* n, const float* a, lapack_int* lda,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* SGECON: reciprocal condition number estimate, general matrix (float); integer workspace iwork */
+void LAPACK_dgecon( char* norm, lapack_int* n, const double* a, lapack_int* lda,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* DGECON: same, double */
+void LAPACK_cgecon( char* norm, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* CGECON: complex float; real workspace rwork replaces iwork */
+void LAPACK_zgecon( char* norm, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* ZGECON: complex double; real workspace rwork replaces iwork */
+void LAPACK_sgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const float* ab, lapack_int* ldab, const lapack_int* ipiv,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* SGBCON: reciprocal condition number estimate, general band matrix (float) */
+void LAPACK_dgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const double* ab, lapack_int* ldab, const lapack_int* ipiv,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* DGBCON: same, double */
+void LAPACK_cgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* CGBCON: complex float; real workspace rwork replaces iwork */
+void LAPACK_zgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* ZGBCON: complex double; real workspace rwork replaces iwork */
+void LAPACK_sgtcon( char* norm, lapack_int* n, const float* dl, const float* d,
+                    const float* du, const float* du2, const lapack_int* ipiv,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* SGTCON: reciprocal condition number estimate, general tridiagonal matrix (float) */
+void LAPACK_dgtcon( char* norm, lapack_int* n, const double* dl,
+                    const double* d, const double* du, const double* du2,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* DGTCON: same, double */
+void LAPACK_cgtcon( char* norm, lapack_int* n, const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    lapack_int *info ); /* CGTCON: complex float; only a complex work array, no iwork/rwork */
+void LAPACK_zgtcon( char* norm, lapack_int* n, const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    lapack_int *info ); /* ZGTCON: complex double; only a complex work array, no iwork/rwork */
+void LAPACK_spocon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    float* anorm, float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* SPOCON: reciprocal condition number estimate, positive definite matrix (float) */
+void LAPACK_dpocon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* DPOCON: same, double */
+void LAPACK_cpocon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* anorm, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* CPOCON: complex float; real workspace rwork replaces iwork */
+void LAPACK_zpocon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* anorm, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* ZPOCON: complex double; real workspace rwork replaces iwork */
+void LAPACK_sppcon( char* uplo, lapack_int* n, const float* ap, float* anorm,
+                    float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* SPPCON: reciprocal condition number estimate, positive definite packed matrix (float) */
+void LAPACK_dppcon( char* uplo, lapack_int* n, const double* ap, double* anorm,
+                    double* rcond, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* DPPCON: same, double */
+void LAPACK_cppcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* CPPCON: complex float; real workspace rwork replaces iwork */
+void LAPACK_zppcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* ZPPCON: complex double; real workspace rwork replaces iwork */
+void LAPACK_spbcon( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
+                    lapack_int* ldab, float* anorm, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info ); /* SPBCON: reciprocal condition number estimate, positive definite band matrix (float) */
+void LAPACK_dpbcon( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
+                    lapack_int* ldab, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* DPBCON: same, double */
+void LAPACK_cpbcon( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    float* anorm, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* CPBCON: complex float; real workspace rwork replaces iwork */
+void LAPACK_zpbcon( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    double* anorm, double* rcond, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* ZPBCON: complex double; real workspace rwork replaces iwork */
+void LAPACK_sptcon( lapack_int* n, const float* d, const float* e, float* anorm,
+                    float* rcond, float* work, lapack_int *info ); /* SPTCON: reciprocal condition number estimate, positive definite tridiagonal matrix (float) */
+void LAPACK_dptcon( lapack_int* n, const double* d, const double* e,
+                    double* anorm, double* rcond, double* work,
+                    lapack_int *info ); /* DPTCON: same, double */
+void LAPACK_cptcon( lapack_int* n, const float* d,
+                    const lapack_complex_float* e, float* anorm, float* rcond,
+                    float* work, lapack_int *info ); /* CPTCON: complex float off-diagonal e; d and work are real */
+void LAPACK_zptcon( lapack_int* n, const double* d,
+                    const lapack_complex_double* e, double* anorm,
+                    double* rcond, double* work, lapack_int *info ); /* ZPTCON: complex double off-diagonal e; d and work are real */
+void LAPACK_ssycon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* SSYCON: reciprocal condition number estimate, symmetric indefinite matrix (float) */
+void LAPACK_dsycon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* DSYCON: same, double */
+void LAPACK_csycon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
+                    float* rcond, lapack_complex_float* work,
+                    lapack_int *info ); /* CSYCON: complex symmetric (complex float); no iwork/rwork */
+void LAPACK_zsycon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
+                    double* rcond, lapack_complex_double* work,
+                    lapack_int *info ); /* ZSYCON: complex symmetric (complex double); no iwork/rwork */
+void LAPACK_checon( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
+                    float* rcond, lapack_complex_float* work,
+                    lapack_int *info ); /* CHECON: reciprocal condition number estimate, Hermitian indefinite matrix (complex float) */
+void LAPACK_zhecon( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
+                    double* rcond, lapack_complex_double* work,
+                    lapack_int *info ); /* ZHECON: same, complex double */
+void LAPACK_sspcon( char* uplo, lapack_int* n, const float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* SSPCON: reciprocal condition number estimate, packed symmetric indefinite matrix (float) */
+void LAPACK_dspcon( char* uplo, lapack_int* n, const double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* DSPCON: same, double */
+void LAPACK_cspcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, lapack_int *info ); /* CSPCON: complex symmetric (complex float); no iwork/rwork */
+void LAPACK_zspcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, lapack_int *info ); /* ZSPCON: complex symmetric (complex double); no iwork/rwork */
+void LAPACK_chpcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_int* ipiv, float* anorm, float* rcond,
+                    lapack_complex_float* work, lapack_int *info ); /* CHPCON: reciprocal condition number estimate, packed Hermitian indefinite matrix (complex float) */
+void LAPACK_zhpcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_int* ipiv, double* anorm, double* rcond,
+                    lapack_complex_double* work, lapack_int *info ); /* ZHPCON: same, complex double */
+void LAPACK_strcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const float* a, lapack_int* lda, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info ); /* STRCON: reciprocal condition number estimate, triangular matrix (float); no anorm input */
+void LAPACK_dtrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const double* a, lapack_int* lda, double* rcond,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* DTRCON: same, double */
+void LAPACK_ctrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    float* rcond, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* CTRCON: complex float; real workspace rwork replaces iwork */
+void LAPACK_ztrcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    double* rcond, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* ZTRCON: complex double; real workspace rwork replaces iwork */
+void LAPACK_stpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const float* ap, float* rcond, float* work,
+                    lapack_int* iwork, lapack_int *info ); /* STPCON: reciprocal condition number estimate, packed triangular matrix (float) */
+void LAPACK_dtpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const double* ap, double* rcond, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* DTPCON: same, double */
+void LAPACK_ctpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_float* ap, float* rcond,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* CTPCON: complex float; real workspace rwork replaces iwork */
+void LAPACK_ztpcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    const lapack_complex_double* ap, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* ZTPCON: complex double; real workspace rwork replaces iwork */
+void LAPACK_stbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const float* ab, lapack_int* ldab,
+                    float* rcond, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* STBCON: reciprocal condition number estimate, triangular band matrix (float) */
+void LAPACK_dtbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const double* ab, lapack_int* ldab,
+                    double* rcond, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* DTBCON: same, double */
+void LAPACK_ctbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const lapack_complex_float* ab,
+                    lapack_int* ldab, float* rcond, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* CTBCON: complex float; real workspace rwork replaces iwork */
+void LAPACK_ztbcon( char* norm, char* uplo, char* diag, lapack_int* n,
+                    lapack_int* kd, const lapack_complex_double* ab,
+                    lapack_int* ldab, double* rcond,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* ZTBCON: complex double; real workspace rwork replaces iwork */
+void LAPACK_sgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, const float* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float variant; a, af, ipiv, b are const, x/ferr/berr are writable. NOTE(review): presumably xGERFS (refinement and error bounds, general matrix) -- confirm against LAPACK docs */
+void LAPACK_dgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_cgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_dgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const double* r,
+                     const double* c, const double* b, lapack_int* ldb,
+                     double* x, lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info ); /* double variant (this group is declared in d, s, z, c order); adds equed, r, c, rcond, err_bnds_* and params, and has no ferr. NOTE(review): presumably xGERFSX (extended refinement, general matrix) -- confirm against LAPACK docs */
+void LAPACK_sgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const float* r,
+                     const float* c, const float* b, lapack_int* ldb, float* x,
+                     lapack_int* ldx, float* rcond, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info ); /* float variant; same parameter list as the double one */
+void LAPACK_zgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* lapack_complex_double variant; r, c, rcond, berr, err_bnds_*, params stay double; double rwork instead of iwork */
+void LAPACK_cgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* lapack_complex_float variant; r, c, rcond, berr, err_bnds_*, params stay float; float rwork instead of iwork */
+void LAPACK_sgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                    const float* afb, lapack_int* ldafb, const lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float variant; takes kl and ku plus ab/afb with ldab/ldafb. NOTE(review): presumably xGBRFS (refinement and error bounds, general band matrix) -- confirm against LAPACK docs */
+void LAPACK_dgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                    const double* afb, lapack_int* ldafb,
+                    const lapack_int* ipiv, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_cgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_float* ab,
+                    lapack_int* ldab, const lapack_complex_float* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
+                    lapack_int* nrhs, const lapack_complex_double* ab,
+                    lapack_int* ldab, const lapack_complex_double* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_dgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, const double* ab,
+                     lapack_int* ldab, const double* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const double* b, lapack_int* ldb, double* x,
+                     lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info ); /* double variant (this group is declared in d, s, z, c order); takes kl/ku plus equed, r, c, rcond, err_bnds_* and params, and has no ferr. NOTE(review): presumably xGBRFSX (extended refinement, general band matrix) -- confirm against LAPACK docs */
+void LAPACK_sgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, const float* ab,
+                     lapack_int* ldab, const float* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info ); /* float variant; same parameter list as the double one */
+void LAPACK_zgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     const lapack_complex_double* ab, lapack_int* ldab,
+                     const lapack_complex_double* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const double* r, const double* c,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* lapack_complex_double variant; r, c, rcond, berr, err_bnds_*, params stay double; double rwork instead of iwork */
+void LAPACK_cgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     const lapack_complex_float* ab, lapack_int* ldab,
+                     const lapack_complex_float* afb, lapack_int* ldafb,
+                     const lapack_int* ipiv, const float* r, const float* c,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* lapack_complex_float variant; r, c, rcond, berr, err_bnds_*, params stay float; float rwork instead of iwork */
+void LAPACK_sgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    const float* dlf, const float* df, const float* duf,
+                    const float* du2, const lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float variant; takes dl/d/du and dlf/df/duf/du2 with no leading-dimension arguments for them. NOTE(review): presumably xGTRFS (refinement and error bounds, general tridiagonal matrix) -- confirm against LAPACK docs */
+void LAPACK_dgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    const double* dlf, const double* df, const double* duf,
+                    const double* du2, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_cgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du,
+                    const lapack_complex_float* dlf,
+                    const lapack_complex_float* df,
+                    const lapack_complex_float* duf,
+                    const lapack_complex_float* du2, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du,
+                    const lapack_complex_double* dlf,
+                    const lapack_complex_double* df,
+                    const lapack_complex_double* duf,
+                    const lapack_complex_double* du2, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_sporfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const float* af, lapack_int* ldaf,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float variant; takes uplo and has no ipiv argument. NOTE(review): presumably xPORFS (refinement and error bounds, positive definite matrix) -- confirm against LAPACK docs */
+void LAPACK_dporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_cporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_dporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const double* s, const double* b,
+                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info ); /* double variant (this group is declared in d, s, z, c order); adds equed, a single s argument, rcond, err_bnds_* and params; no ipiv and no ferr. NOTE(review): presumably xPORFSX (extended refinement, positive definite matrix) -- confirm against LAPACK docs */
+void LAPACK_sporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const float* s, const float* b,
+                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info ); /* float variant; same parameter list as the double one */
+void LAPACK_zporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const double* s, const lapack_complex_double* b,
+                     lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                     double* rcond, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* lapack_complex_double variant; s, rcond, berr, err_bnds_*, params stay double; double rwork instead of iwork */
+void LAPACK_cporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const float* s, const lapack_complex_float* b,
+                     lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* lapack_complex_float variant; s, rcond, berr, err_bnds_*, params stay float; float rwork instead of iwork */
+void LAPACK_spprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const float* afp, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                    float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float variant; ap/afp have no leading-dimension arguments and there is no ipiv. NOTE(review): presumably xPPRFS (refinement and error bounds, packed positive definite matrix) -- confirm against LAPACK docs */
+void LAPACK_dpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const double* afp, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_cpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_spbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const float* ab, lapack_int* ldab, const float* afb,
+                    lapack_int* ldafb, const float* b, lapack_int* ldb,
+                    float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* float variant; takes kd plus ab/afb with ldab/ldafb, no ipiv. NOTE(review): presumably xPBRFS (refinement and error bounds, positive definite band matrix) -- confirm against LAPACK docs */
+void LAPACK_dpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const double* ab, lapack_int* ldab, const double* afb,
+                    lapack_int* ldafb, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_cpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* afb, lapack_int* ldafb,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* afb, lapack_int* ldafb,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_sptrfs( lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, const float* df, const float* ef,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int *info ); /* float variant; has no uplo and no iwork argument, unlike the complex variants below. NOTE(review): presumably xPTRFS (refinement and error bounds, positive definite tridiagonal matrix) -- confirm against LAPACK docs */
+void LAPACK_dptrfs( lapack_int* n, lapack_int* nrhs, const double* d,
+                    const double* e, const double* df, const double* ef,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int *info ); /* double variant; same parameter list as the float one (no uplo, no iwork) */
+void LAPACK_cptrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, const float* df,
+                    const lapack_complex_float* ef,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; adds a leading uplo and a float rwork; d and df stay float while e and ef are complex */
+void LAPACK_zptrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e,
+                    const double* df, const lapack_complex_double* ef,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; adds a leading uplo and a double rwork; d and df stay double while e and ef are complex */
+void LAPACK_ssyrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
+                    lapack_int* lda, const float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const float* b, lapack_int* ldb,
+                    float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* float variant; takes uplo and a const ipiv. NOTE(review): presumably xSYRFS (refinement and error bounds, symmetric matrix) -- confirm against LAPACK docs */
+void LAPACK_dsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, const double* af,
+                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_csyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_dsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const double* a, lapack_int* lda, const double* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const double* s,
+                     const double* b, lapack_int* ldb, double* x,
+                     lapack_int* ldx, double* rcond, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info ); /* double variant (this group is declared in d, s, z, c order); adds equed, s, rcond, err_bnds_* and params, and has no ferr. NOTE(review): presumably xSYRFSX (extended refinement, symmetric matrix) -- confirm against LAPACK docs */
+void LAPACK_ssyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const float* a, lapack_int* lda, const float* af,
+                     lapack_int* ldaf, const lapack_int* ipiv, const float* s,
+                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info ); /* float variant; same parameter list as the double one */
+void LAPACK_zsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* s,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* lapack_complex_double variant; s, rcond, berr, err_bnds_*, params stay double; double rwork instead of iwork */
+void LAPACK_csyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* s,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* lapack_complex_float variant; s, rcond, berr, err_bnds_*, params stay float; float rwork instead of iwork */
+void LAPACK_cherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* lapack_complex_float variant (only complex variants are declared for this routine here); ferr/berr/rwork are float. NOTE(review): presumably xHERFS (refinement and error bounds, Hermitian matrix) -- confirm against LAPACK docs */
+void LAPACK_zherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* af, lapack_int* ldaf,
+                    const lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* lapack_complex_double variant; ferr/berr/rwork are double */
+void LAPACK_zherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_complex_double* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const double* s,
+                     const lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* lapack_complex_double variant (declared before the complex-float one); s, rcond, berr, err_bnds_*, params, rwork are double; no ferr. NOTE(review): presumably xHERFSX (extended refinement, Hermitian matrix) -- confirm against LAPACK docs */
+void LAPACK_cherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
+                     const lapack_complex_float* a, lapack_int* lda,
+                     const lapack_complex_float* af, lapack_int* ldaf,
+                     const lapack_int* ipiv, const float* s,
+                     const lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* lapack_complex_float variant; s, rcond, berr, err_bnds_*, params, rwork are float */
+void LAPACK_ssprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, const float* afp, const lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float variant; ap/afp have no leading-dimension arguments, ipiv is const. NOTE(review): presumably xSPRFS (refinement and error bounds, packed symmetric matrix) -- confirm against LAPACK docs */
+void LAPACK_dsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, const double* afp, const lapack_int* ipiv,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* double variant; same parameter list as the float one */
+void LAPACK_csprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_zsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_chprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap,
+                    const lapack_complex_float* afp, const lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant (only complex variants are declared for this routine here); ferr/berr/rwork are float. NOTE(review): presumably xHPRFS (refinement and error bounds, packed Hermitian matrix) -- confirm against LAPACK docs */
+void LAPACK_zhprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap,
+                    const lapack_complex_double* afp, const lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
+                    double* berr, lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; ferr/berr/rwork are double */
+void LAPACK_strrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, const float* x,
+                    lapack_int* ldx, float* ferr, float* berr, float* work,
+                    lapack_int* iwork, lapack_int *info ); /* float variant; x is const here, only ferr/berr/work/iwork/info are writable. NOTE(review): presumably xTRRFS (error bounds for a triangular solve) -- confirm against LAPACK docs */
+void LAPACK_dtrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, const double* x,
+                    lapack_int* ldx, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* double variant; same parameter list as the float one (x is const) */
+void LAPACK_ctrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, const lapack_complex_float* x,
+                    lapack_int* ldx, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; x is const, ferr/berr stay float, float rwork instead of iwork */
+void LAPACK_ztrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, const lapack_complex_double* x,
+                    lapack_int* ldx, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* lapack_complex_double variant; x is const, ferr/berr stay double, double rwork instead of iwork */
+void LAPACK_stprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const float* ap, const float* b,
+                    lapack_int* ldb, const float* x, lapack_int* ldx,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* cf. LAPACK STPRFS: error bounds for a packed (ap) triangular solve; x is const */
+void LAPACK_dtprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const double* ap, const double* b,
+                    lapack_int* ldb, const double* x, lapack_int* ldx,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* DTPRFS: double counterpart */
+void LAPACK_ctprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_float* ap,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* CTPRFS: lapack_complex_float; takes float rwork instead of iwork */
+void LAPACK_ztprfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* nrhs, const lapack_complex_double* ap,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    const lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* ZTPRFS: lapack_complex_double counterpart of CTPRFS */
+void LAPACK_stbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const float* ab,
+                    lapack_int* ldab, const float* b, lapack_int* ldb,
+                    const float* x, lapack_int* ldx, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* cf. LAPACK STBRFS: error bounds for a band (ab, kd) triangular solve */
+void LAPACK_dtbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs, const double* ab,
+                    lapack_int* ldab, const double* b, lapack_int* ldb,
+                    const double* x, lapack_int* ldx, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* DTBRFS: double counterpart */
+void LAPACK_ctbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
+                    float* berr, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* CTBRFS: lapack_complex_float; takes float rwork instead of iwork */
+void LAPACK_ztbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
+                    lapack_int* kd, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    const lapack_complex_double* x, lapack_int* ldx,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* ZTBRFS: lapack_complex_double counterpart of CTBRFS */
+void LAPACK_sgetri( lapack_int* n, float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* cf. LAPACK SGETRI: inverse of a general matrix from its LU factors; a is overwritten */
+void LAPACK_dgetri( lapack_int* n, double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* DGETRI: double counterpart */
+void LAPACK_cgetri( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* CGETRI: lapack_complex_float counterpart */
+void LAPACK_zgetri( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info ); /* ZGETRI: lapack_complex_double counterpart */
+void LAPACK_spotri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info ); /* cf. LAPACK SPOTRI: inverse from a Cholesky factor held in a (overwritten) */
+void LAPACK_dpotri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info ); /* DPOTRI: double counterpart */
+void LAPACK_cpotri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info ); /* CPOTRI: lapack_complex_float counterpart */
+void LAPACK_zpotri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info ); /* ZPOTRI: lapack_complex_double counterpart */
+void LAPACK_dpftri( char* transr, char* uplo, lapack_int* n, double* a,
+                    lapack_int *info ); /* cf. LAPACK DPFTRI: as xPOTRI but for RFP storage (transr given, no lda) */
+void LAPACK_spftri( char* transr, char* uplo, lapack_int* n, float* a,
+                    lapack_int *info ); /* SPFTRI: float counterpart */
+void LAPACK_zpftri( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info ); /* ZPFTRI: lapack_complex_double counterpart */
+void LAPACK_cpftri( char* transr, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info ); /* CPFTRI: lapack_complex_float counterpart */
+void LAPACK_spptri( char* uplo, lapack_int* n, float* ap, lapack_int *info ); /* cf. LAPACK SPPTRI: as xPOTRI, packed storage ap */
+void LAPACK_dpptri( char* uplo, lapack_int* n, double* ap, lapack_int *info ); /* DPPTRI: double counterpart */
+void LAPACK_cpptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    lapack_int *info ); /* CPPTRI: lapack_complex_float counterpart */
+void LAPACK_zpptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    lapack_int *info ); /* ZPPTRI: lapack_complex_double counterpart */
+void LAPACK_ssytri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* work, lapack_int *info ); /* cf. LAPACK SSYTRI: inverse of a factored symmetric indefinite matrix; ipiv is const */
+void LAPACK_dsytri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* work, lapack_int *info ); /* DSYTRI: double counterpart */
+void LAPACK_csytri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int *info ); /* CSYTRI: lapack_complex_float counterpart */
+void LAPACK_zsytri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int *info ); /* ZSYTRI: lapack_complex_double counterpart */
+void LAPACK_chetri( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int *info ); /* cf. LAPACK CHETRI: inverse of a factored Hermitian indefinite matrix */
+void LAPACK_zhetri( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int *info ); /* ZHETRI: lapack_complex_double counterpart */
+void LAPACK_ssptri( char* uplo, lapack_int* n, float* ap,
+                    const lapack_int* ipiv, float* work, lapack_int *info ); /* cf. LAPACK SSPTRI: as xSYTRI, packed storage ap */
+void LAPACK_dsptri( char* uplo, lapack_int* n, double* ap,
+                    const lapack_int* ipiv, double* work, lapack_int *info ); /* DSPTRI: double counterpart */
+void LAPACK_csptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int *info ); /* CSPTRI: lapack_complex_float counterpart */
+void LAPACK_zsptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int *info ); /* ZSPTRI: lapack_complex_double counterpart */
+void LAPACK_chptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    const lapack_int* ipiv, lapack_complex_float* work,
+                    lapack_int *info ); /* cf. LAPACK CHPTRI: as xHETRI, packed storage ap */
+void LAPACK_zhptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    const lapack_int* ipiv, lapack_complex_double* work,
+                    lapack_int *info ); /* ZHPTRI: lapack_complex_double counterpart */
+void LAPACK_strtri( char* uplo, char* diag, lapack_int* n, float* a,
+                    lapack_int* lda, lapack_int *info ); /* cf. LAPACK STRTRI: inverse of a triangular matrix; a is overwritten */
+void LAPACK_dtrtri( char* uplo, char* diag, lapack_int* n, double* a,
+                    lapack_int* lda, lapack_int *info ); /* DTRTRI: double counterpart */
+void LAPACK_ctrtri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_int *info ); /* CTRTRI: lapack_complex_float counterpart */
+void LAPACK_ztrtri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info ); /* ZTRTRI: lapack_complex_double counterpart */
+void LAPACK_dtftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    double* a, lapack_int *info ); /* cf. LAPACK DTFTRI: as xTRTRI but for RFP storage (transr given, no lda) */
+void LAPACK_stftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    float* a, lapack_int *info ); /* STFTRI: float counterpart */
+void LAPACK_ztftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* a, lapack_int *info ); /* ZTFTRI: lapack_complex_double counterpart */
+void LAPACK_ctftri( char* transr, char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* a, lapack_int *info ); /* CTFTRI: lapack_complex_float counterpart */
+void LAPACK_stptri( char* uplo, char* diag, lapack_int* n, float* ap,
+                    lapack_int *info ); /* cf. LAPACK STPTRI: as xTRTRI, packed storage ap */
+void LAPACK_dtptri( char* uplo, char* diag, lapack_int* n, double* ap,
+                    lapack_int *info ); /* DTPTRI: double counterpart */
+void LAPACK_ctptri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_float* ap, lapack_int *info ); /* CTPTRI: lapack_complex_float counterpart */
+void LAPACK_ztptri( char* uplo, char* diag, lapack_int* n,
+                    lapack_complex_double* ap, lapack_int *info ); /* ZTPTRI: lapack_complex_double counterpart */
+void LAPACK_sgeequ( lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info ); /* cf. LAPACK SGEEQU: row/column scale factors r, c to equilibrate a; a is const */
+void LAPACK_dgeequ( lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* r, double* c, double* rowcnd,
+                    double* colcnd, double* amax, lapack_int *info ); /* DGEEQU: double counterpart */
+void LAPACK_cgeequ( lapack_int* m, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info ); /* CGEEQU: lapack_complex_float a; the scale outputs stay float */
+void LAPACK_zgeequ( lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* r,
+                    double* c, double* rowcnd, double* colcnd, double* amax,
+                    lapack_int *info ); /* ZGEEQU: lapack_complex_double a; the scale outputs stay double */
+void LAPACK_dgeequb( lapack_int* m, lapack_int* n, const double* a,
+                     lapack_int* lda, double* r, double* c, double* rowcnd,
+                     double* colcnd, double* amax, lapack_int *info ); /* cf. LAPACK DGEEQUB: as xGEEQU, with scale factors restricted to powers of the radix */
+void LAPACK_sgeequb( lapack_int* m, lapack_int* n, const float* a,
+                     lapack_int* lda, float* r, float* c, float* rowcnd,
+                     float* colcnd, float* amax, lapack_int *info ); /* SGEEQUB: float counterpart */
+void LAPACK_zgeequb( lapack_int* m, lapack_int* n,
+                     const lapack_complex_double* a, lapack_int* lda, double* r,
+                     double* c, double* rowcnd, double* colcnd, double* amax,
+                     lapack_int *info ); /* ZGEEQUB: lapack_complex_double a; the scale outputs stay double */
+void LAPACK_cgeequb( lapack_int* m, lapack_int* n,
+                     const lapack_complex_float* a, lapack_int* lda, float* r,
+                     float* c, float* rowcnd, float* colcnd, float* amax,
+                     lapack_int *info ); /* CGEEQUB: lapack_complex_float a; the scale outputs stay float */
+void LAPACK_sgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* ab, lapack_int* ldab, float* r,
+                    float* c, float* rowcnd, float* colcnd, float* amax,
+                    lapack_int *info ); /* cf. LAPACK SGBEQU: as xGEEQU for band storage ab with kl, ku */
+void LAPACK_dgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* ab, lapack_int* ldab,
+                    double* r, double* c, double* rowcnd, double* colcnd,
+                    double* amax, lapack_int *info ); /* DGBEQU: double counterpart */
+void LAPACK_cgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const lapack_complex_float* ab,
+                    lapack_int* ldab, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int *info ); /* CGBEQU: lapack_complex_float ab; the scale outputs stay float */
+void LAPACK_zgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const lapack_complex_double* ab,
+                    lapack_int* ldab, double* r, double* c, double* rowcnd,
+                    double* colcnd, double* amax, lapack_int *info ); /* ZGBEQU: lapack_complex_double ab; the scale outputs stay double */
+void LAPACK_dgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const double* ab, lapack_int* ldab,
+                     double* r, double* c, double* rowcnd, double* colcnd,
+                     double* amax, lapack_int *info ); /* cf. LAPACK DGBEQUB: as xGBEQU, with scale factors restricted to powers of the radix */
+void LAPACK_sgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const float* ab, lapack_int* ldab,
+                     float* r, float* c, float* rowcnd, float* colcnd,
+                     float* amax, lapack_int *info ); /* SGBEQUB: float counterpart */
+void LAPACK_zgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const lapack_complex_double* ab,
+                     lapack_int* ldab, double* r, double* c, double* rowcnd,
+                     double* colcnd, double* amax, lapack_int *info ); /* ZGBEQUB: lapack_complex_double ab; the scale outputs stay double */
+void LAPACK_cgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, const lapack_complex_float* ab,
+                     lapack_int* ldab, float* r, float* c, float* rowcnd,
+                     float* colcnd, float* amax, lapack_int *info ); /* CGBEQUB: lapack_complex_float ab; the scale outputs stay float */
+void LAPACK_spoequ( lapack_int* n, const float* a, lapack_int* lda, float* s,
+                    float* scond, float* amax, lapack_int *info ); /* cf. LAPACK SPOEQU: scale factors s for a symmetric positive definite a; a is const */
+void LAPACK_dpoequ( lapack_int* n, const double* a, lapack_int* lda, double* s,
+                    double* scond, double* amax, lapack_int *info ); /* DPOEQU: double counterpart */
+void LAPACK_cpoequ( lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* s, float* scond, float* amax,
+                    lapack_int *info ); /* CPOEQU: lapack_complex_float a; the scale outputs stay float */
+void LAPACK_zpoequ( lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, double* s, double* scond, double* amax,
+                    lapack_int *info ); /* ZPOEQU: lapack_complex_double a; the scale outputs stay double */
+void LAPACK_dpoequb( lapack_int* n, const double* a, lapack_int* lda, double* s,
+                     double* scond, double* amax, lapack_int *info ); /* cf. LAPACK DPOEQUB: as xPOEQU, with scale factors restricted to powers of the radix */
+void LAPACK_spoequb( lapack_int* n, const float* a, lapack_int* lda, float* s,
+                     float* scond, float* amax, lapack_int *info ); /* SPOEQUB: float counterpart */
+void LAPACK_zpoequb( lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_int *info ); /* ZPOEQUB: lapack_complex_double a; the scale outputs stay double */
+void LAPACK_cpoequb( lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_int *info ); /* CPOEQUB: lapack_complex_float a; the scale outputs stay float */
+void LAPACK_sppequ( char* uplo, lapack_int* n, const float* ap, float* s,
+                    float* scond, float* amax, lapack_int *info ); /* cf. LAPACK SPPEQU: as xPOEQU, packed storage ap selected by uplo */
+void LAPACK_dppequ( char* uplo, lapack_int* n, const double* ap, double* s,
+                    double* scond, double* amax, lapack_int *info ); /* DPPEQU: double counterpart */
+void LAPACK_cppequ( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    float* s, float* scond, float* amax, lapack_int *info ); /* CPPEQU: lapack_complex_float ap; the scale outputs stay float */
+void LAPACK_zppequ( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    double* s, double* scond, double* amax, lapack_int *info ); /* ZPPEQU: lapack_complex_double ap; the scale outputs stay double */
+void LAPACK_spbequ( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
+                    lapack_int* ldab, float* s, float* scond, float* amax,
+                    lapack_int *info ); /* cf. LAPACK SPBEQU: as xPOEQU, band storage ab with kd */
+void LAPACK_dpbequ( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
+                    lapack_int* ldab, double* s, double* scond, double* amax,
+                    lapack_int *info ); /* DPBEQU: double counterpart */
+void LAPACK_cpbequ( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_float* ab, lapack_int* ldab, float* s,
+                    float* scond, float* amax, lapack_int *info ); /* CPBEQU: lapack_complex_float ab; the scale outputs stay float */
+void LAPACK_zpbequ( char* uplo, lapack_int* n, lapack_int* kd,
+                    const lapack_complex_double* ab, lapack_int* ldab,
+                    double* s, double* scond, double* amax, lapack_int *info ); /* ZPBEQU: lapack_complex_double ab; the scale outputs stay double */
+void LAPACK_dsyequb( char* uplo, lapack_int* n, const double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     double* work, lapack_int *info ); /* cf. LAPACK DSYEQUB: scale factors s for a symmetric a; unlike xPOEQUB it takes a work array */
+void LAPACK_ssyequb( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                     float* s, float* scond, float* amax, float* work,
+                     lapack_int *info ); /* SSYEQUB: float counterpart */
+void LAPACK_zsyequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_complex_double* work, lapack_int *info ); /* ZSYEQUB: lapack_complex_double a and work; s, scond, amax stay double */
+void LAPACK_csyequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_complex_float* work, lapack_int *info ); /* CSYEQUB: lapack_complex_float a and work; s, scond, amax stay float */
+void LAPACK_zheequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* s, double* scond, double* amax,
+                     lapack_complex_double* work, lapack_int *info ); /* cf. LAPACK ZHEEQUB: scale factors s for a Hermitian a; a is const */
+void LAPACK_cheequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                     lapack_int* lda, float* s, float* scond, float* amax,
+                     lapack_complex_float* work, lapack_int *info ); /* CHEEQUB: lapack_complex_float counterpart */
+void LAPACK_sgesv( lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info ); /* cf. LAPACK SGESV: solves A*X = B via LU; a, ipiv and b are all writable */
+void LAPACK_dgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info ); /* DGESV: double counterpart */
+void LAPACK_cgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* a,
+                   lapack_int* lda, lapack_int* ipiv, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info ); /* CGESV: lapack_complex_float counterpart */
+void LAPACK_zgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
+                   lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info ); /* ZGESV: lapack_complex_double counterpart */
+void LAPACK_dsgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
+                    lapack_int* ipiv, double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* work, float* swork,
+                    lapack_int* iter, lapack_int *info ); /* cf. LAPACK DSGESV: mixed-precision solve; double data with float workspace swork, plus iter */
+void LAPACK_zcgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, lapack_complex_float* swork,
+                    double* rwork, lapack_int* iter, lapack_int *info ); /* ZCGESV: complex counterpart; swork is lapack_complex_float, adds double rwork */
+void LAPACK_sgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* cf. LAPACK SGESVX: expert xGESV driver; adds af, equed with r and c, rcond, ferr, berr */
+void LAPACK_dgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* DGESVX: double counterpart */
+void LAPACK_cgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, float* r, float* c,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* CGESVX: lapack_complex_float; takes float rwork instead of iwork */
+void LAPACK_zgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* ZGESVX: lapack_complex_double counterpart of CGESVX */
+void LAPACK_dgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                     double* rcond, double* rpvgrw, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info ); /* cf. LAPACK DGESVXX: extended expert driver; no ferr, adds rpvgrw, error-bound arrays, params */
+void LAPACK_sgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info ); /* SGESVXX: float counterpart */
+void LAPACK_zgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* ZGESVXX: lapack_complex_double; takes double rwork instead of iwork */
+void LAPACK_cgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* CGESVXX: lapack_complex_float counterpart of ZGESVXX */
+void LAPACK_sgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, float* ab, lapack_int* ldab,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info ); /* cf. LAPACK SGBSV: solves A*X = B for band A stored in ab with kl, ku */
+void LAPACK_dgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, double* ab, lapack_int* ldab,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info ); /* DGBSV: double counterpart */
+void LAPACK_cgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb,
+                   lapack_int *info ); /* CGBSV: lapack_complex_float counterpart */
+void LAPACK_zgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
+                   lapack_int* nrhs, lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_int* ipiv, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info ); /* ZGBSV: lapack_complex_double counterpart */
+void LAPACK_sgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, float* ab,
+                    lapack_int* ldab, float* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* cf. LAPACK SGBSVX: expert xGBSV driver; adds afb, equed with r and c, rcond, ferr, berr */
+void LAPACK_dgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, double* ab,
+                    lapack_int* ldab, double* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, double* r, double* c,
+                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* DGBSVX: double counterpart */
+void LAPACK_cgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* afb,
+                    lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
+                    float* c, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* CGBSVX: lapack_complex_float; takes float rwork instead of iwork */
+void LAPACK_zgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, lapack_int* nrhs, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* afb,
+                    lapack_int* ldafb, lapack_int* ipiv, char* equed, double* r,
+                    double* c, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* ZGBSVX: lapack_complex_double counterpart of CGBSVX */
+void LAPACK_dgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, double* ab,
+                     lapack_int* ldab, double* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                     double* rcond, double* rpvgrw, double* berr,
+                     lapack_int* n_err_bnds, double* err_bnds_norm,
+                     double* err_bnds_comp, lapack_int* nparams, double* params,
+                     double* work, lapack_int* iwork, lapack_int *info ); /* cf. LAPACK DGBSVXX: extended expert band driver; no ferr, adds rpvgrw, error-bound arrays, params */
+void LAPACK_sgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, float* ab,
+                     lapack_int* ldab, float* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, float* r, float* c,
+                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                     float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info ); /* SGBSVXX: float counterpart */
+void LAPACK_zgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs,
+                     lapack_complex_double* ab, lapack_int* ldab,
+                     lapack_complex_double* afb, lapack_int* ldafb,
+                     lapack_int* ipiv, char* equed, double* r, double* c,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* ZGBSVXX: lapack_complex_double; takes double rwork instead of iwork */
+void LAPACK_cgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
+                     lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
+                     lapack_int* ldab, lapack_complex_float* afb,
+                     lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
+                     float* c, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* CGBSVXX: lapack_complex_float counterpart of ZGBSVXX */
+void LAPACK_sgtsv( lapack_int* n, lapack_int* nrhs, float* dl, float* d,
+                   float* du, float* b, lapack_int* ldb, lapack_int *info ); /* cf. LAPACK SGTSV: solves a tridiagonal system given as dl, d, du; all writable */
+void LAPACK_dgtsv( lapack_int* n, lapack_int* nrhs, double* dl, double* d,
+                   double* du, double* b, lapack_int* ldb, lapack_int *info ); /* DGTSV: double counterpart */
+void LAPACK_cgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* dl,
+                   lapack_complex_float* d, lapack_complex_float* du,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info ); /* CGTSV: lapack_complex_float counterpart */
+void LAPACK_zgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* dl,
+                   lapack_complex_double* d, lapack_complex_double* du,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info ); /* ZGTSV: lapack_complex_double counterpart */
+void LAPACK_sgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const float* dl, const float* d, const float* du,
+                    float* dlf, float* df, float* duf, float* du2,
+                    lapack_int* ipiv, const float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* cf. LAPACK SGTSVX: expert xGTSV driver; dl, d, du, b are const, factors go to dlf, df, duf, du2 */
+void LAPACK_dgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const double* dl, const double* d, const double* du,
+                    double* dlf, double* df, double* duf, double* du2,
+                    lapack_int* ipiv, const double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* rcond, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* DGTSVX: double counterpart */
+void LAPACK_cgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* dl,
+                    const lapack_complex_float* d,
+                    const lapack_complex_float* du, lapack_complex_float* dlf,
+                    lapack_complex_float* df, lapack_complex_float* duf,
+                    lapack_complex_float* du2, lapack_int* ipiv,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* CGTSVX: lapack_complex_float; takes float rwork instead of iwork */
+void LAPACK_zgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* dl,
+                    const lapack_complex_double* d,
+                    const lapack_complex_double* du, lapack_complex_double* dlf,
+                    lapack_complex_double* df, lapack_complex_double* duf,
+                    lapack_complex_double* du2, lapack_int* ipiv,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* ZGTSVX: lapack_complex_double counterpart of CGTSVX */
+void LAPACK_sposv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb,
+                   lapack_int *info ); /* float; a and b are non-const. Presumably LAPACK xPOSV, positive-definite solve with uplo selecting the stored triangle -- TODO confirm in LAPACK docs */
+void LAPACK_dposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb,
+                   lapack_int *info ); /* double variant */
+void LAPACK_cposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info ); /* complex float variant */
+void LAPACK_zposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info ); /* complex double variant */
+void LAPACK_dsposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* work, float* swork,
+                    lapack_int* iter, lapack_int *info ); /* double data with a float workspace (swork) and an iter argument. Presumably LAPACK DSPOSV, mixed-precision positive-definite solve -- TODO confirm in LAPACK docs */
+void LAPACK_zcposv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, lapack_complex_float* swork,
+                    double* rwork, lapack_int* iter, lapack_int *info ); /* complex double data with complex float swork, plus a real rwork */
+void LAPACK_sposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                    char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* float; adds fact, af/ldaf, equed, s, x and rcond/ferr/berr to the xPOSV argument list. Presumably LAPACK xPOSVX (expert driver) -- TODO confirm in LAPACK docs */
+void LAPACK_dposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                    char* equed, double* s, double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* rcond, double* ferr,
+                    double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* double variant */
+void LAPACK_cposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf, char* equed,
+                    float* s, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* complex float variant; s stays real, real rwork takes the place of iwork */
+void LAPACK_zposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf, char* equed,
+                    double* s, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* complex double variant; s stays real, real rwork takes the place of iwork */
+void LAPACK_dposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     char* equed, double* s, double* b, lapack_int* ldb,
+                     double* x, lapack_int* ldx, double* rcond, double* rpvgrw,
+                     double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info ); /* double; has rpvgrw, n_err_bnds, err_bnds_norm/err_bnds_comp and nparams/params, and no ferr. Presumably LAPACK xPOSVXX (extra-precise expert driver) -- TODO confirm in LAPACK docs */
+void LAPACK_sposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                     lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
+                     lapack_int* n_err_bnds, float* err_bnds_norm,
+                     float* err_bnds_comp, lapack_int* nparams, float* params,
+                     float* work, lapack_int* iwork, lapack_int *info ); /* float variant */
+void LAPACK_zposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf, char* equed,
+                     double* s, lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* complex double variant; bounds and params stay real, real rwork takes the place of iwork */
+void LAPACK_cposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf, char* equed,
+                     float* s, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* complex float variant; bounds and params stay real, real rwork takes the place of iwork */
+void LAPACK_sppsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
+                   float* b, lapack_int* ldb, lapack_int *info ); /* float; matrix is passed as ap with no leading dimension. Presumably LAPACK xPPSV, packed-storage positive-definite solve -- TODO confirm in LAPACK docs */
+void LAPACK_dppsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
+                   double* b, lapack_int* ldb, lapack_int *info ); /* double variant */
+void LAPACK_cppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info ); /* complex float variant */
+void LAPACK_zppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info ); /* complex double variant */
+void LAPACK_sppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    float* ap, float* afp, char* equed, float* s, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float; adds fact, afp, equed, s, x and rcond/ferr/berr to the xPPSV argument list. Presumably LAPACK xPPSVX (expert packed driver) -- TODO confirm in LAPACK docs */
+void LAPACK_dppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    double* ap, double* afp, char* equed, double* s, double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* double variant */
+void LAPACK_cppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_float* ap, lapack_complex_float* afp,
+                    char* equed, float* s, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* complex float variant; s stays real, real rwork takes the place of iwork */
+void LAPACK_zppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    lapack_complex_double* ap, lapack_complex_double* afp,
+                    char* equed, double* s, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* complex double variant; s stays real, real rwork takes the place of iwork */
+void LAPACK_spbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   float* ab, lapack_int* ldab, float* b, lapack_int* ldb,
+                   lapack_int *info ); /* float; takes kd and ab/ldab. Presumably LAPACK xPBSV, band positive-definite solve with kd as the bandwidth parameter -- TODO confirm in LAPACK docs */
+void LAPACK_dpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   double* ab, lapack_int* ldab, double* b, lapack_int* ldb,
+                   lapack_int *info ); /* double variant */
+void LAPACK_cpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info ); /* complex float variant */
+void LAPACK_zpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   lapack_complex_double* ab, lapack_int* ldab,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info ); /* complex double variant */
+void LAPACK_spbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, float* ab, lapack_int* ldab, float* afb,
+                    lapack_int* ldafb, char* equed, float* s, float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* float; adds fact, afb/ldafb, equed, s, x and rcond/ferr/berr to the xPBSV argument list. Presumably LAPACK xPBSVX (expert band driver) -- TODO confirm in LAPACK docs */
+void LAPACK_dpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, double* ab, lapack_int* ldab, double* afb,
+                    lapack_int* ldafb, char* equed, double* s, double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* iwork,
+                    lapack_int *info ); /* double variant */
+void LAPACK_cpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* afb,
+                    lapack_int* ldafb, char* equed, float* s,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* complex float variant; s stays real, real rwork takes the place of iwork */
+void LAPACK_zpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_int* nrhs, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* afb,
+                    lapack_int* ldafb, char* equed, double* s,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* complex double variant; s stays real, real rwork takes the place of iwork */
+void LAPACK_sptsv( lapack_int* n, lapack_int* nrhs, float* d, float* e,
+                   float* b, lapack_int* ldb, lapack_int *info ); /* float; takes two vectors d and e and no uplo. Presumably LAPACK xPTSV, positive-definite tridiagonal solve (d diagonal, e off-diagonal) -- TODO confirm in LAPACK docs */
+void LAPACK_dptsv( lapack_int* n, lapack_int* nrhs, double* d, double* e,
+                   double* b, lapack_int* ldb, lapack_int *info ); /* double variant */
+void LAPACK_cptsv( lapack_int* n, lapack_int* nrhs, float* d,
+                   lapack_complex_float* e, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int *info ); /* complex float variant; d is real (float*) while e and b are complex */
+void LAPACK_zptsv( lapack_int* n, lapack_int* nrhs, double* d,
+                   lapack_complex_double* e, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int *info ); /* complex double variant; d is real (double*) while e and b are complex */
+void LAPACK_sptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const float* e, float* df, float* ef, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int *info ); /* float; d/e/b are const, df/ef and x are separate non-const arrays, no iwork. Presumably LAPACK xPTSVX (expert driver) -- TODO confirm in LAPACK docs */
+void LAPACK_dptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const double* e, double* df, double* ef,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                    double* work, lapack_int *info ); /* double variant */
+void LAPACK_cptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
+                    const lapack_complex_float* e, float* df,
+                    lapack_complex_float* ef, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* complex float variant; d and df are real, and a real rwork is added */
+void LAPACK_zptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
+                    const double* d, const lapack_complex_double* e, double* df,
+                    lapack_complex_double* ef, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* complex double variant; d and df are real, and a real rwork is added */
+void LAPACK_ssysv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, lapack_int* ipiv, float* b, lapack_int* ldb,
+                   float* work, lapack_int* lwork, lapack_int *info ); /* float; takes ipiv and a work/lwork pair. Presumably LAPACK xSYSV, symmetric indefinite solve with pivot indices in ipiv -- TODO confirm in LAPACK docs */
+void LAPACK_dsysv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, lapack_int* ipiv, double* b,
+                   lapack_int* ldb, double* work, lapack_int* lwork,
+                   lapack_int *info ); /* double variant */
+void LAPACK_csysv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info ); /* complex float variant */
+void LAPACK_zsysv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info ); /* complex double variant */
+void LAPACK_ssysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* a, lapack_int* lda, float* af,
+                    lapack_int* ldaf, lapack_int* ipiv, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                    float* ferr, float* berr, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info ); /* float; a and b are const, af and x are separate non-const arrays, both lwork and iwork present. Presumably LAPACK xSYSVX (expert driver) -- TODO confirm in LAPACK docs */
+void LAPACK_dsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* a, lapack_int* lda, double* af,
+                    lapack_int* ldaf, lapack_int* ipiv, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* ferr, double* berr, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info ); /* double variant */
+void LAPACK_csysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info ); /* complex float variant; real rwork takes the place of iwork */
+void LAPACK_zsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info ); /* complex double variant; real rwork takes the place of iwork */
+void LAPACK_dsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s, double* b,
+                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params, double* work,
+                     lapack_int* iwork, lapack_int *info ); /* double; has ipiv, equed/s, rpvgrw, err_bnds_norm/err_bnds_comp and nparams/params, and no lwork or ferr. Presumably LAPACK xSYSVXX (extra-precise expert driver) -- TODO confirm in LAPACK docs */
+void LAPACK_ssysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s, float* b,
+                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params, float* work,
+                     lapack_int* iwork, lapack_int *info ); /* float variant */
+void LAPACK_zsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* complex double variant; bounds and params stay real, real rwork takes the place of iwork */
+void LAPACK_csysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* complex float variant; bounds and params stay real, real rwork takes the place of iwork */
+void LAPACK_chesv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int *info ); /* complex float; same argument list as LAPACK_csysv. Presumably LAPACK xHESV, Hermitian indefinite solve -- TODO confirm in LAPACK docs */
+void LAPACK_zhesv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int *info ); /* complex double variant; only c/z forms are declared here */
+void LAPACK_chesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info ); /* complex float; same argument list as LAPACK_csysvx (const a/b, real rwork). Presumably LAPACK xHESVX, expert Hermitian driver -- TODO confirm in LAPACK docs */
+void LAPACK_zhesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info ); /* complex double variant */
+void LAPACK_zhesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, double* s,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                     double* err_bnds_norm, double* err_bnds_comp,
+                     lapack_int* nparams, double* params,
+                     lapack_complex_double* work, double* rwork,
+                     lapack_int *info ); /* complex double; same argument list as LAPACK_zsysvxx. Presumably LAPACK xHESVXX, extra-precise expert Hermitian driver -- TODO confirm in LAPACK docs */
+void LAPACK_chesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* af, lapack_int* ldaf,
+                     lapack_int* ipiv, char* equed, float* s,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                     float* err_bnds_norm, float* err_bnds_comp,
+                     lapack_int* nparams, float* params,
+                     lapack_complex_float* work, float* rwork,
+                     lapack_int *info ); /* complex float variant */
+void LAPACK_sspsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
+                   lapack_int* ipiv, float* b, lapack_int* ldb,
+                   lapack_int *info ); /* float; matrix passed as ap with no leading dimension, plus ipiv. Presumably LAPACK xSPSV, packed symmetric indefinite solve -- TODO confirm in LAPACK docs */
+void LAPACK_dspsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
+                   lapack_int* ipiv, double* b, lapack_int* ldb,
+                   lapack_int *info ); /* double variant */
+void LAPACK_cspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info ); /* complex float variant */
+void LAPACK_zspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info ); /* complex double variant */
+void LAPACK_sspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const float* ap, float* afp, lapack_int* ipiv,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr, float* work,
+                    lapack_int* iwork, lapack_int *info ); /* float; ap and b are const, afp and x are separate non-const arrays. Presumably LAPACK xSPSVX (expert packed symmetric driver) -- TODO confirm in LAPACK docs */
+void LAPACK_dspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const double* ap, double* afp, lapack_int* ipiv,
+                    const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* double variant */
+void LAPACK_cspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* afp,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* complex float variant; real rwork takes the place of iwork */
+void LAPACK_zspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* afp,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* complex double variant; real rwork takes the place of iwork */
+void LAPACK_chpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* ap, lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info ); /* complex float; same argument list as LAPACK_cspsv. Presumably LAPACK xHPSV, packed Hermitian indefinite solve -- TODO confirm in LAPACK docs */
+void LAPACK_zhpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* ap, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int *info ); /* complex double variant */
+void LAPACK_chpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_float* ap, lapack_complex_float* afp,
+                    lapack_int* ipiv, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* ferr, float* berr,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* complex float; same argument list as LAPACK_cspsvx. Presumably LAPACK xHPSVX, expert packed Hermitian driver -- TODO confirm in LAPACK docs */
+void LAPACK_zhpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
+                    const lapack_complex_double* ap, lapack_complex_double* afp,
+                    lapack_int* ipiv, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* ferr, double* berr,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* complex double variant */
+void LAPACK_sgeqrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* float; m-by-n style arguments with tau and a work/lwork pair. Presumably LAPACK xGEQRF, QR factorization -- TODO confirm in LAPACK docs */
+void LAPACK_dgeqrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* double variant */
+void LAPACK_cgeqrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* complex float variant */
+void LAPACK_zgeqrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* complex double variant */
+void LAPACK_sgeqpf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* jpvt, float* tau, float* work,
+                    lapack_int *info ); /* float; adds jpvt and has no lwork argument. Presumably LAPACK xGEQPF, column-pivoted QR. NOTE(review): LAPACK docs appear to deprecate it in favor of xGEQP3 -- confirm */
+void LAPACK_dgeqpf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* jpvt, double* tau, double* work,
+                    lapack_int *info ); /* double variant */
+void LAPACK_cgeqpf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    float* rwork, lapack_int *info ); /* complex float variant; adds a real rwork */
+void LAPACK_zgeqpf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    double* rwork, lapack_int *info ); /* complex double variant; adds a real rwork */
+void LAPACK_sgeqp3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* jpvt, float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* float; same as LAPACK_sgeqpf plus an lwork argument. Presumably LAPACK xGEQP3, column-pivoted QR -- TODO confirm in LAPACK docs */
+void LAPACK_dgeqp3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* jpvt, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* double variant */
+void LAPACK_cgeqp3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int *info ); /* complex float variant; adds a real rwork after lwork */
+void LAPACK_zgeqp3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int *info ); /* complex double variant; adds a real rwork after lwork */
+void LAPACK_sorgqr( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* float; tau is const, a is non-const. Presumably LAPACK xORGQR, builds the orthogonal Q from k QR reflectors -- TODO confirm in LAPACK docs */
+void LAPACK_dorgqr( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* double variant; only real (s/d) forms use the "or" name */
+void LAPACK_sormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* float; a and tau are const, c is non-const. Presumably LAPACK xORMQR, applies Q from a QR factorization to c per side/trans -- TODO confirm in LAPACK docs */
+void LAPACK_dormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* double variant */
+void LAPACK_cungqr( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* complex float; same argument shape as LAPACK_sorgqr. Presumably LAPACK xUNGQR, builds the unitary Q from QR reflectors -- TODO confirm in LAPACK docs */
+void LAPACK_zungqr( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* complex double variant */
+void LAPACK_cunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* complex float; same argument shape as LAPACK_sormqr. Presumably LAPACK xUNMQR, applies the unitary Q to c -- TODO confirm in LAPACK docs */
+void LAPACK_zunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* complex double variant */
+void LAPACK_sgelqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SGELQF: LQ factorization of the m-by-n matrix a (float); every argument is passed by pointer */
+void LAPACK_dgelqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DGELQF: LQ factorization, double */
+void LAPACK_cgelqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CGELQF: LQ factorization, single complex */
+void LAPACK_zgelqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZGELQF: LQ factorization, double complex */
+void LAPACK_sorglq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORGLQ: build the explicit Q of an LQ factorization from its reflectors (a, tau); every argument is passed by pointer */
+void LAPACK_dorglq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORGLQ: double-precision form of SORGLQ */
+void LAPACK_sormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORMLQ: multiply c by Q or Q^T of an LQ factorization (side/trans select the product) */
+void LAPACK_dormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORMLQ: double-precision form of SORMLQ */
+void LAPACK_cunglq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK CUNGLQ: complex (unitary) counterpart of xORGLQ */
+void LAPACK_zunglq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNGLQ: double-complex counterpart of xORGLQ */
+void LAPACK_cunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CUNMLQ: multiply c by Q or Q^H of an LQ factorization */
+void LAPACK_zunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNMLQ: double-complex form of CUNMLQ */
+void LAPACK_sgeqlf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SGEQLF: QL factorization of the m-by-n matrix a (float); every argument is passed by pointer */
+void LAPACK_dgeqlf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DGEQLF: QL factorization, double */
+void LAPACK_cgeqlf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CGEQLF: QL factorization, single complex */
+void LAPACK_zgeqlf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZGEQLF: QL factorization, double complex */
+void LAPACK_sorgql( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORGQL: build the explicit Q of a QL factorization from its reflectors (a, tau); every argument is passed by pointer */
+void LAPACK_dorgql( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORGQL: double-precision form of SORGQL */
+void LAPACK_cungql( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK CUNGQL: complex (unitary) counterpart of xORGQL */
+void LAPACK_zungql( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNGQL: double-complex counterpart of xORGQL */
+void LAPACK_sormql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORMQL: multiply c by Q or Q^T of a QL factorization (side/trans select the product) */
+void LAPACK_dormql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORMQL: double-precision form of SORMQL */
+void LAPACK_cunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CUNMQL: multiply c by Q or Q^H of a QL factorization */
+void LAPACK_zunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNMQL: double-complex form of CUNMQL */
+void LAPACK_sgerqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SGERQF: RQ factorization of the m-by-n matrix a (float); every argument is passed by pointer */
+void LAPACK_dgerqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DGERQF: RQ factorization, double */
+void LAPACK_cgerqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CGERQF: RQ factorization, single complex */
+void LAPACK_zgerqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZGERQF: RQ factorization, double complex */
+void LAPACK_sorgrq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORGRQ: build the explicit Q of an RQ factorization from its reflectors (a, tau); every argument is passed by pointer */
+void LAPACK_dorgrq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORGRQ: double-precision form of SORGRQ */
+void LAPACK_cungrq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK CUNGRQ: complex (unitary) counterpart of xORGRQ */
+void LAPACK_zungrq( lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNGRQ: double-complex counterpart of xORGRQ */
+void LAPACK_sormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORMRQ: multiply c by Q or Q^T of an RQ factorization (side/trans select the product) */
+void LAPACK_dormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORMRQ: double-precision form of SORMRQ */
+void LAPACK_cunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CUNMRQ: multiply c by Q or Q^H of an RQ factorization */
+void LAPACK_zunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNMRQ: double-complex form of CUNMRQ */
+void LAPACK_stzrzf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK STZRZF: RZ factorization reducing an upper trapezoidal matrix a to upper triangular form (float); every argument is passed by pointer */
+void LAPACK_dtzrzf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DTZRZF: RZ factorization, double */
+void LAPACK_ctzrzf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CTZRZF: RZ factorization, single complex */
+void LAPACK_ztzrzf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZTZRZF: RZ factorization, double complex */
+void LAPACK_sormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SORMRZ: multiply c by Q or Q^T of an RZ factorization (as produced by xTZRZF); every argument is passed by pointer */
+void LAPACK_dormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DORMRZ: double-precision form of SORMRZ */
+void LAPACK_cunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CUNMRZ: multiply c by Q or Q^H of an RZ factorization */
+void LAPACK_zunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* k, lapack_int* l,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK ZUNMRZ: double-complex form of CUNMRZ */
+void LAPACK_sggqrf( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
+                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
+                    float* taub, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SGGQRF: generalized QR factorization of the matrix pair (a, b); note the dimension order n, m, p; every argument is passed by pointer */
+void LAPACK_dggqrf( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
+                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
+                    double* taub, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DGGQRF: generalized QR factorization, double */
+void LAPACK_cggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* taua, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* taub,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CGGQRF: generalized QR factorization, single complex */
+void LAPACK_zggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* taua, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* taub,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZGGQRF: generalized QR factorization, double complex */
+void LAPACK_sggrqf( lapack_int* m, lapack_int* p, lapack_int* n, float* a,
+                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
+                    float* taub, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SGGRQF: generalized RQ factorization of the matrix pair (a, b); note the dimension order m, p, n; every argument is passed by pointer */
+void LAPACK_dggrqf( lapack_int* m, lapack_int* p, lapack_int* n, double* a,
+                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
+                    double* taub, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DGGRQF: generalized RQ factorization, double */
+void LAPACK_cggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* taua, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* taub,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CGGRQF: generalized RQ factorization, single complex */
+void LAPACK_zggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* taua, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* taub,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZGGRQF: generalized RQ factorization, double complex */
+void LAPACK_sgebrd( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* d, float* e, float* tauq, float* taup, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SGEBRD: reduce the general m-by-n matrix a to bidiagonal form (d, e) with reflector scalars tauq/taup; every argument is passed by pointer */
+void LAPACK_dgebrd( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* d, double* e, double* tauq, double* taup,
+                    double* work, lapack_int* lwork, lapack_int *info ); /* LAPACK DGEBRD: bidiagonal reduction, double */
+void LAPACK_cgebrd( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, float* d, float* e,
+                    lapack_complex_float* tauq, lapack_complex_float* taup,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CGEBRD: bidiagonal reduction, single complex; d and e are real (float) */
+void LAPACK_zgebrd( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, double* d, double* e,
+                    lapack_complex_double* tauq, lapack_complex_double* taup,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZGEBRD: bidiagonal reduction, double complex; d and e are real (double) */
+void LAPACK_sgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, float* ab, lapack_int* ldab,
+                    float* d, float* e, float* q, lapack_int* ldq, float* pt,
+                    lapack_int* ldpt, float* c, lapack_int* ldc, float* work,
+                    lapack_int *info ); /* LAPACK SGBBRD: reduce the band matrix ab to bidiagonal form (d, e); every argument is passed by pointer */
+void LAPACK_dgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, double* ab,
+                    lapack_int* ldab, double* d, double* e, double* q,
+                    lapack_int* ldq, double* pt, lapack_int* ldpt, double* c,
+                    lapack_int* ldc, double* work, lapack_int *info ); /* LAPACK DGBBRD: band-to-bidiagonal reduction, double */
+void LAPACK_cgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, lapack_complex_float* ab,
+                    lapack_int* ldab, float* d, float* e,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* pt, lapack_int* ldpt,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* LAPACK CGBBRD: band-to-bidiagonal reduction, single complex; takes an extra real workspace rwork */
+void LAPACK_zgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
+                    lapack_int* kl, lapack_int* ku, lapack_complex_double* ab,
+                    lapack_int* ldab, double* d, double* e,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* pt, lapack_int* ldpt,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info ); /* LAPACK ZGBBRD: band-to-bidiagonal reduction, double complex; takes an extra real workspace rwork */
+void LAPACK_sorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    float* a, lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORGBR: build the explicit Q or P^T (chosen by vect) of a bidiagonal reduction; every argument is passed by pointer */
+void LAPACK_dorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    double* a, lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORGBR: double-precision form of SORGBR */
+void LAPACK_sormbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SORMBR: multiply c by Q or P (chosen by vect) of a bidiagonal reduction, optionally transposed */
+void LAPACK_dormbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DORMBR: double-precision form of SORMBR */
+void LAPACK_cungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK CUNGBR: complex (unitary) counterpart of xORGBR */
+void LAPACK_zungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNGBR: double-complex counterpart of xORGBR */
+void LAPACK_cunmbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CUNMBR: complex (unitary) counterpart of xORMBR */
+void LAPACK_zunmbr( char* vect, char* side, char* trans, lapack_int* m,
+                    lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK ZUNMBR: double-complex counterpart of xORMBR */
+void LAPACK_sbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                    float* vt, lapack_int* ldvt, float* u, lapack_int* ldu,
+                    float* c, lapack_int* ldc, float* work, lapack_int *info ); /* LAPACK SBDSQR: SVD of the bidiagonal matrix (d, e) by QR iteration, updating vt, u and c in place; every argument is passed by pointer */
+void LAPACK_dbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                    double* vt, lapack_int* ldvt, double* u, lapack_int* ldu,
+                    double* c, lapack_int* ldc, double* work,
+                    lapack_int *info ); /* LAPACK DBDSQR: double-precision form of SBDSQR */
+void LAPACK_cbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* c, lapack_int* ldc, float* work,
+                    lapack_int *info ); /* LAPACK CBDSQR: as SBDSQR with complex vt/u/c; d, e and work stay real (float) */
+void LAPACK_zbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
+                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* c, lapack_int* ldc, double* work,
+                    lapack_int *info ); /* LAPACK ZBDSQR: as DBDSQR with complex vt/u/c; d, e and work stay real (double) */
+void LAPACK_sbdsdc( char* uplo, char* compq, lapack_int* n, float* d, float* e,
+                    float* u, lapack_int* ldu, float* vt, lapack_int* ldvt,
+                    float* q, lapack_int* iq, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* LAPACK SBDSDC: SVD of the bidiagonal matrix (d, e) by divide and conquer; every argument is passed by pointer */
+void LAPACK_dbdsdc( char* uplo, char* compq, lapack_int* n, double* d,
+                    double* e, double* u, lapack_int* ldu, double* vt,
+                    lapack_int* ldvt, double* q, lapack_int* iq, double* work,
+                    lapack_int* iwork, lapack_int *info ); /* LAPACK DBDSDC: double-precision form of SBDSDC */
+void LAPACK_ssytrd( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    float* d, float* e, float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SSYTRD: reduce the symmetric matrix a to tridiagonal form (d, e), reflector scalars in tau; every argument is passed by pointer */
+void LAPACK_dsytrd( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    double* d, double* e, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DSYTRD: double-precision form of SSYTRD */
+void LAPACK_sorgtr( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    const float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK SORGTR: build the explicit Q of a symmetric tridiagonal reduction from (a, tau) */
+void LAPACK_dorgtr( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    const double* tau, double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK DORGTR: double-precision form of SORGTR */
+void LAPACK_sormtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* tau, float* c, lapack_int* ldc, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK SORMTR: multiply c by Q or Q^T of a symmetric tridiagonal reduction */
+void LAPACK_dormtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* tau, double* c, lapack_int* ldc, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK DORMTR: double-precision form of SORMTR */
+void LAPACK_chetrd( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, float* d, float* e,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK CHETRD: reduce the Hermitian matrix a to real tridiagonal form (d, e are float) */
+void LAPACK_zhetrd( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, double* d, double* e,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info ); /* LAPACK ZHETRD: double-complex form of CHETRD (d, e are double) */
+void LAPACK_cungtr( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CUNGTR: build the explicit unitary Q of a Hermitian tridiagonal reduction */
+void LAPACK_zungtr( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNGTR: double-complex form of CUNGTR */
+void LAPACK_cunmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK CUNMTR: multiply c by Q or Q^H of a Hermitian tridiagonal reduction */
+void LAPACK_zunmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* LAPACK ZUNMTR: double-complex form of CUNMTR */
+void LAPACK_ssptrd( char* uplo, lapack_int* n, float* ap, float* d, float* e,
+                    float* tau, lapack_int *info ); /* LAPACK SSPTRD: tridiagonal reduction of a symmetric matrix held in packed storage (ap); no workspace argument; every argument is passed by pointer */
+void LAPACK_dsptrd( char* uplo, lapack_int* n, double* ap, double* d, double* e,
+                    double* tau, lapack_int *info ); /* LAPACK DSPTRD: double-precision form of SSPTRD */
+void LAPACK_sopgtr( char* uplo, lapack_int* n, const float* ap,
+                    const float* tau, float* q, lapack_int* ldq, float* work,
+                    lapack_int *info ); /* LAPACK SOPGTR: build the explicit Q of a packed tridiagonal reduction into q */
+void LAPACK_dopgtr( char* uplo, lapack_int* n, const double* ap,
+                    const double* tau, double* q, lapack_int* ldq, double* work,
+                    lapack_int *info ); /* LAPACK DOPGTR: double-precision form of SOPGTR */
+void LAPACK_sopmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const float* ap, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int *info ); /* LAPACK SOPMTR: multiply c by Q or Q^T of a packed tridiagonal reduction */
+void LAPACK_dopmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const double* ap, const double* tau,
+                    double* c, lapack_int* ldc, double* work,
+                    lapack_int *info ); /* LAPACK DOPMTR: double-precision form of SOPMTR */
+void LAPACK_chptrd( char* uplo, lapack_int* n, lapack_complex_float* ap,
+                    float* d, float* e, lapack_complex_float* tau,
+                    lapack_int *info ); /* LAPACK CHPTRD: tridiagonal reduction of a packed Hermitian matrix (d, e are float) */
+void LAPACK_zhptrd( char* uplo, lapack_int* n, lapack_complex_double* ap,
+                    double* d, double* e, lapack_complex_double* tau,
+                    lapack_int *info ); /* LAPACK ZHPTRD: double-complex form of CHPTRD (d, e are double) */
+void LAPACK_cupgtr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_complex_float* tau, lapack_complex_float* q,
+                    lapack_int* ldq, lapack_complex_float* work,
+                    lapack_int *info ); /* LAPACK CUPGTR: build the explicit unitary Q of a packed Hermitian tridiagonal reduction */
+void LAPACK_zupgtr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_complex_double* tau, lapack_complex_double* q,
+                    lapack_int* ldq, lapack_complex_double* work,
+                    lapack_int *info ); /* LAPACK ZUPGTR: double-complex form of CUPGTR */
+void LAPACK_cupmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* ap,
+                    const lapack_complex_float* tau, lapack_complex_float* c,
+                    lapack_int* ldc, lapack_complex_float* work,
+                    lapack_int *info ); /* LAPACK CUPMTR: multiply c by Q or Q^H of a packed Hermitian tridiagonal reduction */
+void LAPACK_zupmtr( char* side, char* uplo, char* trans, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* ap,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int *info ); /* LAPACK ZUPMTR: double-complex form of CUPMTR */
+void LAPACK_ssbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    float* ab, lapack_int* ldab, float* d, float* e, float* q,
+                    lapack_int* ldq, float* work, lapack_int *info ); /* LAPACK SSBTRD: reduce the symmetric band matrix ab to tridiagonal form (d, e); vect controls whether q is formed/updated; every argument is passed by pointer */
+void LAPACK_dsbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    double* ab, lapack_int* ldab, double* d, double* e,
+                    double* q, lapack_int* ldq, double* work,
+                    lapack_int *info ); /* LAPACK DSBTRD: double-precision form of SSBTRD */
+void LAPACK_chbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_float* ab, lapack_int* ldab, float* d,
+                    float* e, lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, lapack_int *info ); /* LAPACK CHBTRD: Hermitian band matrix to real tridiagonal form (d, e are float) */
+void LAPACK_zhbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
+                    lapack_complex_double* ab, lapack_int* ldab, double* d,
+                    double* e, lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, lapack_int *info ); /* LAPACK ZHBTRD: double-complex form of CHBTRD (d, e are double) */
+void LAPACK_ssterf( lapack_int* n, float* d, float* e, lapack_int *info ); /* LAPACK SSTERF: eigenvalues only of the symmetric tridiagonal matrix (d, e); every argument is passed by pointer */
+void LAPACK_dsterf( lapack_int* n, double* d, double* e, lapack_int *info ); /* LAPACK DSTERF: double-precision form of SSTERF */
+void LAPACK_ssteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int *info ); /* LAPACK SSTEQR: eigenvalues, and eigenvectors if requested via compz, of the symmetric tridiagonal matrix (d, e) by implicit QL/QR; every argument is passed by pointer */
+void LAPACK_dsteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int *info ); /* LAPACK DSTEQR: double-precision form of SSTEQR */
+void LAPACK_csteqr( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz, float* work,
+                    lapack_int *info ); /* LAPACK CSTEQR: as SSTEQR with a complex eigenvector matrix z; d, e and work stay real (float) */
+void LAPACK_zsteqr( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz, double* work,
+                    lapack_int *info ); /* LAPACK ZSTEQR: as DSTEQR with a complex eigenvector matrix z; d, e and work stay real (double) */
+void LAPACK_sstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    lapack_int* nzc, lapack_int* isuppz, lapack_logical* tryrac,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* LAPACK SSTEMR: selected eigenvalues (w) and optionally eigenvectors (z) of the symmetric tridiagonal matrix (d, e), MRRR algorithm; every argument is passed by pointer */
+void LAPACK_dstemr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, lapack_int* m, double* w, double* z,
+                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
+                    lapack_logical* tryrac, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* LAPACK DSTEMR: double-precision form of SSTEMR */
+void LAPACK_cstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
+                    lapack_logical* tryrac, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* LAPACK CSTEMR: as SSTEMR with a complex eigenvector matrix z; all other arrays stay real (float) */
+void LAPACK_zstemr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* nzc,
+                    lapack_int* isuppz, lapack_logical* tryrac, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info ); /* LAPACK ZSTEMR: as DSTEMR with a complex eigenvector matrix z; all other arrays stay real (double) */
+void LAPACK_sstedc( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* LAPACK SSTEDC: eigenvalues, and eigenvectors if requested via compz, of the symmetric tridiagonal matrix (d, e) by divide and conquer; every argument is passed by pointer */
+void LAPACK_dstedc( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* LAPACK DSTEDC: double-precision form of SSTEDC */
+void LAPACK_cstedc( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info ); /* LAPACK CSTEDC: complex z and work, plus a separate real workspace rwork/lrwork */
+void LAPACK_zstedc( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* LAPACK ZSTEDC: double-complex form of CSTEDC */
+void LAPACK_sstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, lapack_int* isuppz, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info ); /* xSTEGR: abstol; no nzc/tryrac */
+void LAPACK_dstegr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, lapack_int* isuppz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info );
+void LAPACK_cstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_int* isuppz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_zstegr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_int* isuppz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_spteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int *info );
+void LAPACK_dpteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int *info );
+void LAPACK_cpteqr( char* compz, lapack_int* n, float* d, float* e,
+                    lapack_complex_float* z, lapack_int* ldz, float* work,
+                    lapack_int *info ); /* xPTEQR: work real in c/z too */
+void LAPACK_zpteqr( char* compz, lapack_int* n, double* d, double* e,
+                    lapack_complex_double* z, lapack_int* ldz, double* work,
+                    lapack_int *info );
+void LAPACK_sstebz( char* range, char* order, lapack_int* n, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    const float* d, const float* e, lapack_int* m,
+                    lapack_int* nsplit, float* w, lapack_int* iblock,
+                    lapack_int* isplit, float* work, lapack_int* iwork,
+                    lapack_int *info ); /* xSTEBZ: s/d only; d,e const */
+void LAPACK_dstebz( char* range, char* order, lapack_int* n, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    const double* d, const double* e, lapack_int* m,
+                    lapack_int* nsplit, double* w, lapack_int* iblock,
+                    lapack_int* isplit, double* work, lapack_int* iwork,
+                    lapack_int *info );
+void LAPACK_sstein( lapack_int* n, const float* d, const float* e,
+                    lapack_int* m, const float* w, const lapack_int* iblock,
+                    const lapack_int* isplit, float* z, lapack_int* ldz,
+                    float* work, lapack_int* iwork, lapack_int* ifailv,
+                    lapack_int *info ); /* d,e,w,iblock,isplit are const */
+void LAPACK_dstein( lapack_int* n, const double* d, const double* e,
+                    lapack_int* m, const double* w, const lapack_int* iblock,
+                    const lapack_int* isplit, double* z, lapack_int* ldz,
+                    double* work, lapack_int* iwork, lapack_int* ifailv,
+                    lapack_int *info );
+void LAPACK_cstein( lapack_int* n, const float* d, const float* e,
+                    lapack_int* m, const float* w, const lapack_int* iblock,
+                    const lapack_int* isplit, lapack_complex_float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifailv, lapack_int *info );
+void LAPACK_zstein( lapack_int* n, const double* d, const double* e,
+                    lapack_int* m, const double* w, const lapack_int* iblock,
+                    const lapack_int* isplit, lapack_complex_double* z,
+                    lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifailv, lapack_int *info );
+void LAPACK_sdisna( char* job, lapack_int* m, lapack_int* n, const float* d,
+                    float* sep, lapack_int *info ); /* s/d only here */
+void LAPACK_ddisna( char* job, lapack_int* m, lapack_int* n, const double* d,
+                    double* sep, lapack_int *info );
+void LAPACK_ssygst( lapack_int* itype, char* uplo, lapack_int* n, float* a,
+                    lapack_int* lda, const float* b, lapack_int* ldb,
+                    lapack_int *info ); /* s/d SYGST, c/z HEGST; b const */
+void LAPACK_dsygst( lapack_int* itype, char* uplo, lapack_int* n, double* a,
+                    lapack_int* lda, const double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_chegst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_zhegst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int *info );
+void LAPACK_sspgst( lapack_int* itype, char* uplo, lapack_int* n, float* ap,
+                    const float* bp, lapack_int *info );
+void LAPACK_dspgst( lapack_int* itype, char* uplo, lapack_int* n, double* ap,
+                    const double* bp, lapack_int *info );
+void LAPACK_chpgst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, const lapack_complex_float* bp,
+                    lapack_int *info ); /* ap,bp take no ld args; bp const */
+void LAPACK_zhpgst( lapack_int* itype, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, const lapack_complex_double* bp,
+                    lapack_int *info );
+void LAPACK_ssbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, float* ab, lapack_int* ldab,
+                    const float* bb, lapack_int* ldbb, float* x,
+                    lapack_int* ldx, float* work, lapack_int *info );
+void LAPACK_dsbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, double* ab, lapack_int* ldab,
+                    const double* bb, lapack_int* ldbb, double* x,
+                    lapack_int* ldx, double* work, lapack_int *info );
+void LAPACK_chbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                    const lapack_complex_float* bb, lapack_int* ldbb,
+                    lapack_complex_float* x, lapack_int* ldx,
+                    lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* c/z xHBGST add real rwork */
+void LAPACK_zhbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                    const lapack_complex_double* bb, lapack_int* ldbb,
+                    lapack_complex_double* x, lapack_int* ldx,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_spbstf( char* uplo, lapack_int* n, lapack_int* kb, float* bb,
+                    lapack_int* ldbb, lapack_int *info );
+void LAPACK_dpbstf( char* uplo, lapack_int* n, lapack_int* kb, double* bb,
+                    lapack_int* ldbb, lapack_int *info );
+void LAPACK_cpbstf( char* uplo, lapack_int* n, lapack_int* kb,
+                    lapack_complex_float* bb, lapack_int* ldbb,
+                    lapack_int *info ); /* xPBSTF: bb is non-const */
+void LAPACK_zpbstf( char* uplo, lapack_int* n, lapack_int* kb,
+                    lapack_complex_double* bb, lapack_int* ldbb,
+                    lapack_int *info );
+void LAPACK_sgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
+                    lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* xGEHRD: tau,work typed like a */
+void LAPACK_dgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
+                    lapack_int* lda, double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_cgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
+                    lapack_int* lda, const float* tau, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* tau const */
+void LAPACK_dorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
+                    lapack_int* lda, const double* tau, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, const float* a,
+                    lapack_int* lda, const float* tau, float* c,
+                    lapack_int* ldc, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* a,tau const; c non-const */
+void LAPACK_dormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, const double* a,
+                    lapack_int* lda, const double* tau, double* c,
+                    lapack_int* ldc, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_cunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_zunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                    lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info ); /* same arg order as s/d xORGHR */
+void LAPACK_cunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_complex_float* tau, lapack_complex_float* c,
+                    lapack_int* ldc, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* a,tau const */
+void LAPACK_zunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_complex_double* tau, lapack_complex_double* c,
+                    lapack_int* ldc, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_sgebal( char* job, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ilo, lapack_int* ihi, float* scale,
+                    lapack_int *info ); /* xGEBAL: scale real in c/z too */
+void LAPACK_dgebal( char* job, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ilo, lapack_int* ihi, double* scale,
+                    lapack_int *info );
+void LAPACK_cgebal( char* job, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, lapack_int *info );
+void LAPACK_zgebal( char* job, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, lapack_int *info );
+void LAPACK_sgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* scale, lapack_int* m,
+                    float* v, lapack_int* ldv, lapack_int *info );
+void LAPACK_dgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* scale, lapack_int* m,
+                    double* v, lapack_int* ldv, lapack_int *info );
+void LAPACK_cgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* scale, lapack_int* m,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_int *info ); /* scale const, real in c/z */
+void LAPACK_zgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* scale, lapack_int* m,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_shseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, float* h, lapack_int* ldh, float* wr,
+                    float* wi, float* z, lapack_int* ldz, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* s/d: wr,wi */
+void LAPACK_dhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, double* h, lapack_int* ldh, double* wr,
+                    double* wi, double* z, lapack_int* ldz, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_chseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_float* h, lapack_int* ldh,
+                    lapack_complex_float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, lapack_int *info ); /* c/z: w */
+void LAPACK_zhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_double* h, lapack_int* ldh,
+                    lapack_complex_double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_shsein( char* job, char* eigsrc, char* initv,
+                    lapack_logical* select, lapack_int* n, const float* h,
+                    lapack_int* ldh, float* wr, const float* wi, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_dhsein( char* job, char* eigsrc, char* initv,
+                    lapack_logical* select, lapack_int* n, const double* h,
+                    lapack_int* ldh, double* wr, const double* wi, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_chsein( char* job, char* eigsrc, char* initv,
+                    const lapack_logical* select, lapack_int* n, /* const only in c/z */
+                    const lapack_complex_float* h, lapack_int* ldh,
+                    lapack_complex_float* w, lapack_complex_float* vl,
+                    lapack_int* ldvl, lapack_complex_float* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_float* work, float* rwork, /* rwork: c/z only */
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_zhsein( char* job, char* eigsrc, char* initv,
+                    const lapack_logical* select, lapack_int* n,
+                    const lapack_complex_double* h, lapack_int* ldh,
+                    lapack_complex_double* w, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
+void LAPACK_strevc( char* side, char* howmny, lapack_logical* select,
+                    lapack_int* n, const float* t, lapack_int* ldt, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int *info ); /* s/d: select mutable, t const */
+void LAPACK_dtrevc( char* side, char* howmny, lapack_logical* select,
+                    lapack_int* n, const double* t, lapack_int* ldt, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int *info );
+void LAPACK_ctrevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* c/z: select const, t mutable */
+void LAPACK_ztrevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* vl, lapack_int* ldvl,
+                    lapack_complex_double* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_strsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* t, lapack_int* ldt,
+                    const float* vl, lapack_int* ldvl, const float* vr,
+                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
+                    lapack_int* m, float* work, lapack_int* ldwork,
+                    lapack_int* iwork, lapack_int *info ); /* s/d: iwork */
+void LAPACK_dtrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* t, lapack_int* ldt,
+                    const double* vl, lapack_int* ldvl, const double* vr,
+                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
+                    lapack_int* m, double* work, lapack_int* ldwork,
+                    lapack_int* iwork, lapack_int *info );
+void LAPACK_ctrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* t,
+                    lapack_int* ldt, const lapack_complex_float* vl,
+                    lapack_int* ldvl, const lapack_complex_float* vr,
+                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work,
+                    lapack_int* ldwork, float* rwork, lapack_int *info );
+void LAPACK_ztrsna( char* job, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* t,
+                    lapack_int* ldt, const lapack_complex_double* vl,
+                    lapack_int* ldvl, const lapack_complex_double* vr,
+                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
+                    lapack_int* m, lapack_complex_double* work,
+                    lapack_int* ldwork, double* rwork, lapack_int *info );
+void LAPACK_strexc( char* compq, lapack_int* n, float* t, lapack_int* ldt,
+                    float* q, lapack_int* ldq, lapack_int* ifst,
+                    lapack_int* ilst, float* work, lapack_int *info );
+void LAPACK_dtrexc( char* compq, lapack_int* n, double* t, lapack_int* ldt,
+                    double* q, lapack_int* ldq, lapack_int* ifst,
+                    lapack_int* ilst, double* work, lapack_int *info );
+void LAPACK_ctrexc( char* compq, lapack_int* n, lapack_complex_float* t,
+                    lapack_int* ldt, lapack_complex_float* q, lapack_int* ldq,
+                    lapack_int* ifst, lapack_int* ilst, lapack_int *info ); /* c/z: no work arg */
+void LAPACK_ztrexc( char* compq, lapack_int* n, lapack_complex_double* t,
+                    lapack_int* ldt, lapack_complex_double* q, lapack_int* ldq,
+                    lapack_int* ifst, lapack_int* ilst, lapack_int *info );
+void LAPACK_strsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, float* t, lapack_int* ldt, float* q,
+                    lapack_int* ldq, float* wr, float* wi, lapack_int* m,
+                    float* s, float* sep, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_dtrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, double* t, lapack_int* ldt, double* q,
+                    lapack_int* ldq, double* wr, double* wi, lapack_int* m,
+                    double* s, double* sep, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
+void LAPACK_ctrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* w, lapack_int* m, float* s,
+                    float* sep, lapack_complex_float* work, lapack_int* lwork,
+                    lapack_int *info ); /* c/z: w; no iwork/liwork */
+void LAPACK_ztrsen( char* job, char* compq, const lapack_logical* select,
+                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* w, lapack_int* m, double* s,
+                    double* sep, lapack_complex_double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_strsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda,
+                    const float* b, lapack_int* ldb, float* c, lapack_int* ldc,
+                    float* scale, lapack_int *info ); /* a,b const */
+void LAPACK_dtrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda,
+                    const double* b, lapack_int* ldb, double* c,
+                    lapack_int* ldc, double* scale, lapack_int *info );
+void LAPACK_ctrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* c, lapack_int* ldc,
+                    float* scale, lapack_int *info ); /* scale is real */
+void LAPACK_ztrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* c, lapack_int* ldc,
+                    double* scale, lapack_int *info );
+void LAPACK_sgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* q, lapack_int* ldq, float* z,
+                    lapack_int* ldz, lapack_int *info );
+void LAPACK_dgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* q, lapack_int* ldq, double* z,
+                    lapack_int* ldz, lapack_int *info );
+void LAPACK_cgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_int *info ); /* xGGHRD: a,b,q,z non-const */
+void LAPACK_zgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_int *info );
+void LAPACK_sggbal( char* job, lapack_int* n, float* a, lapack_int* lda,
+                    float* b, lapack_int* ldb, lapack_int* ilo, lapack_int* ihi,
+                    float* lscale, float* rscale, float* work,
+                    lapack_int *info ); /* scales, work real in c/z too */
+void LAPACK_dggbal( char* job, lapack_int* n, double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, lapack_int* ilo,
+                    lapack_int* ihi, double* lscale, double* rscale,
+                    double* work, lapack_int *info );
+void LAPACK_cggbal( char* job, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_int* ilo, lapack_int* ihi, float* lscale,
+                    float* rscale, float* work, lapack_int *info );
+void LAPACK_zggbal( char* job, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_int* ilo, lapack_int* ihi, double* lscale,
+                    double* rscale, double* work, lapack_int *info );
+void LAPACK_sggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* lscale, const float* rscale,
+                    lapack_int* m, float* v, lapack_int* ldv,
+                    lapack_int *info ); /* lscale,rscale const */
+void LAPACK_dggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* lscale, const double* rscale,
+                    lapack_int* m, double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_cggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const float* lscale, const float* rscale,
+                    lapack_int* m, lapack_complex_float* v, lapack_int* ldv,
+                    lapack_int *info ); /* scales real; only v complex */
+void LAPACK_zggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
+                    lapack_int* ihi, const double* lscale, const double* rscale,
+                    lapack_int* m, lapack_complex_double* v, lapack_int* ldv,
+                    lapack_int *info );
+void LAPACK_shgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, float* h, lapack_int* ldh,
+                    float* t, lapack_int* ldt, float* alphar, float* alphai,
+                    float* beta, float* q, lapack_int* ldq, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int *info ); /* s/d: alphar,alphai,beta */
+void LAPACK_dhgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, double* h,
+                    lapack_int* ldh, double* t, lapack_int* ldt, double* alphar,
+                    double* alphai, double* beta, double* q, lapack_int* ldq,
+                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int *info );
+void LAPACK_chgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, lapack_complex_float* h,
+                    lapack_int* ldh, lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info ); /* c/z: alpha,beta; adds rwork */
+void LAPACK_zhgeqz( char* job, char* compq, char* compz, lapack_int* n,
+                    lapack_int* ilo, lapack_int* ihi, lapack_complex_double* h,
+                    lapack_int* ldh, lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info );
+void LAPACK_stgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const float* s, lapack_int* lds,
+                    const float* p, lapack_int* ldp, float* vl,
+                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, float* work,
+                    lapack_int *info ); /* select,s,p const */
+void LAPACK_dtgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const double* s, lapack_int* lds,
+                    const double* p, lapack_int* ldp, double* vl,
+                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
+                    lapack_int* mm, lapack_int* m, double* work,
+                    lapack_int *info );
+void LAPACK_ctgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_float* s,
+                    lapack_int* lds, const lapack_complex_float* p,
+                    lapack_int* ldp, lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
+                    lapack_int* m, lapack_complex_float* work, float* rwork,
+                    lapack_int *info ); /* c/z add real rwork */
+void LAPACK_ztgevc( char* side, char* howmny, const lapack_logical* select,
+                    lapack_int* n, const lapack_complex_double* s,
+                    lapack_int* lds, const lapack_complex_double* p,
+                    lapack_int* ldp, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int *info );
+void LAPACK_stgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* q, lapack_int* ldq, float* z, lapack_int* ldz,
+                    lapack_int* ifst, lapack_int* ilst, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* s/d only */
+void LAPACK_dtgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* q, lapack_int* ldq, double* z, lapack_int* ldz,
+                    lapack_int* ifst, lapack_int* ilst, double* work,
+                    lapack_int* lwork, lapack_int *info );
+void LAPACK_ctgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* z, lapack_int* ldz, lapack_int* ifst,
+                    lapack_int* ilst, lapack_int *info ); /* no work/lwork */
+void LAPACK_ztgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* z, lapack_int* ldz, lapack_int* ifst,
+                    lapack_int* ilst, lapack_int *info );
+void LAPACK_stgsen(lapack_int* ijob, lapack_logical* wantq,
+                   lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, float* a, lapack_int* lda, float* b,
+                   lapack_int* ldb, float* alphar, float* alphai, float* beta,
+                   float* q, lapack_int* ldq, float* z, lapack_int* ldz,
+                   lapack_int* m, float* pl, float* pr, float* dif,
+                   float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_dtgsen(lapack_int* ijob, lapack_logical* wantq,
+                   lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, double* alphar, double* alphai,
+                   double* beta, double* q, lapack_int* ldq, double* z,
+                   lapack_int* ldz, lapack_int* m, double* pl, double* pr,
+                   double* dif, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_ctgsen(lapack_int* ijob, lapack_logical* wantq,
+                   lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* q, lapack_int* ldq,
+                   lapack_complex_float* z, lapack_int* ldz, lapack_int* m,
+                   float* pl, float* pr, float* dif,
+                   lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_ztgsen(lapack_int* ijob, lapack_logical* wantq,
+                   lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* q, lapack_int* ldq,
+                   lapack_complex_double* z, lapack_int* ldz, lapack_int* m,
+                   double* pl, double* pr, double* dif,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_stgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                   const float* a, lapack_int* lda, const float* b,
+                   lapack_int* ldb, float* c, lapack_int* ldc, const float* d,
+                   lapack_int* ldd, const float* e, lapack_int* lde, float* f,
+                   lapack_int* ldf, float* scale, float* dif, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                   const double* a, lapack_int* lda, const double* b,
+                   lapack_int* ldb, double* c, lapack_int* ldc,
+                   const double* d, lapack_int* ldd, const double* e,
+                   lapack_int* lde, double* f, lapack_int* ldf, double* scale,
+                   double* dif, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_ctgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                   const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* c, lapack_int* ldc,
+                   const lapack_complex_float* d, lapack_int* ldd,
+                   const lapack_complex_float* e, lapack_int* lde,
+                   lapack_complex_float* f, lapack_int* ldf, float* scale,
+                   float* dif, lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_ztgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
+                   const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* c, lapack_int* ldc,
+                   const lapack_complex_double* d, lapack_int* ldd,
+                   const lapack_complex_double* e, lapack_int* lde,
+                   lapack_complex_double* f, lapack_int* ldf, double* scale,
+                   double* dif, lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_stgsna(char* job, char* howmny, const lapack_logical* select,
+                   lapack_int* n, const float* a, lapack_int* lda,
+                   const float* b, lapack_int* ldb, const float* vl,
+                   lapack_int* ldvl, const float* vr, lapack_int* ldvr,
+                   float* s, float* dif, lapack_int* mm, lapack_int* m,
+                   float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dtgsna(char* job, char* howmny, const lapack_logical* select,
+                   lapack_int* n, const double* a, lapack_int* lda,
+                   const double* b, lapack_int* ldb, const double* vl,
+                   lapack_int* ldvl, const double* vr, lapack_int* ldvr,
+                   double* s, double* dif, lapack_int* mm, lapack_int* m,
+                   double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_ctgsna(char* job, char* howmny, const lapack_logical* select,
+                   lapack_int* n, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* b,
+                   lapack_int* ldb, const lapack_complex_float* vl,
+                   lapack_int* ldvl, const lapack_complex_float* vr,
+                   lapack_int* ldvr, float* s, float* dif, lapack_int* mm,
+                   lapack_int* m, lapack_complex_float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_ztgsna(char* job, char* howmny, const lapack_logical* select,
+                   lapack_int* n, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* b,
+                   lapack_int* ldb, const lapack_complex_double* vl,
+                   lapack_int* ldvl, const lapack_complex_double* vr,
+                   lapack_int* ldvr, double* s, double* dif, lapack_int* mm,
+                   lapack_int* m, lapack_complex_double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_sggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, float* a, lapack_int* lda,
+                   float* b, lapack_int* ldb, float* tola, float* tolb,
+                   lapack_int* k, lapack_int* l, float* u, lapack_int* ldu,
+                   float* v, lapack_int* ldv, float* q, lapack_int* ldq,
+                   lapack_int* iwork, float* tau, float* work,
+                   lapack_int* info);
+void LAPACK_dggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, double* a, lapack_int* lda,
+                   double* b, lapack_int* ldb, double* tola, double* tolb,
+                   lapack_int* k, lapack_int* l, double* u, lapack_int* ldu,
+                   double* v, lapack_int* ldv, double* q, lapack_int* ldq,
+                   lapack_int* iwork, double* tau, double* work,
+                   lapack_int* info);
+void LAPACK_cggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                   float* tola, float* tolb, lapack_int* k, lapack_int* l,
+                   lapack_complex_float* u, lapack_int* ldu,
+                   lapack_complex_float* v, lapack_int* ldv,
+                   lapack_complex_float* q, lapack_int* ldq, lapack_int* iwork,
+                   float* rwork, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                   double* tola, double* tolb, lapack_int* k, lapack_int* l,
+                   lapack_complex_double* u, lapack_int* ldu,
+                   lapack_complex_double* v, lapack_int* ldv,
+                   lapack_complex_double* q, lapack_int* ldq,
+                   lapack_int* iwork, double* rwork,
+                   lapack_complex_double* tau, lapack_complex_double* work,
+                   lapack_int* info);
+void LAPACK_stgsja(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* tola, float* tolb, float* alpha, float* beta,
+                   float* u, lapack_int* ldu, float* v, lapack_int* ldv,
+                   float* q, lapack_int* ldq, float* work, lapack_int* ncycle,
+                   lapack_int* info);
+void LAPACK_dtgsja(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* tola, double* tolb, double* alpha, double* beta,
+                   double* u, lapack_int* ldu, double* v, lapack_int* ldv,
+                   double* q, lapack_int* ldq, double* work,
+                   lapack_int* ncycle, lapack_int* info);
+void LAPACK_ctgsja(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* tola,
+                   float* tolb, float* alpha, float* beta,
+                   lapack_complex_float* u, lapack_int* ldu,
+                   lapack_complex_float* v, lapack_int* ldv,
+                   lapack_complex_float* q, lapack_int* ldq,
+                   lapack_complex_float* work, lapack_int* ncycle,
+                   lapack_int* info);
+void LAPACK_ztgsja(char* jobu, char* jobv, char* jobq, lapack_int* m,
+                   lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* tola,
+                   double* tolb, double* alpha, double* beta,
+                   lapack_complex_double* u, lapack_int* ldu,
+                   lapack_complex_double* v, lapack_int* ldv,
+                   lapack_complex_double* q, lapack_int* ldq,
+                   lapack_complex_double* work, lapack_int* ncycle,
+                   lapack_int* info);
+void LAPACK_sgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                  float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                  float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                  double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                  double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                  lapack_complex_float* a, lapack_int* lda,
+                  lapack_complex_float* b, lapack_int* ldb,
+                  lapack_complex_float* work, lapack_int* lwork,
+                  lapack_int* info);
+void LAPACK_zgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                  lapack_complex_double* a, lapack_int* lda,
+                  lapack_complex_double* b, lapack_int* ldb,
+                  lapack_complex_double* work, lapack_int* lwork,
+                  lapack_int* info);
+void LAPACK_sgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb,
+                   lapack_int* jpvt, float* rcond, lapack_int* rank,
+                   float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb,
+                   lapack_int* jpvt, double* rcond, lapack_int* rank,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* jpvt,
+                   float* rcond, lapack_int* rank, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* jpvt,
+                   double* rcond, lapack_int* rank,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* info);
+void LAPACK_sgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* s,
+                   float* rcond, lapack_int* rank, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_dgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* s,
+                   double* rcond, lapack_int* rank, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_cgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* s,
+                   float* rcond, lapack_int* rank, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* s,
+                   double* rcond, lapack_int* rank,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* info);
+void LAPACK_sgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* s,
+                   float* rcond, lapack_int* rank, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* s,
+                   double* rcond, lapack_int* rank, double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_cgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* s,
+                   float* rcond, lapack_int* rank, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_zgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* s,
+                   double* rcond, lapack_int* rank,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_sgglse(lapack_int* m, lapack_int* n, lapack_int* p, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* c,
+                   float* d, float* x, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dgglse(lapack_int* m, lapack_int* n, lapack_int* p, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* c,
+                   double* d, double* x, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cgglse(lapack_int* m, lapack_int* n, lapack_int* p,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* c, lapack_complex_float* d,
+                   lapack_complex_float* x, lapack_complex_float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_zgglse(lapack_int* m, lapack_int* n, lapack_int* p,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* c, lapack_complex_double* d,
+                   lapack_complex_double* x, lapack_complex_double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_sggglm(lapack_int* n, lapack_int* m, lapack_int* p, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* d,
+                   float* x, float* y, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dggglm(lapack_int* n, lapack_int* m, lapack_int* p, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* d,
+                   double* x, double* y, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cggglm(lapack_int* n, lapack_int* m, lapack_int* p,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* d, lapack_complex_float* x,
+                   lapack_complex_float* y, lapack_complex_float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_zggglm(lapack_int* n, lapack_int* m, lapack_int* p,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* d, lapack_complex_double* x,
+                   lapack_complex_double* y, lapack_complex_double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_ssyev(char* jobz, char* uplo, lapack_int* n, float* a,
+                  lapack_int* lda, float* w, float* work, lapack_int* lwork,
+                  lapack_int* info);
+void LAPACK_dsyev(char* jobz, char* uplo, lapack_int* n, double* a,
+                  lapack_int* lda, double* w, double* work, lapack_int* lwork,
+                  lapack_int* info);
+void LAPACK_cheev(char* jobz, char* uplo, lapack_int* n,
+                  lapack_complex_float* a, lapack_int* lda, float* w,
+                  lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                  lapack_int* info);
+void LAPACK_zheev(char* jobz, char* uplo, lapack_int* n,
+                  lapack_complex_double* a, lapack_int* lda, double* w,
+                  lapack_complex_double* work, lapack_int* lwork,
+                  double* rwork, lapack_int* info);
+void LAPACK_ssyevd(char* jobz, char* uplo, lapack_int* n, float* a,
+                   lapack_int* lda, float* w, float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dsyevd(char* jobz, char* uplo, lapack_int* n, double* a,
+                   lapack_int* lda, double* w, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_cheevd(char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, float* w,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_zheevd(char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, double* w,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_ssyevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   float* a, lapack_int* lda, float* vl, float* vu,
+                   lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_dsyevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   double* a, lapack_int* lda, double* vl, double* vu,
+                   lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, double* z, lapack_int* ldz,
+                   double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_cheevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, float* vl,
+                   float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_zheevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, double* vl,
+                   double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, lapack_complex_double* z,
+                   lapack_int* ldz, lapack_complex_double* work,
+                   lapack_int* lwork, double* rwork, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_ssyevr(char* jobz, char* range, char* uplo, lapack_int* n,
+                   float* a, lapack_int* lda, float* vl, float* vu,
+                   lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, float* z, lapack_int* ldz,
+                   lapack_int* isuppz, float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dsyevr(char* jobz, char* range, char* uplo, lapack_int* n,
+                   double* a, lapack_int* lda, double* vl, double* vu,
+                   lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, double* z, lapack_int* ldz,
+                   lapack_int* isuppz, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_cheevr(char* jobz, char* range, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, float* vl,
+                   float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_int* isuppz,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_zheevr(char* jobz, char* range, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, double* vl,
+                   double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, lapack_complex_double* z,
+                   lapack_int* ldz, lapack_int* isuppz,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_sspev(char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
+                  float* z, lapack_int* ldz, float* work, lapack_int* info);
+void LAPACK_dspev(char* jobz, char* uplo, lapack_int* n, double* ap, double* w,
+                  double* z, lapack_int* ldz, double* work, lapack_int* info);
+void LAPACK_chpev(char* jobz, char* uplo, lapack_int* n,
+                  lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                  lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                  lapack_int* info);
+void LAPACK_zhpev(char* jobz, char* uplo, lapack_int* n,
+                  lapack_complex_double* ap, double* w,
+                  lapack_complex_double* z, lapack_int* ldz,
+                  lapack_complex_double* work, double* rwork,
+                  lapack_int* info);
+void LAPACK_sspevd(char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
+                   float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dspevd(char* jobz, char* uplo, lapack_int* n, double* ap,
+                   double* w, double* z, lapack_int* ldz, double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_chpevd(char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_int* lrwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_zhpevd(char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* ap, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_sspevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   float* ap, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                   float* z, lapack_int* ldz, float* work, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_dspevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   double* ap, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                   double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_chpevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   lapack_complex_float* ap, float* vl, float* vu,
+                   lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_zhpevx(char* jobz, char* range, char* uplo, lapack_int* n,
+                   lapack_complex_double* ap, double* vl, double* vu,
+                   lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, lapack_complex_double* z,
+                   lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_ssbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                  float* ab, lapack_int* ldab, float* w, float* z,
+                  lapack_int* ldz, float* work, lapack_int* info);
+void LAPACK_dsbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                  double* ab, lapack_int* ldab, double* w, double* z,
+                  lapack_int* ldz, double* work, lapack_int* info);
+void LAPACK_chbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                  lapack_complex_float* ab, lapack_int* ldab, float* w,
+                  lapack_complex_float* z, lapack_int* ldz,
+                  lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zhbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                  lapack_complex_double* ab, lapack_int* ldab, double* w,
+                  lapack_complex_double* z, lapack_int* ldz,
+                  lapack_complex_double* work, double* rwork,
+                  lapack_int* info);
+void LAPACK_ssbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   float* ab, lapack_int* ldab, float* w, float* z,
+                   lapack_int* ldz, float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dsbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   double* ab, lapack_int* ldab, double* w, double* z,
+                   lapack_int* ldz, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_chbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   lapack_complex_float* ab, lapack_int* ldab, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_zhbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
+                   lapack_complex_double* ab, lapack_int* ldab, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_ssbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, float* ab, lapack_int* ldab, float* q,
+                    lapack_int* ldq, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* float variant; adds range, vl/vu, il/iu, abstol, m, q/ldq and ifail relative to LAPACK_ssbev. NOTE(review): presumably LAPACK xSBEVX (selected eigenvalues of a band matrix) -- confirm */
+void LAPACK_dsbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, double* ab, lapack_int* ldab, double* q,
+                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* double counterpart of LAPACK_ssbevx */
+void LAPACK_chbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* q, lapack_int* ldq, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info ); /* lapack_complex_float variant; vl/vu/abstol/w stay float*, extra float* rwork after work */
+void LAPACK_zhbevx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* q, lapack_int* ldq, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info ); /* lapack_complex_double variant; vl/vu/abstol/w stay double*, extra double* rwork after work */
+void LAPACK_sstev( char* jobz, lapack_int* n, float* d, float* e, float* z,
+                   lapack_int* ldz, float* work, lapack_int *info ); /* float variant; takes two vectors d and e instead of a matrix. NOTE(review): presumably LAPACK xSTEV (symmetric tridiagonal eigen driver, d = diagonal, e = off-diagonal) -- confirm */
+void LAPACK_dstev( char* jobz, lapack_int* n, double* d, double* e, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info ); /* double counterpart of LAPACK_sstev; no complex variants are declared in this family */
+void LAPACK_sstevd( char* jobz, lapack_int* n, float* d, float* e, float* z,
+                    lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* float variant; like LAPACK_sstev plus lwork, iwork and liwork. NOTE(review): presumably LAPACK xSTEVD (divide-and-conquer) -- confirm */
+void LAPACK_dstevd( char* jobz, lapack_int* n, double* d, double* e, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* double counterpart of LAPACK_sstevd */
+void LAPACK_sstevx( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* float variant; adds range, vl/vu, il/iu, abstol, m, w and ifail relative to LAPACK_sstev. NOTE(review): presumably LAPACK xSTEVX (selected eigenvalues) -- confirm */
+void LAPACK_dstevx( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* double counterpart of LAPACK_sstevx */
+void LAPACK_sstevr( char* jobz, char* range, lapack_int* n, float* d, float* e,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, lapack_int* isuppz, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info ); /* float variant; differs from LAPACK_sstevx by taking isuppz, lwork and liwork and no ifail. NOTE(review): presumably LAPACK xSTEVR -- confirm */
+void LAPACK_dstevr( char* jobz, char* range, lapack_int* n, double* d,
+                    double* e, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, lapack_int* isuppz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* double counterpart of LAPACK_sstevr */
+void LAPACK_sgees( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
+                   lapack_int* n, float* a, lapack_int* lda, lapack_int* sdim,
+                   float* wr, float* wi, float* vs, lapack_int* ldvs,
+                   float* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info ); /* float variant; select (type LAPACK_S_SELECT2, declared elsewhere) is the only non-pointer-typed argument; two arrays wr and wi. NOTE(review): presumably LAPACK xGEES (Schur factorization with optional ordering) -- confirm */
+void LAPACK_dgees( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
+                   lapack_int* n, double* a, lapack_int* lda, lapack_int* sdim,
+                   double* wr, double* wi, double* vs, lapack_int* ldvs,
+                   double* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info ); /* double counterpart of LAPACK_sgees; select has type LAPACK_D_SELECT2 */
+void LAPACK_cgees( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
+                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   lapack_int* sdim, lapack_complex_float* w,
+                   lapack_complex_float* vs, lapack_int* ldvs,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_logical* bwork, lapack_int *info ); /* lapack_complex_float variant; select has type LAPACK_C_SELECT1, a single array w replaces wr/wi, extra float* rwork */
+void LAPACK_zgees( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
+                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   lapack_int* sdim, lapack_complex_double* w,
+                   lapack_complex_double* vs, lapack_int* ldvs,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_logical* bwork, lapack_int *info ); /* lapack_complex_double variant; select has type LAPACK_Z_SELECT1, a single array w replaces wr/wi, extra double* rwork */
+void LAPACK_sgeesx( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
+                    char* sense, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* sdim, float* wr, float* wi, float* vs,
+                    lapack_int* ldvs, float* rconde, float* rcondv, float* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_logical* bwork, lapack_int *info ); /* float variant; adds sense, rconde, rcondv, iwork and liwork relative to LAPACK_sgees. NOTE(review): presumably LAPACK xGEESX (expert Schur driver with condition estimates) -- confirm */
+void LAPACK_dgeesx( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
+                    char* sense, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* sdim, double* wr, double* wi, double* vs,
+                    lapack_int* ldvs, double* rconde, double* rcondv,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info ); /* double counterpart of LAPACK_sgeesx */
+void LAPACK_cgeesx( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
+                    char* sense, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* sdim, lapack_complex_float* w,
+                    lapack_complex_float* vs, lapack_int* ldvs, float* rconde,
+                    float* rcondv, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_logical* bwork,
+                    lapack_int *info ); /* lapack_complex_float variant; single w, float* rwork, and no iwork/liwork arguments */
+void LAPACK_zgeesx( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
+                    char* sense, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* sdim, lapack_complex_double* w,
+                    lapack_complex_double* vs, lapack_int* ldvs, double* rconde,
+                    double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_logical* bwork,
+                    lapack_int *info ); /* lapack_complex_double variant; single w, double* rwork, and no iwork/liwork arguments */
+void LAPACK_sgeev( char* jobvl, char* jobvr, lapack_int* n, float* a,
+                   lapack_int* lda, float* wr, float* wi, float* vl,
+                   lapack_int* ldvl, float* vr, lapack_int* ldvr, float* work,
+                   lapack_int* lwork, lapack_int *info ); /* float variant; two arrays wr/wi plus vl/ldvl and vr/ldvr. NOTE(review): presumably LAPACK xGEEV (eigenvalues and left/right eigenvectors of a general matrix) -- confirm */
+void LAPACK_dgeev( char* jobvl, char* jobvr, lapack_int* n, double* a,
+                   lapack_int* lda, double* wr, double* wi, double* vl,
+                   lapack_int* ldvl, double* vr, lapack_int* ldvr, double* work,
+                   lapack_int* lwork, lapack_int *info ); /* double counterpart of LAPACK_sgeev */
+void LAPACK_cgeev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* w, lapack_complex_float* vl,
+                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info ); /* lapack_complex_float variant; a single array w replaces wr/wi, extra float* rwork */
+void LAPACK_zgeev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* w, lapack_complex_double* vl,
+                   lapack_int* ldvl, lapack_complex_double* vr,
+                   lapack_int* ldvr, lapack_complex_double* work,
+                   lapack_int* lwork, double* rwork, lapack_int *info ); /* lapack_complex_double variant; a single array w replaces wr/wi, extra double* rwork */
+void LAPACK_sgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, float* a, lapack_int* lda, float* wr,
+                    float* wi, float* vl, lapack_int* ldvl, float* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, float* abnrm, float* rconde, float* rcondv,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int *info ); /* float variant; adds balanc, sense, ilo/ihi, scale, abnrm, rconde, rcondv and iwork relative to LAPACK_sgeev. NOTE(review): presumably LAPACK xGEEVX (expert driver) -- confirm */
+void LAPACK_dgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, double* a, lapack_int* lda, double* wr,
+                    double* wi, double* vl, lapack_int* ldvl, double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, double* abnrm, double* rconde,
+                    double* rcondv, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info ); /* double counterpart of LAPACK_sgeevx */
+void LAPACK_cgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* w, lapack_complex_float* vl,
+                    lapack_int* ldvl, lapack_complex_float* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    float* scale, float* abnrm, float* rconde, float* rcondv,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; single w, float* rwork takes the place of iwork */
+void LAPACK_zgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* w, lapack_complex_double* vl,
+                    lapack_int* ldvl, lapack_complex_double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* scale, double* abnrm, double* rconde,
+                    double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int *info ); /* lapack_complex_double variant; single w, double* rwork takes the place of iwork */
+void LAPACK_sgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    float* a, lapack_int* lda, float* s, float* u,
+                    lapack_int* ldu, float* vt, lapack_int* ldvt, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* float variant; separate row/column counts m and n, arrays s, u and vt. NOTE(review): presumably LAPACK xGESVD (singular value decomposition) -- confirm */
+void LAPACK_dgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    double* a, lapack_int* lda, double* s, double* u,
+                    lapack_int* ldu, double* vt, lapack_int* ldvt, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* double counterpart of LAPACK_sgesvd */
+void LAPACK_cgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* s,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int *info ); /* lapack_complex_float variant; s stays float*, extra float* rwork */
+void LAPACK_zgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* s,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int *info ); /* lapack_complex_double variant; s stays double*, extra double* rwork */
+void LAPACK_sgesdd( char* jobz, lapack_int* m, lapack_int* n, float* a,
+                    lapack_int* lda, float* s, float* u, lapack_int* ldu,
+                    float* vt, lapack_int* ldvt, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info ); /* float variant; one job flag (jobz) instead of jobu/jobvt, plus iwork. NOTE(review): presumably LAPACK xGESDD (divide-and-conquer SVD) -- confirm */
+void LAPACK_dgesdd( char* jobz, lapack_int* m, lapack_int* n, double* a,
+                    lapack_int* lda, double* s, double* u, lapack_int* ldu,
+                    double* vt, lapack_int* ldvt, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_int *info ); /* double counterpart of LAPACK_sgesdd */
+void LAPACK_cgesdd( char* jobz, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda, float* s,
+                    lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* vt, lapack_int* ldvt,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* iwork, lapack_int *info ); /* lapack_complex_float variant; s stays float*, extra float* rwork before iwork */
+void LAPACK_zgesdd( char* jobz, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda, double* s,
+                    lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* vt, lapack_int* ldvt,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* iwork, lapack_int *info ); /* lapack_complex_double variant; s stays double*, extra double* rwork before iwork */
+void LAPACK_dgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
+                    char* jobp, lapack_int* m, lapack_int* n, double* a,
+                    lapack_int* lda, double* sva, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info ); /* double variant (declared before the float one); six char* job flags joba..jobp. NOTE(review): presumably LAPACK xGEJSV (Jacobi-type SVD) -- confirm */
+void LAPACK_sgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
+                    char* jobp, lapack_int* m, lapack_int* n, float* a,
+                    lapack_int* lda, float* sva, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int *info ); /* float counterpart of LAPACK_dgejsv; no complex variants are declared in this family */
+void LAPACK_dgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
+                    lapack_int* n, double* a, lapack_int* lda, double* sva,
+                    lapack_int* mv, double* v, lapack_int* ldv, double* work,
+                    lapack_int* lwork, lapack_int *info ); /* double variant (declared before the float one); takes mv and no u/ldu or iwork. NOTE(review): presumably LAPACK xGESVJ (Jacobi SVD) -- confirm */
+void LAPACK_sgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
+                    lapack_int* n, float* a, lapack_int* lda, float* sva,
+                    lapack_int* mv, float* v, lapack_int* ldv, float* work,
+                    lapack_int* lwork, lapack_int *info ); /* float counterpart of LAPACK_dgesvj */
+void LAPACK_sggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* alpha, float* beta, float* u, lapack_int* ldu,
+                    float* v, lapack_int* ldv, float* q, lapack_int* ldq,
+                    float* work, lapack_int* iwork, lapack_int *info ); /* float variant; two matrices a and b, arrays alpha/beta and u, v, q; work has no length argument. NOTE(review): presumably LAPACK xGGSVD (generalized SVD of a matrix pair) -- confirm */
+void LAPACK_dggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* alpha, double* beta, double* u, lapack_int* ldu,
+                    double* v, lapack_int* ldv, double* q, lapack_int* ldq,
+                    double* work, lapack_int* iwork, lapack_int *info ); /* double counterpart of LAPACK_sggsvd */
+void LAPACK_cggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* alpha,
+                    float* beta, lapack_complex_float* u, lapack_int* ldu,
+                    lapack_complex_float* v, lapack_int* ldv,
+                    lapack_complex_float* q, lapack_int* ldq,
+                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
+                    lapack_int *info ); /* lapack_complex_float variant; alpha/beta stay float*, extra float* rwork before iwork */
+void LAPACK_zggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
+                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* alpha,
+                    double* beta, lapack_complex_double* u, lapack_int* ldu,
+                    lapack_complex_double* v, lapack_int* ldv,
+                    lapack_complex_double* q, lapack_int* ldq,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int *info ); /* lapack_complex_double variant; alpha/beta stay double*, extra double* rwork before iwork */
+void LAPACK_ssygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* w, float* work, lapack_int* lwork, lapack_int *info ); /* float variant; leading itype selector and two matrices a and b. NOTE(review): presumably LAPACK xSYGV (generalized symmetric-definite eigenproblem) -- confirm */
+void LAPACK_dsygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* w, double* work, lapack_int* lwork,
+                   lapack_int *info ); /* double counterpart of LAPACK_ssygv */
+void LAPACK_chegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* w,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info ); /* lapack_complex_float variant; w stays float*, extra float* rwork */
+void LAPACK_zhegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* w,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info ); /* lapack_complex_double variant; w stays double*, extra double* rwork */
+void LAPACK_ssygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    float* w, float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* float variant; adds iwork and liwork relative to LAPACK_ssygv. NOTE(review): presumably LAPACK xSYGVD (divide-and-conquer) -- confirm */
+void LAPACK_dsygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* w, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* double counterpart of LAPACK_ssygvd */
+void LAPACK_chegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* w,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info ); /* lapack_complex_float variant; adds float* rwork with its own length lrwork */
+void LAPACK_zhegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* w,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* lapack_complex_double variant; adds double* rwork with its own length lrwork */
+void LAPACK_ssygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* vl, float* vu, lapack_int* il,
+                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                    float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info ); /* float variant; adds range, vl/vu, il/iu, abstol, m, z/ldz, iwork and ifail relative to LAPACK_ssygv. NOTE(review): presumably LAPACK xSYGVX (selected eigenvalues) -- confirm */
+void LAPACK_dsygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info ); /* double counterpart of LAPACK_ssygvx */
+void LAPACK_chegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, float* vl,
+                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* lapack_complex_float variant; vl/vu/abstol/w stay float*, extra float* rwork after lwork */
+void LAPACK_zhegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* lapack_complex_double variant; vl/vu/abstol/w stay double*, extra double* rwork after lwork */
+void LAPACK_sspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   float* ap, float* bp, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int *info ); /* float variant; ap and bp carry no leading-dimension arguments. NOTE(review): presumably LAPACK xSPGV (generalized symmetric eigenproblem, packed storage) -- confirm */
+void LAPACK_dspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   double* ap, double* bp, double* w, double* z,
+                   lapack_int* ldz, double* work, lapack_int *info ); /* double counterpart of LAPACK_sspgv */
+void LAPACK_chpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_float* ap, lapack_complex_float* bp, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info ); /* lapack_complex_float variant; w stays float*, extra float* rwork */
+void LAPACK_zhpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                   lapack_complex_double* ap, lapack_complex_double* bp,
+                   double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info ); /* lapack_complex_double variant; w stays double*, extra double* rwork */
+void LAPACK_sspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    float* ap, float* bp, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* float variant; adds lwork, iwork and liwork relative to LAPACK_sspgv. NOTE(review): presumably LAPACK xSPGVD (divide-and-conquer) -- confirm */
+void LAPACK_dspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    double* ap, double* bp, double* w, double* z,
+                    lapack_int* ldz, double* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_int* liwork, lapack_int *info ); /* double counterpart of LAPACK_sspgvd */
+void LAPACK_chpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_float* ap, lapack_complex_float* bp,
+                    float* w, lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info ); /* lapack_complex_float variant; adds float* rwork with its own length lrwork */
+void LAPACK_zhpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
+                    lapack_complex_double* ap, lapack_complex_double* bp,
+                    double* w, lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* lapack_complex_double variant; adds double* rwork with its own length lrwork */
+void LAPACK_sspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, float* ap, float* bp, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* iwork, lapack_int* ifail,
+                    lapack_int *info ); /* float variant; adds range, vl/vu, il/iu, abstol, m, iwork and ifail relative to LAPACK_sspgv. NOTE(review): presumably LAPACK xSPGVX (selected eigenvalues) -- confirm */
+void LAPACK_dspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, double* ap, double* bp, double* vl,
+                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* iwork, lapack_int* ifail,
+                    lapack_int *info ); /* double counterpart of LAPACK_sspgvx */
+void LAPACK_chpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_float* ap,
+                    lapack_complex_float* bp, float* vl, float* vu,
+                    lapack_int* il, lapack_int* iu, float* abstol,
+                    lapack_int* m, float* w, lapack_complex_float* z,
+                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info ); /* lapack_complex_float variant; vl/vu/abstol/w stay float*, extra float* rwork after work */
+void LAPACK_zhpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
+                    lapack_int* n, lapack_complex_double* ap,
+                    lapack_complex_double* bp, double* vl, double* vu,
+                    lapack_int* il, lapack_int* iu, double* abstol,
+                    lapack_int* m, double* w, lapack_complex_double* z,
+                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info ); /* lapack_complex_double variant; vl/vu/abstol/w stay double*, extra double* rwork after work */
+void LAPACK_ssbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
+                   lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int *info ); /* float variant; two arrays ab/bb with separate ka and kb counts, and no itype argument. NOTE(review): presumably LAPACK xSBGV (generalized symmetric eigenproblem, band storage) -- confirm */
+void LAPACK_dsbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
+                   lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
+                   double* work, lapack_int *info ); /* double counterpart of LAPACK_ssbgv */
+void LAPACK_chbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_complex_float* bb, lapack_int* ldbb, float* w,
+                   lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int *info ); /* lapack_complex_float variant; w stays float*, extra float* rwork */
+void LAPACK_zhbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                   lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                   lapack_complex_double* bb, lapack_int* ldbb, double* w,
+                   lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork,
+                   lapack_int *info ); /* lapack_complex_double variant; w stays double*, extra double* rwork */
+void LAPACK_ssbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
+                    lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* float variant; adds lwork, iwork and liwork relative to LAPACK_ssbgv. NOTE(review): presumably LAPACK xSBGVD (divide-and-conquer) -- confirm */
+void LAPACK_dsbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
+                    lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* double counterpart of LAPACK_ssbgvd */
+void LAPACK_chbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
+                    lapack_complex_float* bb, lapack_int* ldbb, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                    lapack_int *info ); /* lapack_complex_float variant; adds float* rwork with its own length lrwork */
+void LAPACK_zhbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
+                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
+                    lapack_complex_double* bb, lapack_int* ldbb, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_int *info ); /* lapack_complex_double variant; adds double* rwork with its own length lrwork */
+void LAPACK_ssbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
+                    float* bb, lapack_int* ldbb, float* q, lapack_int* ldq,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w, float* z,
+                    lapack_int* ldz, float* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* float variant; adds range, q/ldq, vl/vu, il/iu, abstol, m, iwork and ifail relative to LAPACK_ssbgv. NOTE(review): presumably LAPACK xSBGVX (selected eigenvalues) -- confirm */
+void LAPACK_dsbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, double* ab,
+                    lapack_int* ldab, double* bb, lapack_int* ldbb, double* q,
+                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
+                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* double counterpart of LAPACK_ssbgvx */
+void LAPACK_chbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
+                    lapack_int* ldab, lapack_complex_float* bb,
+                    lapack_int* ldbb, lapack_complex_float* q, lapack_int* ldq,
+                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                    float* abstol, lapack_int* m, float* w,
+                    lapack_complex_float* z, lapack_int* ldz,
+                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
+                    lapack_int* ifail, lapack_int *info ); /* lapack_complex_float variant; vl/vu/abstol/w stay float*, extra float* rwork after work */
+void LAPACK_zhbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
+                    lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
+                    lapack_int* ldab, lapack_complex_double* bb,
+                    lapack_int* ldbb, lapack_complex_double* q, lapack_int* ldq,
+                    double* vl, double* vu, lapack_int* il, lapack_int* iu,
+                    double* abstol, lapack_int* m, double* w,
+                    lapack_complex_double* z, lapack_int* ldz,
+                    lapack_complex_double* work, double* rwork,
+                    lapack_int* iwork, lapack_int* ifail, lapack_int *info ); /* lapack_complex_double variant; vl/vu/abstol/w stay double*, extra double* rwork after work */
+void LAPACK_sgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_S_SELECT3 selctg, lapack_int* n, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim,
+                   float* alphar, float* alphai, float* beta, float* vsl,
+                   lapack_int* ldvsl, float* vsr, lapack_int* ldvsr,
+                   float* work, lapack_int* lwork, lapack_logical* bwork,
+                   lapack_int *info ); /* float data; selctg is passed by value as LAPACK_S_SELECT3 (declared elsewhere). NOTE(review): presumably Fortran SGGES (generalized Schur form) - confirm */
+void LAPACK_dgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb,
+                   lapack_int* sdim, double* alphar, double* alphai,
+                   double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
+                   lapack_int* ldvsr, double* work, lapack_int* lwork,
+                   lapack_logical* bwork, lapack_int *info ); /* double counterpart of LAPACK_sgges; selctg has type LAPACK_D_SELECT3 */
+void LAPACK_cgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_C_SELECT2 selctg, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                   lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* vsl, lapack_int* ldvsl,
+                   lapack_complex_float* vsr, lapack_int* ldvsr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_logical* bwork, lapack_int *info ); /* complex float: one alpha instead of alphar/alphai, LAPACK_C_SELECT2 selctg, extra float* rwork */
+void LAPACK_zgges( char* jobvsl, char* jobvsr, char* sort,
+                   LAPACK_Z_SELECT2 selctg, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
+                   lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* vsl, lapack_int* ldvsl,
+                   lapack_complex_double* vsr, lapack_int* ldvsr,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_logical* bwork, lapack_int *info ); /* complex double counterpart of LAPACK_cgges; LAPACK_Z_SELECT2 selctg, double* rwork */
+void LAPACK_sggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_S_SELECT3 selctg, char* sense, lapack_int* n,
+                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                    lapack_int* sdim, float* alphar, float* alphai, float* beta,
+                    float* vsl, lapack_int* ldvsl, float* vsr,
+                    lapack_int* ldvsr, float* rconde, float* rcondv,
+                    float* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info ); /* float data; LAPACK_sgges parameters plus sense, rconde, rcondv, iwork, liwork. NOTE(review): presumably Fortran SGGESX - confirm */
+void LAPACK_dggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_D_SELECT3 selctg, char* sense, lapack_int* n,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    lapack_int* sdim, double* alphar, double* alphai,
+                    double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
+                    lapack_int* ldvsr, double* rconde, double* rcondv,
+                    double* work, lapack_int* lwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info ); /* double counterpart of LAPACK_sggesx; selctg has type LAPACK_D_SELECT3 */
+void LAPACK_cggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_C_SELECT2 selctg, char* sense, lapack_int* n,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* vsl, lapack_int* ldvsl,
+                    lapack_complex_float* vsr, lapack_int* ldvsr, float* rconde,
+                    float* rcondv, lapack_complex_float* work,
+                    lapack_int* lwork, float* rwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info ); /* complex float: single alpha, LAPACK_C_SELECT2 selctg, float* rwork after lwork; rconde/rcondv are float */
+void LAPACK_zggesx( char* jobvsl, char* jobvsr, char* sort,
+                    LAPACK_Z_SELECT2 selctg, char* sense, lapack_int* n,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* vsl, lapack_int* ldvsl,
+                    lapack_complex_double* vsr, lapack_int* ldvsr,
+                    double* rconde, double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_int* liwork, lapack_logical* bwork,
+                    lapack_int *info ); /* complex double counterpart of LAPACK_cggesx; real-valued arguments are double */
+void LAPACK_sggev( char* jobvl, char* jobvr, lapack_int* n, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* alphar,
+                   float* alphai, float* beta, float* vl, lapack_int* ldvl,
+                   float* vr, lapack_int* ldvr, float* work, lapack_int* lwork,
+                   lapack_int *info ); /* float data. NOTE(review): presumably Fortran SGGEV (generalized eigenvalues/eigenvectors) - confirm */
+void LAPACK_dggev( char* jobvl, char* jobvr, lapack_int* n, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
+                   double* alphai, double* beta, double* vl, lapack_int* ldvl,
+                   double* vr, lapack_int* ldvr, double* work,
+                   lapack_int* lwork, lapack_int *info ); /* double counterpart of LAPACK_sggev */
+void LAPACK_cggev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* vl, lapack_int* ldvl,
+                   lapack_complex_float* vr, lapack_int* ldvr,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int *info ); /* complex float: single alpha replaces alphar/alphai; adds float* rwork */
+void LAPACK_zggev( char* jobvl, char* jobvr, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* vl, lapack_int* ldvl,
+                   lapack_complex_double* vr, lapack_int* ldvr,
+                   lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int *info ); /* complex double counterpart of LAPACK_cggev; double* rwork */
+void LAPACK_sggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, float* alphar, float* alphai, float* beta,
+                    float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                    lapack_int* ilo, lapack_int* ihi, float* lscale,
+                    float* rscale, float* abnrm, float* bbnrm, float* rconde,
+                    float* rcondv, float* work, lapack_int* lwork,
+                    lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info ); /* float data. NOTE(review): presumably Fortran SGGEVX (expert variant of the generalized eigenproblem driver) - confirm */
+void LAPACK_dggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, double* alphar, double* alphai,
+                    double* beta, double* vl, lapack_int* ldvl, double* vr,
+                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
+                    double* lscale, double* rscale, double* abnrm,
+                    double* bbnrm, double* rconde, double* rcondv, double* work,
+                    lapack_int* lwork, lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info ); /* double counterpart of LAPACK_sggevx */
+void LAPACK_cggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* vl, lapack_int* ldvl,
+                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* ilo,
+                    lapack_int* ihi, float* lscale, float* rscale, float* abnrm,
+                    float* bbnrm, float* rconde, float* rcondv,
+                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                    lapack_int* iwork, lapack_logical* bwork,
+                    lapack_int *info ); /* complex float matrices, single alpha; lscale through rcondv stay float; adds float* rwork after lwork */
+void LAPACK_zggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
+                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* vl, lapack_int* ldvl,
+                    lapack_complex_double* vr, lapack_int* ldvr,
+                    lapack_int* ilo, lapack_int* ihi, double* lscale,
+                    double* rscale, double* abnrm, double* bbnrm,
+                    double* rconde, double* rcondv, lapack_complex_double* work,
+                    lapack_int* lwork, double* rwork, lapack_int* iwork,
+                    lapack_logical* bwork, lapack_int *info ); /* complex double counterpart of LAPACK_cggevx; real-valued arguments are double */
+void LAPACK_dsfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, double* alpha, const double* a,
+                   lapack_int* lda, double* beta, double* c ); /* double data; a is const, no info argument. NOTE(review): presumably Fortran DSFRK (symmetric rank-k update, RFP storage) - confirm */
+void LAPACK_ssfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, float* alpha, const float* a, lapack_int* lda,
+                   float* beta, float* c ); /* float counterpart of LAPACK_dsfrk */
+void LAPACK_zhfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, double* alpha, const lapack_complex_double* a,
+                   lapack_int* lda, double* beta, lapack_complex_double* c ); /* complex double a and c; alpha and beta are real (double) */
+void LAPACK_chfrk( char* transr, char* uplo, char* trans, lapack_int* n,
+                   lapack_int* k, float* alpha, const lapack_complex_float* a,
+                   lapack_int* lda, float* beta, lapack_complex_float* c ); /* complex float a and c; alpha and beta are real (float) */
+void LAPACK_dtfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n, double* alpha,
+                   const double* a, double* b, lapack_int* ldb ); /* double data; a is const and has no lda; no info argument. NOTE(review): presumably Fortran DTFSM (triangular solve, RFP storage) - confirm */
+void LAPACK_stfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n, float* alpha,
+                   const float* a, float* b, lapack_int* ldb ); /* float counterpart of LAPACK_dtfsm */
+void LAPACK_ztfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n,
+                   lapack_complex_double* alpha, const lapack_complex_double* a,
+                   lapack_complex_double* b, lapack_int* ldb ); /* complex double; alpha is complex as well */
+void LAPACK_ctfsm( char* transr, char* side, char* uplo, char* trans,
+                   char* diag, lapack_int* m, lapack_int* n,
+                   lapack_complex_float* alpha, const lapack_complex_float* a,
+                   lapack_complex_float* b, lapack_int* ldb ); /* complex float; alpha is complex as well */
+void LAPACK_dtfttp( char* transr, char* uplo, lapack_int* n, const double* arf,
+                    double* ap, lapack_int *info ); /* double; arf is const, ap is writable. NOTE(review): presumably Fortran DTFTTP (copy RFP storage to packed storage) - confirm */
+void LAPACK_stfttp( char* transr, char* uplo, lapack_int* n, const float* arf,
+                    float* ap, lapack_int *info ); /* float counterpart of LAPACK_dtfttp */
+void LAPACK_ztfttp( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* arf, lapack_complex_double* ap,
+                    lapack_int *info ); /* complex double counterpart of LAPACK_dtfttp */
+void LAPACK_ctfttp( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* arf, lapack_complex_float* ap,
+                    lapack_int *info ); /* complex float counterpart of LAPACK_dtfttp */
+void LAPACK_dtfttr( char* transr, char* uplo, lapack_int* n, const double* arf,
+                    double* a, lapack_int* lda, lapack_int *info ); /* double; arf is const, a (with lda) is writable. NOTE(review): presumably Fortran DTFTTR (copy RFP storage to full storage) - confirm */
+void LAPACK_stfttr( char* transr, char* uplo, lapack_int* n, const float* arf,
+                    float* a, lapack_int* lda, lapack_int *info ); /* float counterpart of LAPACK_dtfttr */
+void LAPACK_ztfttr( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* arf, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info ); /* complex double counterpart of LAPACK_dtfttr */
+void LAPACK_ctfttr( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* arf, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info ); /* complex float counterpart of LAPACK_dtfttr */
+void LAPACK_dtpttf( char* transr, char* uplo, lapack_int* n, const double* ap,
+                    double* arf, lapack_int *info ); /* double; ap is const, arf is writable. NOTE(review): presumably Fortran DTPTTF (copy packed storage to RFP storage) - confirm */
+void LAPACK_stpttf( char* transr, char* uplo, lapack_int* n, const float* ap,
+                    float* arf, lapack_int *info ); /* float counterpart of LAPACK_dtpttf */
+void LAPACK_ztpttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* ap, lapack_complex_double* arf,
+                    lapack_int *info ); /* complex double counterpart of LAPACK_dtpttf */
+void LAPACK_ctpttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* ap, lapack_complex_float* arf,
+                    lapack_int *info ); /* complex float counterpart of LAPACK_dtpttf */
+void LAPACK_dtpttr( char* uplo, lapack_int* n, const double* ap, double* a,
+                    lapack_int* lda, lapack_int *info ); /* double; ap is const, a (with lda) is writable; no transr. NOTE(review): presumably Fortran DTPTTR (copy packed storage to full storage) - confirm */
+void LAPACK_stpttr( char* uplo, lapack_int* n, const float* ap, float* a,
+                    lapack_int* lda, lapack_int *info ); /* float counterpart of LAPACK_dtpttr */
+void LAPACK_ztpttr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info ); /* complex double counterpart of LAPACK_dtpttr */
+void LAPACK_ctpttr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_int *info ); /* complex float counterpart of LAPACK_dtpttr */
+void LAPACK_dtrttf( char* transr, char* uplo, lapack_int* n, const double* a,
+                    lapack_int* lda, double* arf, lapack_int *info ); /* double; a (with lda) is const, arf is writable. NOTE(review): presumably Fortran DTRTTF (copy full storage to RFP storage) - confirm */
+void LAPACK_strttf( char* transr, char* uplo, lapack_int* n, const float* a,
+                    lapack_int* lda, float* arf, lapack_int *info ); /* float counterpart of LAPACK_dtrttf */
+void LAPACK_ztrttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* arf, lapack_int *info ); /* complex double counterpart of LAPACK_dtrttf */
+void LAPACK_ctrttf( char* transr, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* arf, lapack_int *info ); /* complex float counterpart of LAPACK_dtrttf */
+void LAPACK_dtrttp( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
+                    double* ap, lapack_int *info ); /* double; a (with lda) is const, ap is writable; no transr. NOTE(review): presumably Fortran DTRTTP (copy full storage to packed storage) - confirm */
+void LAPACK_strttp( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
+                    float* ap, lapack_int *info ); /* float counterpart of LAPACK_dtrttp */
+void LAPACK_ztrttp( char* uplo, lapack_int* n, const lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* ap,
+                    lapack_int *info ); /* complex double counterpart of LAPACK_dtrttp */
+void LAPACK_ctrttp( char* uplo, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* ap,
+                    lapack_int *info ); /* complex float counterpart of LAPACK_dtrttp */
+void LAPACK_sgeqrfp( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* tau, float* work, lapack_int* lwork,
+                     lapack_int *info ); /* float data. NOTE(review): presumably Fortran SGEQRFP (QR factorization with non-negative diagonal of R) - confirm */
+void LAPACK_dgeqrfp( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* tau, double* work, lapack_int* lwork,
+                     lapack_int *info ); /* double counterpart of LAPACK_sgeqrfp */
+void LAPACK_cgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* tau,
+                     lapack_complex_float* work, lapack_int* lwork,
+                     lapack_int *info ); /* complex float counterpart; no rwork argument */
+void LAPACK_zgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* tau,
+                     lapack_complex_double* work, lapack_int* lwork,
+                     lapack_int *info ); /* complex double counterpart; no rwork argument */
+void LAPACK_clacgv( lapack_int* n, lapack_complex_float* x, lapack_int* incx ); /* complex float x, no info. NOTE(review): presumably Fortran CLACGV (conjugates a vector in place) - confirm */
+void LAPACK_zlacgv( lapack_int* n, lapack_complex_double* x, lapack_int* incx ); /* complex double counterpart of LAPACK_clacgv */
+void LAPACK_slarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    float* x ); /* float x, no info. NOTE(review): presumably Fortran SLARNV (fills x with random numbers; iseed presumably updated) - confirm */
+void LAPACK_dlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    double* x ); /* double counterpart of LAPACK_slarnv */
+void LAPACK_clarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    lapack_complex_float* x ); /* complex float counterpart of LAPACK_slarnv */
+void LAPACK_zlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
+                    lapack_complex_double* x ); /* complex double counterpart of LAPACK_slarnv */
+void LAPACK_sgeqr2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int *info ); /* float data; work has no lwork size argument. NOTE(review): presumably Fortran SGEQR2 (unblocked QR factorization) - confirm */
+void LAPACK_dgeqr2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int *info ); /* double counterpart of LAPACK_sgeqr2 */
+void LAPACK_cgeqr2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info ); /* complex float counterpart of LAPACK_sgeqr2 */
+void LAPACK_zgeqr2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int *info ); /* complex double counterpart of LAPACK_sgeqr2 */
+void LAPACK_slacpy( char* uplo, lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* b, lapack_int* ldb ); /* float; a is const, b is writable, no info. NOTE(review): presumably Fortran SLACPY (copies all or a triangle of a into b) - confirm */
+void LAPACK_dlacpy( char* uplo, lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* b, lapack_int* ldb ); /* double counterpart of LAPACK_slacpy */
+void LAPACK_clacpy( char* uplo, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb ); /* complex float counterpart of LAPACK_slacpy */
+void LAPACK_zlacpy( char* uplo, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb ); /* complex double counterpart of LAPACK_slacpy */
+void LAPACK_sgetf2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info ); /* float a; ipiv is writable, no workspace argument. NOTE(review): presumably Fortran SGETF2 (unblocked LU with partial pivoting) - confirm */
+void LAPACK_dgetf2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* ipiv, lapack_int *info ); /* double counterpart of LAPACK_sgetf2 */
+void LAPACK_cgetf2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info ); /* complex float counterpart of LAPACK_sgetf2 */
+void LAPACK_zgetf2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* ipiv, lapack_int *info ); /* complex double counterpart of LAPACK_sgetf2 */
+void LAPACK_slaswp( lapack_int* n, float* a, lapack_int* lda, lapack_int* k1,
+                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx ); /* float a; ipiv is const here, no info. NOTE(review): presumably Fortran SLASWP (applies row interchanges k1..k2 from ipiv) - confirm */
+void LAPACK_dlaswp( lapack_int* n, double* a, lapack_int* lda, lapack_int* k1,
+                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx ); /* double counterpart of LAPACK_slaswp */
+void LAPACK_claswp( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                    lapack_int* incx ); /* complex float counterpart of LAPACK_slaswp */
+void LAPACK_zlaswp( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                    lapack_int* incx ); /* complex double counterpart of LAPACK_slaswp */
+float LAPACK_slange( char* norm, lapack_int* m, lapack_int* n, const float* a,
+                    lapack_int* lda, float* work ); /* returns float by value; a is const. NOTE(review): presumably Fortran SLANGE (norm of a general matrix, selected by norm); float return assumes a non-f2c Fortran ABI - confirm */
+double LAPACK_dlange( char* norm, lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, double* work ); /* double counterpart of LAPACK_slange; returns double */
+float LAPACK_clange( char* norm, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work ); /* complex float a; return value and work are real float */
+double LAPACK_zlange( char* norm, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work ); /* complex double a; return value and work are real double */
+float LAPACK_clanhe( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work ); /* complex float a, returns real float; only complex variants are declared here. NOTE(review): presumably Fortran CLANHE (norm of a Hermitian matrix) - confirm */
+double LAPACK_zlanhe( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work ); /* complex double counterpart of LAPACK_clanhe; returns double */
+float LAPACK_slansy( char* norm, char* uplo, lapack_int* n, const float* a,
+                    lapack_int* lda, float* work ); /* returns float by value; a is const. NOTE(review): presumably Fortran SLANSY (norm of a symmetric matrix) - confirm */
+double LAPACK_dlansy( char* norm, char* uplo, lapack_int* n, const double* a,
+                    lapack_int* lda, double* work ); /* double counterpart of LAPACK_slansy; returns double */
+float LAPACK_clansy( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_float* a, lapack_int* lda, float* work ); /* complex float a; return value and work are real float */
+double LAPACK_zlansy( char* norm, char* uplo, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda, double* work ); /* complex double a; return value and work are real double */
+float LAPACK_slantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const float* a, lapack_int* lda, float* work ); /* returns float by value; a is const. NOTE(review): presumably Fortran SLANTR (norm of a triangular/trapezoidal matrix) - confirm */
+double LAPACK_dlantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const double* a, lapack_int* lda, double* work ); /* double counterpart of LAPACK_slantr; returns double */
+float LAPACK_clantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
+                    float* work ); /* complex float a; return value and work are real float */
+double LAPACK_zlantr( char* norm, char* uplo, char* diag, lapack_int* m,
+                    lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                    double* work ); /* complex double a; return value and work are real double */
+float LAPACK_slamch( char* cmach ); /* returns float by value. NOTE(review): presumably Fortran SLAMCH (machine parameter selected by cmach) - confirm */
+double LAPACK_dlamch( char* cmach ); /* double counterpart of LAPACK_slamch; returns double */
+void LAPACK_sgelq2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    float* tau, float* work, lapack_int *info ); /* float data; work has no lwork size argument. NOTE(review): presumably Fortran SGELQ2 (unblocked LQ factorization) - confirm */
+void LAPACK_dgelq2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    double* tau, double* work, lapack_int *info ); /* double counterpart of LAPACK_sgelq2 */
+void LAPACK_cgelq2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int *info ); /* complex float counterpart of LAPACK_sgelq2 */
+void LAPACK_zgelq2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int *info ); /* complex double counterpart of LAPACK_sgelq2 */
+void LAPACK_slarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, const float* v,
+                    lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
+                    lapack_int* ldc, float* work, lapack_int* ldwork ); /* float; v and t are const, c is writable, no info. NOTE(review): presumably Fortran SLARFB (applies a block reflector to c) - confirm */
+void LAPACK_dlarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const double* v, lapack_int* ldv, const double* t,
+                    lapack_int* ldt, double* c, lapack_int* ldc, double* work,
+                    lapack_int* ldwork ); /* double counterpart of LAPACK_slarfb */
+void LAPACK_clarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work, lapack_int* ldwork ); /* complex float counterpart of LAPACK_slarfb */
+void LAPACK_zlarfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work, lapack_int* ldwork ); /* complex double counterpart of LAPACK_slarfb */
+void LAPACK_slarfg( lapack_int* n, float* alpha, float* x, lapack_int* incx,
+                    float* tau ); /* float; alpha, x and tau are all non-const, no info. NOTE(review): presumably Fortran SLARFG (generates an elementary reflector) - confirm */
+void LAPACK_dlarfg( lapack_int* n, double* alpha, double* x, lapack_int* incx,
+                    double* tau ); /* double counterpart of LAPACK_slarfg */
+void LAPACK_clarfg( lapack_int* n, lapack_complex_float* alpha,
+                    lapack_complex_float* x, lapack_int* incx,
+                    lapack_complex_float* tau ); /* complex float counterpart of LAPACK_slarfg */
+void LAPACK_zlarfg( lapack_int* n, lapack_complex_double* alpha,
+                    lapack_complex_double* x, lapack_int* incx,
+                    lapack_complex_double* tau ); /* complex double counterpart of LAPACK_slarfg */
+void LAPACK_slarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const float* v, lapack_int* ldv, const float* tau, float* t,
+                    lapack_int* ldt ); /* float; v and tau are const, t is writable, no info. NOTE(review): presumably Fortran SLARFT (forms the triangular factor T of a block reflector) - confirm */
+void LAPACK_dlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const double* v, lapack_int* ldv, const double* tau,
+                    double* t, lapack_int* ldt ); /* double counterpart of LAPACK_slarft */
+void LAPACK_clarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* tau, lapack_complex_float* t,
+                    lapack_int* ldt ); /* complex float counterpart of LAPACK_slarft */
+void LAPACK_zlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* tau, lapack_complex_double* t,
+                    lapack_int* ldt ); /* complex double counterpart of LAPACK_slarft */
+void LAPACK_slarfx( char* side, lapack_int* m, lapack_int* n, const float* v,
+                    float* tau, float* c, lapack_int* ldc, float* work ); /* float; v is const, tau is passed by non-const pointer, no info. NOTE(review): presumably Fortran SLARFX (applies an elementary reflector to c) - confirm */
+void LAPACK_dlarfx( char* side, lapack_int* m, lapack_int* n, const double* v,
+                    double* tau, double* c, lapack_int* ldc, double* work ); /* double counterpart of LAPACK_slarfx */
+void LAPACK_clarfx( char* side, lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* v, lapack_complex_float* tau,
+                    lapack_complex_float* c, lapack_int* ldc,
+                    lapack_complex_float* work ); /* complex float counterpart of LAPACK_slarfx */
+void LAPACK_zlarfx( char* side, lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* v, lapack_complex_double* tau,
+                    lapack_complex_double* c, lapack_int* ldc,
+                    lapack_complex_double* work ); /* complex double counterpart of LAPACK_slarfx */
+void LAPACK_slatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, float* d, lapack_int* mode, float* cond,
+                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    float* a, lapack_int* lda, float* work, lapack_int *info ); /* float data. NOTE(review): presumably Fortran SLATMS (random test-matrix generator from the LAPACK test library) - confirm */
+void LAPACK_dlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, double* d, lapack_int* mode, double* cond,
+                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    double* a, lapack_int* lda, double* work,
+                    lapack_int *info ); /* double counterpart of LAPACK_slatms */
+void LAPACK_clatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, float* d, lapack_int* mode, float* cond,
+                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* work, lapack_int *info ); /* complex float a and work; d, cond and dmax stay real float */
+void LAPACK_zlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
+                    char* sym, double* d, lapack_int* mode, double* cond,
+                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* work, lapack_int *info ); /* complex double a and work; d, cond and dmax stay real double */
+void LAPACK_slag2d( lapack_int* m, lapack_int* n, const float* sa,
+                    lapack_int* ldsa, double* a, lapack_int* lda,
+                    lapack_int *info ); /* const float sa, writable double a. NOTE(review): presumably Fortran SLAG2D (single-to-double matrix conversion) - confirm */
+void LAPACK_dlag2s( lapack_int* m, lapack_int* n, const double* a,
+                    lapack_int* lda, float* sa, lapack_int* ldsa,
+                    lapack_int *info ); /* const double a, writable float sa; argument order is swapped relative to LAPACK_slag2d */
+void LAPACK_clag2z( lapack_int* m, lapack_int* n,
+                    const lapack_complex_float* sa, lapack_int* ldsa,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int *info ); /* const complex float sa, writable complex double a */
+void LAPACK_zlag2c( lapack_int* m, lapack_int* n,
+                    const lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_float* sa, lapack_int* ldsa,
+                    lapack_int *info ); /* const complex double a, writable complex float sa */
+void LAPACK_slauum( char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int *info ); /* float a, modified through a non-const pointer. NOTE(review): presumably Fortran SLAUUM (product of a triangular factor with its transpose) - confirm */
+void LAPACK_dlauum( char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int *info ); /* double counterpart of LAPACK_slauum */
+void LAPACK_clauum( char* uplo, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int *info ); /* complex float counterpart of LAPACK_slauum */
+void LAPACK_zlauum( char* uplo, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int *info ); /* complex double counterpart of LAPACK_slauum */
+void LAPACK_slagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* d, float* a, lapack_int* lda,
+                    lapack_int* iseed, float* work, lapack_int *info ); /* float; d is const, a is writable. NOTE(review): presumably Fortran SLAGGE (random general test matrix from the LAPACK test library) - confirm */
+void LAPACK_dlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* d, double* a, lapack_int* lda,
+                    lapack_int* iseed, double* work, lapack_int *info ); /* double counterpart of LAPACK_slagge */
+void LAPACK_clagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const float* d, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info ); /* complex float a and work; d stays real float */
+void LAPACK_zlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
+                    lapack_int* ku, const double* d, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_double* work, lapack_int *info ); /* complex double a and work; d stays real double */
+void LAPACK_slaset( char* uplo, lapack_int* m, lapack_int* n, float* alpha,
+                    float* beta, float* a, lapack_int* lda ); /* float; alpha and beta passed by pointer, no info. NOTE(review): presumably Fortran SLASET (off-diagonals set to alpha, diagonal to beta) - confirm */
+void LAPACK_dlaset( char* uplo, lapack_int* m, lapack_int* n, double* alpha,
+                    double* beta, double* a, lapack_int* lda ); /* double counterpart of LAPACK_slaset */
+void LAPACK_claset( char* uplo, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* alpha, lapack_complex_float* beta,
+                    lapack_complex_float* a, lapack_int* lda ); /* complex float counterpart; alpha and beta are complex too */
+void LAPACK_zlaset( char* uplo, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* alpha, lapack_complex_double* beta,
+                    lapack_complex_double* a, lapack_int* lda ); /* complex double counterpart; alpha and beta are complex too */
+void LAPACK_slasrt( char* id, lapack_int* n, float* d, lapack_int *info ); /* float d; only real variants are declared here. NOTE(review): presumably Fortran SLASRT (sorts d in the order selected by id) - confirm */
+void LAPACK_dlasrt( char* id, lapack_int* n, double* d, lapack_int *info ); /* double counterpart of LAPACK_slasrt */
+void LAPACK_claghe( lapack_int* n, lapack_int* k, const float* d,
+                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info ); /* complex float a and work; d is const real float; only complex variants are declared here. NOTE(review): presumably Fortran CLAGHE (random Hermitian test matrix) - confirm */
+void LAPACK_zlaghe( lapack_int* n, lapack_int* k, const double* d,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* iseed, lapack_complex_double* work,
+                    lapack_int *info ); /* complex double counterpart of LAPACK_claghe; d is const real double */
+void LAPACK_slagsy( lapack_int* n, lapack_int* k, const float* d, float* a,
+                    lapack_int* lda, lapack_int* iseed, float* work,
+                    lapack_int *info ); /* float; d is const, a is writable. NOTE(review): presumably Fortran SLAGSY (random symmetric test matrix) - confirm */
+void LAPACK_dlagsy( lapack_int* n, lapack_int* k, const double* d, double* a,
+                    lapack_int* lda, lapack_int* iseed, double* work,
+                    lapack_int *info ); /* double counterpart of LAPACK_slagsy */
+void LAPACK_clagsy( lapack_int* n, lapack_int* k, const float* d,
+                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
+                    lapack_complex_float* work, lapack_int *info ); /* complex float a and work; d stays real float */
+void LAPACK_zlagsy( lapack_int* n, lapack_int* k, const double* d,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_int* iseed, lapack_complex_double* work,
+                    lapack_int *info ); /* complex double a and work; d stays real double */
+void LAPACK_slapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    float* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_dlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    double* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_clapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    lapack_complex_float* x, lapack_int* ldx, lapack_int* k );
+void LAPACK_zlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
+                    lapack_complex_double* x, lapack_int* ldx, lapack_int* k );
+float LAPACK_slapy2( float* x, float* y );
+double LAPACK_dlapy2( double* x, double* y );
+float LAPACK_slapy3( float* x, float* y, float* z );
+double LAPACK_dlapy3( double* x, double* y, double* z );
+void LAPACK_slartgp( float* f, float* g, float* cs, float* sn, float* r );
+void LAPACK_dlartgp( double* f, double* g, double* cs, double* sn, double* r );
+void LAPACK_slartgs( float* x, float* y, float* sigma, float* cs, float* sn );
+void LAPACK_dlartgs( double* x, double* y, double* sigma, double* cs,
+                     double* sn );
+// LAPACK 3.3.0
+void LAPACK_cbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* theta, float* phi,
+                    lapack_complex_float* u1, lapack_int* ldu1,
+                    lapack_complex_float* u2, lapack_int* ldu2,
+                    lapack_complex_float* v1t, lapack_int* ldv1t,
+                    lapack_complex_float* v2t, lapack_int* ldv2t,
+                    float* b11d, float* b11e, float* b12d,
+                    float* b12e, float* b21d, float* b21e,
+                    float* b22d, float* b22e, float* rwork,
+                    lapack_int* lrwork , lapack_int *info );
+void LAPACK_cheswapr( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_chetri2( char* uplo, lapack_int* n,
+                     lapack_complex_float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_chetri2x( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
+void LAPACK_chetrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_csyconv( char* uplo, char* way,
+                     lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_csyswapr( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_csytri2( char* uplo, lapack_int* n,
+                     lapack_complex_float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_csytri2x( char* uplo, lapack_int* n,
+                      lapack_complex_float* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
+void LAPACK_csytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const lapack_complex_float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work , lapack_int *info );
+void LAPACK_cunbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    lapack_complex_float* x11, lapack_int* ldx11,
+                    lapack_complex_float* x12, lapack_int* ldx12,
+                    lapack_complex_float* x21, lapack_int* ldx21,
+                    lapack_complex_float* x22, lapack_int* ldx22,
+                    float* theta, float* phi,
+                    lapack_complex_float* taup1,
+                    lapack_complex_float* taup2,
+                    lapack_complex_float* tauq1,
+                    lapack_complex_float* tauq2,
+                    lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_cuncsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, lapack_complex_float* x11,
+                    lapack_int* ldx11, lapack_complex_float* x12,
+                    lapack_int* ldx12, lapack_complex_float* x21,
+                    lapack_int* ldx21, lapack_complex_float* x22,
+                    lapack_int* ldx22, float* theta,
+                    lapack_complex_float* u1, lapack_int* ldu1,
+                    lapack_complex_float* u2, lapack_int* ldu2,
+                    lapack_complex_float* v1t, lapack_int* ldv1t,
+                    lapack_complex_float* v2t, lapack_int* ldv2t,
+                    lapack_complex_float* work, lapack_int* lwork,
+                    float* rwork, lapack_int* lrwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_dbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* theta, double* phi, double* u1,
+                    lapack_int* ldu1, double* u2, lapack_int* ldu2,
+                    double* v1t, lapack_int* ldv1t, double* v2t,
+                    lapack_int* ldv2t, double* b11d, double* b11e,
+                    double* b12d, double* b12e, double* b21d,
+                    double* b21e, double* b22d, double* b22e,
+                    double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_dorbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* x11, lapack_int* ldx11, double* x12,
+                    lapack_int* ldx12, double* x21, lapack_int* ldx21,
+                    double* x22, lapack_int* ldx22, double* theta,
+                    double* phi, double* taup1, double* taup2,
+                    double* tauq1, double* tauq2, double* work,
+                    lapack_int* lwork , lapack_int *info );
+void LAPACK_dorcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, double* x11, lapack_int* ldx11,
+                    double* x12, lapack_int* ldx12, double* x21,
+                    lapack_int* ldx21, double* x22, lapack_int* ldx22,
+                    double* theta, double* u1, lapack_int* ldu1,
+                    double* u2, lapack_int* ldu2, double* v1t,
+                    lapack_int* ldv1t, double* v2t, lapack_int* ldv2t,
+                    double* work, lapack_int* lwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_dsyconv( char* uplo, char* way,
+                     lapack_int* n, double* a, lapack_int* lda,
+                     const lapack_int* ipiv, double* work , lapack_int *info );
+void LAPACK_dsyswapr( char* uplo, lapack_int* n,
+                      double* a, lapack_int* i1, lapack_int* i2 );
+void LAPACK_dsytri2( char* uplo, lapack_int* n,
+                     double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     double* work, lapack_int* lwork , lapack_int *info ); /* work is real for the real-precision routine */
+void LAPACK_dsytri2x( char* uplo, lapack_int* n,
+                      double* a, lapack_int* lda,
+                      const lapack_int* ipiv, double* work,
+                      lapack_int* nb , lapack_int *info );
+void LAPACK_dsytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const double* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     double* b, lapack_int* ldb, double* work , lapack_int *info );
+void LAPACK_sbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* theta, float* phi, float* u1,
+                    lapack_int* ldu1, float* u2, lapack_int* ldu2,
+                    float* v1t, lapack_int* ldv1t, float* v2t,
+                    lapack_int* ldv2t, float* b11d, float* b11e,
+                    float* b12d, float* b12e, float* b21d,
+                    float* b21e, float* b22d, float* b22e,
+                    float* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_sorbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    float* x11, lapack_int* ldx11, float* x12,
+                    lapack_int* ldx12, float* x21, lapack_int* ldx21,
+                    float* x22, lapack_int* ldx22, float* theta,
+                    float* phi, float* taup1, float* taup2,
+                    float* tauq1, float* tauq2, float* work,
+                    lapack_int* lwork , lapack_int *info );
+void LAPACK_sorcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, float* x11, lapack_int* ldx11,
+                    float* x12, lapack_int* ldx12, float* x21,
+                    lapack_int* ldx21, float* x22, lapack_int* ldx22,
+                    float* theta, float* u1, lapack_int* ldu1,
+                    float* u2, lapack_int* ldu2, float* v1t,
+                    lapack_int* ldv1t, float* v2t, lapack_int* ldv2t,
+                    float* work, lapack_int* lwork,
+                    lapack_int* iwork , lapack_int *info );
+void LAPACK_ssyconv( char* uplo, char* way,
+                     lapack_int* n, float* a, lapack_int* lda,
+                     const lapack_int* ipiv, float* work , lapack_int *info );
+void LAPACK_ssyswapr( char* uplo, lapack_int* n,
+                      float* a, lapack_int* i1, lapack_int* i2 );
+void LAPACK_ssytri2( char* uplo, lapack_int* n,
+                     float* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     float* work, lapack_int* lwork , lapack_int *info ); /* work is real for the real-precision routine */
+void LAPACK_ssytri2x( char* uplo, lapack_int* n,
+                      float* a, lapack_int* lda,
+                      const lapack_int* ipiv, float* work,
+                      lapack_int* nb , lapack_int *info );
+void LAPACK_ssytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs, const float* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     float* b, lapack_int* ldb, float* work , lapack_int *info );
+void LAPACK_zbbcsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    double* theta, double* phi,
+                    lapack_complex_double* u1, lapack_int* ldu1,
+                    lapack_complex_double* u2, lapack_int* ldu2,
+                    lapack_complex_double* v1t, lapack_int* ldv1t,
+                    lapack_complex_double* v2t, lapack_int* ldv2t,
+                    double* b11d, double* b11e, double* b12d,
+                    double* b12e, double* b21d, double* b21e,
+                    double* b22d, double* b22e, double* rwork,
+                    lapack_int* lrwork , lapack_int *info );
+void LAPACK_zheswapr( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_zhetri2( char* uplo, lapack_int* n,
+                     lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zhetri2x( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
+void LAPACK_zhetrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zsyconv( char* uplo, char* way,
+                     lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zsyswapr( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* i1,
+                      lapack_int* i2 );
+void LAPACK_zsytri2( char* uplo, lapack_int* n,
+                     lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zsytri2x( char* uplo, lapack_int* n,
+                      lapack_complex_double* a, lapack_int* lda,
+                      const lapack_int* ipiv,
+                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
+void LAPACK_zsytrs2( char* uplo, lapack_int* n,
+                     lapack_int* nrhs,
+                     const lapack_complex_double* a, lapack_int* lda,
+                     const lapack_int* ipiv,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work , lapack_int *info );
+void LAPACK_zunbdb( char* trans, char* signs,
+                    lapack_int* m, lapack_int* p, lapack_int* q,
+                    lapack_complex_double* x11, lapack_int* ldx11,
+                    lapack_complex_double* x12, lapack_int* ldx12,
+                    lapack_complex_double* x21, lapack_int* ldx21,
+                    lapack_complex_double* x22, lapack_int* ldx22,
+                    double* theta, double* phi,
+                    lapack_complex_double* taup1,
+                    lapack_complex_double* taup2,
+                    lapack_complex_double* tauq1,
+                    lapack_complex_double* tauq2,
+                    lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
+void LAPACK_zuncsd( char* jobu1, char* jobu2,
+                    char* jobv1t, char* jobv2t, char* trans,
+                    char* signs, lapack_int* m, lapack_int* p,
+                    lapack_int* q, lapack_complex_double* x11,
+                    lapack_int* ldx11, lapack_complex_double* x12,
+                    lapack_int* ldx12, lapack_complex_double* x21,
+                    lapack_int* ldx21, lapack_complex_double* x22,
+                    lapack_int* ldx22, double* theta,
+                    lapack_complex_double* u1, lapack_int* ldu1,
+                    lapack_complex_double* u2, lapack_int* ldu2,
+                    lapack_complex_double* v1t, lapack_int* ldv1t,
+                    lapack_complex_double* v2t, lapack_int* ldv2t,
+                    lapack_complex_double* work, lapack_int* lwork,
+                    double* rwork, lapack_int* lrwork,
+                    lapack_int* iwork , lapack_int *info );
+// LAPACK 3.4.0
+void LAPACK_sgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb, const float* v,
+                     lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
+                     lapack_int* ldc, float* work, lapack_int *info );
+void LAPACK_dgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb, const double* v,
+                     lapack_int* ldv, const double* t, lapack_int* ldt,
+                     double* c, lapack_int* ldc, double* work,
+                     lapack_int *info );
+void LAPACK_cgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb,
+                     const lapack_complex_float* v, lapack_int* ldv,
+                     const lapack_complex_float* t, lapack_int* ldt,
+                     lapack_complex_float* c, lapack_int* ldc,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* nb,
+                     const lapack_complex_double* v, lapack_int* ldv,
+                     const lapack_complex_double* t, lapack_int* ldt,
+                     lapack_complex_double* c, lapack_int* ldc,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_sgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, float* a,
+                    lapack_int* lda, float* t, lapack_int* ldt, float* work,
+                    lapack_int *info );
+void LAPACK_dgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, double* a,
+                    lapack_int* lda, double* t, lapack_int* ldt, double* work,
+                    lapack_int *info );
+void LAPACK_cgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_zgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_sgeqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_dgeqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_cgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_zgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_sgeqrt3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_dgeqrt3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* t, lapack_int* ldt, lapack_int *info );
+void LAPACK_cgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_zgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_stpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const float* v, lapack_int* ldv, const float* t,
+                     lapack_int* ldt, float* a, lapack_int* lda, float* b,
+                     lapack_int* ldb, float* work, lapack_int *info );
+void LAPACK_dtpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const double* v, lapack_int* ldv, const double* t,
+                     lapack_int* ldt, double* a, lapack_int* lda, double* b,
+                     lapack_int* ldb, double* work, lapack_int *info );
+void LAPACK_ctpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const lapack_complex_float* v, lapack_int* ldv,
+                     const lapack_complex_float* t, lapack_int* ldt,
+                     lapack_complex_float* a, lapack_int* lda,
+                     lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* work, lapack_int *info );
+void LAPACK_ztpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
+                     lapack_int* k, lapack_int* l, lapack_int* nb,
+                     const lapack_complex_double* v, lapack_int* ldv,
+                     const lapack_complex_double* t, lapack_int* ldt,
+                     lapack_complex_double* a, lapack_int* lda,
+                     lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* work, lapack_int *info );
+void LAPACK_dtpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                    double* t, lapack_int* ldt, double* work,
+                    lapack_int *info );
+void LAPACK_ctpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb, /* b/ldb precede t/ldt, as in the Fortran routine */
+                    lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* work, lapack_int *info );
+void LAPACK_ztpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* work, lapack_int *info );
+void LAPACK_stpqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                     float* b, lapack_int* ldb, float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_dtpqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                     double* b, lapack_int* ldb, double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_ctpqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                     lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                     lapack_complex_float* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_ztpqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                     lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                     lapack_complex_double* t, lapack_int* ldt,
+                     lapack_int *info );
+void LAPACK_stprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const float* v, lapack_int* ldv, const float* t,
+                    lapack_int* ldt, float* a, lapack_int* lda, float* b,
+                    lapack_int* ldb, const float* mywork,
+                    lapack_int* myldwork );
+void LAPACK_dtprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const double* v, lapack_int* ldv, const double* t,
+                    lapack_int* ldt, double* a, lapack_int* lda, double* b,
+                    lapack_int* ldb, const double* mywork,
+                    lapack_int* myldwork );
+void LAPACK_ctprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const lapack_complex_float* v, lapack_int* ldv,
+                    const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* b, lapack_int* ldb,
+                    const lapack_complex_float* mywork, lapack_int* myldwork ); /* work array holds complex elements */
+void LAPACK_ztprfb( char* side, char* trans, char* direct, char* storev,
+                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                    const lapack_complex_double* v, lapack_int* ldv,
+                    const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* b, lapack_int* ldb,
+                    const lapack_complex_double* mywork, lapack_int* myldwork ); /* work array holds complex elements */
+// LAPACK 3.X.X
+void LAPACK_csyr( char* uplo, lapack_int* n, lapack_complex_float* alpha,
+                      const lapack_complex_float* x, lapack_int* incx,
+                      lapack_complex_float* a, lapack_int* lda );
+void LAPACK_zsyr( char* uplo, lapack_int* n, lapack_complex_double* alpha,
+                      const lapack_complex_double* x, lapack_int* incx,
+                      lapack_complex_double* a, lapack_int* lda );
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _LAPACKE_H_ */
+
+#endif /* _MKL_LAPACKE_H_ */
diff --git a/Eigen/src/misc/lapacke_mangling.h b/Eigen/src/misc/lapacke_mangling.h
new file mode 100644
index 000000000..6211fd144
--- /dev/null
+++ b/Eigen/src/misc/lapacke_mangling.h
@@ -0,0 +1,17 @@
+#ifndef LAPACK_HEADER_INCLUDED
+#define LAPACK_HEADER_INCLUDED
+
+#ifndef LAPACK_GLOBAL /* keep any LAPACK_GLOBAL that is already defined */
+#if defined(LAPACK_GLOBAL_PATTERN_LC) || defined(ADD_) /* first argument with a trailing underscore */
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#elif defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER) /* second argument, unchanged */
+#define LAPACK_GLOBAL(lcname,UCNAME)  UCNAME
+#elif defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE) /* first argument, unchanged */
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname
+#else /* no pattern macro defined: same expansion as the first branch */
+#define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
+#endif
+#endif /* LAPACK_GLOBAL */
+
+#endif /* LAPACK_HEADER_INCLUDED */
+
diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 5694592d6..62fb303d9 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -1,13 +1,14 @@
+
 /** \returns an expression of the coefficient wise product of \c *this and \a other
   *
   * \sa MatrixBase::cwiseProduct
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)
 operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient wise quotient of \c *this and \a other
@@ -16,10 +17,10 @@ operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,typename OtherDerived::Scalar>, const Derived, const OtherDerived>
 operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar,typename OtherDerived::Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of \c *this and \a other
@@ -29,14 +30,14 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   *
   * \sa max()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(min,internal::scalar_min_op)
+EIGEN_MAKE_CWISE_BINARY_OP(min,min)
 
 /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
   *
   * \sa max()
   */
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived,
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 min
@@ -55,14 +56,14 @@ min
   *
   * \sa min()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(max,internal::scalar_max_op)
+EIGEN_MAKE_CWISE_BINARY_OP(max,max)
 
 /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
   *
   * \sa min()
   */
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived,
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 max
@@ -81,27 +82,38 @@ max
   * Example: \include Cwise_array_power_array.cpp
   * Output: \verbinclude Cwise_array_power_array.out
   */
-template<typename ExponentDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const CwiseBinaryOp<internal::scalar_binary_pow_op<Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
-pow(const ArrayBase<ExponentDerived>& exponents) const
-{
-  return CwiseBinaryOp<internal::scalar_binary_pow_op<Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
-    this->derived(),
-    exponents.derived()
-  );
-}
+EIGEN_MAKE_CWISE_BINARY_OP(pow,pow)
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(pow,pow)
+#else
+/** \returns an expression of the coefficients of \c *this raised to the constant power \a exponent
+  *
+  * \tparam T is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression.
+  *
+  * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
+  * unsupported module MatrixFunctions computes the matrix power.
+  *
+  * Example: \include Cwise_pow.cpp
+  * Output: \verbinclude Cwise_pow.out
+  *
+  * \sa ArrayBase::pow(ArrayBase), square(), cube(), exp(), log()
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_pow_op<Scalar,T>,Derived,Constant<T> > pow(const T& exponent) const;
+#endif
+
 
 // TODO code generating macros could be moved to Macros.h and could include generation of documentation
 #define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR) \
 template<typename OtherDerived> \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
 OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
 { \
-  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
+  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
 }\
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \
 OP(const Scalar& s) const { \
   return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
@@ -113,10 +125,10 @@ OP(const Scalar& s, const Derived& d) { \
 
 #define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR) \
 template<typename OtherDerived> \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
 OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
 { \
-  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
+  return CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
 } \
 EIGEN_DEVICE_FUNC \
 inline const RCmp ## RCOMPARATOR ## ReturnType \
@@ -199,48 +211,63 @@ EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ)
 #undef EIGEN_MAKE_CWISE_COMP_R_OP
 
 // scalar addition
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator+,sum)
+#else
 /** \returns an expression of \c *this with each coeff incremented by the constant \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
   *
   * Example: \include Cwise_plus.cpp
   * Output: \verbinclude Cwise_plus.out
   *
   * \sa operator+=(), operator-()
   */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>(derived(), internal::scalar_add_op<Scalar>(scalar));
-}
-
-EIGEN_DEVICE_FUNC
-friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
-{
-  return other + scalar;
-}
+template<typename T>
+const CwiseBinaryOp<internal::scalar_sum_op<Scalar,T>,Derived,Constant<T> > operator+(const T& scalar) const;
+/** \returns an expression of \a expr with each coeff incremented by the constant \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_sum_op<T,Scalar>,Constant<T>,Derived> operator+(const T& scalar, const StorageBaseType& expr);
+#endif
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator-,difference)
+#else
 /** \returns an expression of \c *this with each coeff decremented by the constant \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
   *
   * Example: \include Cwise_minus.cpp
   * Output: \verbinclude Cwise_minus.out
   *
-  * \sa operator+(), operator-=()
+  * \sa operator+(), operator-=()
   */
-EIGEN_DEVICE_FUNC
-inline const CwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
-operator-(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>(derived(), internal::scalar_sub_op<Scalar>(scalar));;
-}
+template<typename T>
+const CwiseBinaryOp<internal::scalar_difference_op<Scalar,T>,Derived,Constant<T> > operator-(const T& scalar) const;
+/** \returns an expression of the constant matrix of value \a scalar decremented by the coefficients of \a expr
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_difference_op<T,Scalar>,Constant<T>,Derived> operator-(const T& scalar, const StorageBaseType& expr);
+#endif
 
-EIGEN_DEVICE_FUNC
-friend inline const CwiseUnaryOp<internal::scalar_rsub_op<Scalar>, const Derived>
-operator-(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
-{
-  return CwiseUnaryOp<internal::scalar_rsub_op<Scalar>, const Derived>(other.derived(), internal::scalar_rsub_op<Scalar>(scalar));;
-}
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(operator/,quotient)
+#else
+  /**
+    * \brief Component-wise division of the scalar \a s by array elements of \a a.
+    *
+    * \tparam T is the scalar type of \a s. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
+    */
+  template<typename T> friend
+  inline const CwiseBinaryOp<internal::scalar_quotient_op<T,Scalar>,Constant<T>,Derived>
+  operator/(const T& s,const StorageBaseType& a);
+#endif
 
 /** \returns an expression of the coefficient-wise && operator of *this and \a other
   *
@@ -298,3 +325,46 @@ operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
                       THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
   return CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>(derived(),other.derived());
 }
+
+// NOTE disabled until we agree on argument order
+#if 0
+/** \cpp11 \returns an expression of the coefficient-wise polygamma function.
+  *
+  * \specialfunctions_module
+  *
+  * It returns the \a n -th derivative of the digamma(psi) evaluated at \c *this.
+  *
+  * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+  *
+  * \sa Eigen::polygamma()
+  */
+template<typename DerivedN>
+inline const CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>
+polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
+{
+  return CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>(n.derived(), this->derived());
+}
+#endif
+
+/** \returns an expression of the coefficient-wise zeta function.
+  *
+  * \specialfunctions_module
+  *
+  * It returns the Riemann zeta function of two arguments \c *this and \a q:
+  *
+  * \param *this is the exponent, it must be > 1
+  * \param q is the shift, it must be > 0
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types, the user has
+  * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+  *
+  * This method is an alias for zeta(*this,q);
+  *
+  * \sa Eigen::zeta()
+  */
+template<typename DerivedQ>
+inline const CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ>
+zeta(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedQ> &q) const
+{
+  return CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ>(this->derived(), q.derived());
+}
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 56c71172c..ebaa3f192 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -11,6 +11,7 @@ typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> Boo
 
 typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
 typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
+typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
 typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
 typedef CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived> CosReturnType;
 typedef CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived> SinReturnType;
@@ -21,13 +22,6 @@ typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturn
 typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
 typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
 typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
-typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
-typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
-typedef CwiseUnaryOp<internal::scalar_zeta_op<Scalar>, const Derived> ZetaReturnType;
-typedef CwiseUnaryOp<internal::scalar_polygamma_op<Scalar>, const Derived> PolygammaReturnType;
-typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
-typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
-typedef CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> PowReturnType;
 typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
 typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
 typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
@@ -42,7 +36,7 @@ typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> IsFini
   * Example: \include Cwise_abs.cpp
   * Output: \verbinclude Cwise_abs.out
   *
-  * \sa abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs">Math functions</a>, abs2()
   */
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const AbsReturnType
@@ -70,7 +64,7 @@ arg() const
   * Example: \include Cwise_abs2.cpp
   * Output: \verbinclude Cwise_abs2.out
   *
-  * \sa abs(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs2">Math functions</a>, abs(), square()
   */
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const Abs2ReturnType
@@ -87,7 +81,7 @@ abs2() const
   * Example: \include Cwise_exp.cpp
   * Output: \verbinclude Cwise_exp.out
   *
-  * \sa pow(), log(), sin(), cos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, pow(), log(), sin(), cos()
   */
 EIGEN_DEVICE_FUNC
 inline const ExpReturnType
@@ -104,7 +98,7 @@ exp() const
   * Example: \include Cwise_log.cpp
   * Output: \verbinclude Cwise_log.out
   *
-  * \sa exp()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, exp()
   */
 EIGEN_DEVICE_FUNC
 inline const LogReturnType
@@ -113,6 +107,20 @@ log() const
   return LogReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise logarithm of 1 plus \c *this.
+  *
+  * In exact arithmetic, \c x.log1p() is equivalent to \c (x+1).log(),
+  * however, with finite precision, this function is much more accurate when \c x is close to zero.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log1p">Math functions</a>, log()
+  */
+EIGEN_DEVICE_FUNC
+inline const Log1pReturnType
+log1p() const
+{
+  return Log1pReturnType(derived());
+}
+
 /** \returns an expression of the coefficient-wise base-10 logarithm of *this.
   *
   * This function computes the coefficient-wise base-10 logarithm.
@@ -120,7 +128,7 @@ log() const
   * Example: \include Cwise_log10.cpp
   * Output: \verbinclude Cwise_log10.out
   *
-  * \sa log()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log10">Math functions</a>, log()
   */
 EIGEN_DEVICE_FUNC
 inline const Log10ReturnType
@@ -137,7 +145,7 @@ log10() const
   * Example: \include Cwise_sqrt.cpp
   * Output: \verbinclude Cwise_sqrt.out
   *
-  * \sa pow(), square()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square()
   */
 EIGEN_DEVICE_FUNC
 inline const SqrtReturnType
@@ -187,7 +195,7 @@ sign() const
   * Example: \include Cwise_cos.cpp
   * Output: \verbinclude Cwise_cos.out
   *
-  * \sa sin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cos">Math functions</a>, sin(), acos()
   */
 EIGEN_DEVICE_FUNC
 inline const CosReturnType
@@ -205,7 +213,7 @@ cos() const
   * Example: \include Cwise_sin.cpp
   * Output: \verbinclude Cwise_sin.out
   *
-  * \sa cos(), asin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sin">Math functions</a>, cos(), asin()
   */
 EIGEN_DEVICE_FUNC
 inline const SinReturnType
@@ -219,7 +227,7 @@ sin() const
   * Example: \include Cwise_tan.cpp
   * Output: \verbinclude Cwise_tan.out
   *
-  * \sa cos(), sin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tan">Math functions</a>, cos(), sin()
   */
 EIGEN_DEVICE_FUNC
 inline const TanReturnType
@@ -233,8 +241,9 @@ tan() const
   * Example: \include Cwise_atan.cpp
   * Output: \verbinclude Cwise_atan.out
   *
-  * \sa tan(), asin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atan">Math functions</a>, tan(), asin(), acos()
   */
+EIGEN_DEVICE_FUNC
 inline const AtanReturnType
 atan() const
 {
@@ -246,7 +255,7 @@ atan() const
   * Example: \include Cwise_acos.cpp
   * Output: \verbinclude Cwise_acos.out
   *
-  * \sa cos(), asin()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acos">Math functions</a>, cos(), asin()
   */
 EIGEN_DEVICE_FUNC
 inline const AcosReturnType
@@ -260,7 +269,7 @@ acos() const
   * Example: \include Cwise_asin.cpp
   * Output: \verbinclude Cwise_asin.out
   *
-  * \sa sin(), acos()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asin">Math functions</a>, sin(), acos()
   */
 EIGEN_DEVICE_FUNC
 inline const AsinReturnType
@@ -274,8 +283,9 @@ asin() const
   * Example: \include Cwise_tanh.cpp
   * Output: \verbinclude Cwise_tanh.out
   *
-  * \sa tan(), sinh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tanh">Math functions</a>, tan(), sinh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const TanhReturnType
 tanh() const
 {
@@ -287,8 +297,9 @@ tanh() const
   * Example: \include Cwise_sinh.cpp
   * Output: \verbinclude Cwise_sinh.out
   *
-  * \sa sin(), tanh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sinh">Math functions</a>, sin(), tanh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const SinhReturnType
 sinh() const
 {
@@ -300,99 +311,15 @@ sinh() const
   * Example: \include Cwise_cosh.cpp
   * Output: \verbinclude Cwise_cosh.out
   *
-  * \sa tan(), sinh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, cos(), tanh(), sinh()
   */
+EIGEN_DEVICE_FUNC
 inline const CoshReturnType
 cosh() const
 {
   return CoshReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise ln(|gamma(*this)|).
- *
- * Example: \include Cwise_lgamma.cpp
- * Output: \verbinclude Cwise_lgamma.out
- *
- * \sa cos(), sin(), tan()
- */
-inline const LgammaReturnType
-lgamma() const
-{
-  return LgammaReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
- *
- * \sa cos(), sin(), tan()
- */
-inline const DigammaReturnType
-digamma() const
-{
-  return DigammaReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise zeta function.
- */
-inline const ZetaReturnType
-zeta() const
-{
-    return ZetaReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise polygamma function.
- */
-inline const PolygammaReturnType
-polygamma() const
-{
-    return PolygammaReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise Gauss error
- * function of *this.
- *
- * Example: \include Cwise_erf.cpp
- * Output: \verbinclude Cwise_erf.out
- *
- * \sa cos(), sin(), tan()
- */
-inline const ErfReturnType
-erf() const
-{
-  return ErfReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise Complementary error
- * function of *this.
- *
- * Example: \include Cwise_erfc.cpp
- * Output: \verbinclude Cwise_erfc.out
- *
- * \sa cos(), sin(), tan()
- */
-inline const ErfcReturnType
-erfc() const
-{
-  return ErfcReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise power of *this to the given exponent.
-  *
-  * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
-  * unsupported module MatrixFunctions computes the matrix power.
-  *
-  * Example: \include Cwise_pow.cpp
-  * Output: \verbinclude Cwise_pow.out
-  *
-  * \sa exp(), log()
-  */
-EIGEN_DEVICE_FUNC
-inline const PowReturnType
-pow(const Scalar& exponent) const
-{
-  return PowReturnType(derived(), internal::scalar_pow_op<Scalar>(exponent));
-}
-
-
 /** \returns an expression of the coefficient-wise inverse of *this.
   *
   * Example: \include Cwise_inverse.cpp
@@ -412,7 +339,7 @@ inverse() const
   * Example: \include Cwise_square.cpp
   * Output: \verbinclude Cwise_square.out
   *
-  * \sa operator/(), operator*(), abs2()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_squareE">Math functions</a>, abs2(), cube(), pow()
   */
 EIGEN_DEVICE_FUNC
 inline const SquareReturnType
@@ -426,7 +353,7 @@ square() const
   * Example: \include Cwise_cube.cpp
   * Output: \verbinclude Cwise_cube.out
   *
-  * \sa square(), pow()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cube">Math functions</a>, square(), pow()
   */
 EIGEN_DEVICE_FUNC
 inline const CubeReturnType
@@ -440,8 +367,9 @@ cube() const
   * Example: \include Cwise_round.cpp
   * Output: \verbinclude Cwise_round.out
   *
-  * \sa ceil(), floor()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_round">Math functions</a>, ceil(), floor()
   */
+EIGEN_DEVICE_FUNC
 inline const RoundReturnType
 round() const
 {
@@ -453,8 +381,9 @@ round() const
   * Example: \include Cwise_floor.cpp
   * Output: \verbinclude Cwise_floor.out
   *
-  * \sa ceil(), round()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_floor">Math functions</a>, ceil(), round()
   */
+EIGEN_DEVICE_FUNC
 inline const FloorReturnType
 floor() const
 {
@@ -466,8 +395,9 @@ floor() const
   * Example: \include Cwise_ceil.cpp
   * Output: \verbinclude Cwise_ceil.out
   *
-  * \sa floor(), round()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ceil">Math functions</a>, floor(), round()
   */
+EIGEN_DEVICE_FUNC
 inline const CeilReturnType
 ceil() const
 {
@@ -481,6 +411,7 @@ ceil() const
   *
   * \sa isfinite(), isinf()
   */
+EIGEN_DEVICE_FUNC
 inline const IsNaNReturnType
 isNaN() const
 {
@@ -494,6 +425,7 @@ isNaN() const
   *
   * \sa isnan(), isfinite()
   */
+EIGEN_DEVICE_FUNC
 inline const IsInfReturnType
 isInf() const
 {
@@ -507,6 +439,7 @@ isInf() const
   *
   * \sa isnan(), isinf()
   */
+EIGEN_DEVICE_FUNC
 inline const IsFiniteReturnType
 isFinite() const
 {
@@ -530,3 +463,90 @@ operator!() const
                       THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
   return BooleanNotReturnType(derived());
 }
+
+
+// --- SpecialFunctions module ---
+
+typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
+typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+
+/** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_lgamma.cpp
+  * Output: \verbinclude Cwise_lgamma.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_lgamma">Math functions</a>, digamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const LgammaReturnType
+lgamma() const
+{
+  return LgammaReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
+  *
+  * \specialfunctions_module
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types,
+  * the user has to provide implementations of digamma(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_digamma">Math functions</a>, Eigen::digamma(), Eigen::polygamma(), lgamma()
+  */
+EIGEN_DEVICE_FUNC
+inline const DigammaReturnType
+digamma() const
+{
+  return DigammaReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Gauss error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_erf.cpp
+  * Output: \verbinclude Cwise_erf.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erf">Math functions</a>, erfc()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfReturnType
+erf() const
+{
+  return ErfReturnType(derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+  * function of *this.
+  *
+  * \specialfunctions_module
+  *
+  * Example: \include Cwise_erfc.cpp
+  * Output: \verbinclude Cwise_erfc.out
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
+  * type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erfc">Math functions</a>, erf()
+  */
+EIGEN_DEVICE_FUNC
+inline const ErfcReturnType
+erfc() const
+{
+  return ErfcReturnType(derived());
+}
diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h
index 632094e15..b76973613 100644
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h
@@ -10,28 +10,28 @@
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-/** \internal expression type of a column */
+/// \internal expression type of a column
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ColXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr;
-/** \internal expression type of a row */
+/// \internal expression type of a row
 typedef Block<Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowXpr;
 typedef const Block<const Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr;
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows
 typedef Block<Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowsBlockXpr;
 typedef const Block<const Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns
 template<int N> struct NColsBlockXpr { typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
 template<int N> struct ConstNColsBlockXpr { typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows
 template<int N> struct NRowsBlockXpr { typedef Block<Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
 template<int N> struct ConstNRowsBlockXpr { typedef const Block<const Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
-/** \internal expression of a block */
+/// \internal expression of a block
 typedef Block<Derived> BlockXpr;
 typedef const Block<const Derived> ConstBlockXpr;
-/** \internal expression of a block of fixed sizes */
+/// \internal expression of a block of fixed sizes
 template<int Rows, int Cols> struct FixedBlockXpr { typedef Block<Derived,Rows,Cols> Type; };
 template<int Rows, int Cols> struct ConstFixedBlockXpr { typedef Block<const Derived,Rows,Cols> Type; };
 
@@ -42,29 +42,31 @@ template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBloc
 
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns a dynamic-size expression of a block in *this.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  * \param blockRows the number of rows in the block
-  * \param blockCols the number of columns in the block
-  *
-  * Example: \include MatrixBase_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of a block in *this.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+/// \param blockRows the number of rows in the block
+/// \param blockCols the number of columns in the block
+///
+/// Example: \include MatrixBase_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int_int_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols)
 {
   return BlockXpr(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** This is the const version of block(Index,Index,Index,Index). */
+/// This is the const version of block(Index,Index,Index,Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
 {
@@ -74,39 +76,43 @@ inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows
 
 
 
-/** \returns a dynamic-size expression of a top-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a top-right corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr topRightCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of topRightCorner(Index, Index).*/
+/// This is the const version of topRightCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size top-right corner of *this.
-  *
-  * \tparam CRows the number of rows in the corner
-  * \tparam CCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
-  *
-  * \sa class Block, block<int,int>(Index,Index)
-  */
+/// \returns an expression of a fixed-size top-right corner of *this.
+///
+/// \tparam CRows the number of rows in the corner
+/// \tparam CCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block<int,int>(Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
@@ -114,7 +120,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/** This is the const version of topRightCorner<int, int>().*/
+/// This is the const version of topRightCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
@@ -122,30 +128,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() con
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/** \returns an expression of a top-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a top-right corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of topRightCorner<int, int>(Index, Index).*/
+/// This is the const version of topRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
 {
@@ -154,38 +162,42 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index
 
 
 
-/** \returns a dynamic-size expression of a top-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a top-left corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr topLeftCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
-/** This is the const version of topLeftCorner(Index, Index).*/
+/// This is the const version of topLeftCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size top-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size top-left corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
@@ -193,7 +205,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/** This is the const version of topLeftCorner<int, int>().*/
+/// This is the const version of topLeftCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
@@ -201,30 +213,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() cons
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/** \returns an expression of a top-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a top-left corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
-/** This is the const version of topLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of topLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
 {
@@ -233,38 +247,42 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index
 
 
 
-/** \returns a dynamic-size expression of a bottom-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a bottom-right corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr bottomRightCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of bottomRightCorner(Index, Index).*/
+/// This is the const version of bottomRightCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size bottom-right corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-right corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
@@ -272,7 +290,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>().*/
+/// This is the const version of bottomRightCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
@@ -280,30 +298,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/** \returns an expression of a bottom-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a bottom-right corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
 {
@@ -312,38 +332,42 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(In
 
 
 
-/** \returns a dynamic-size expression of a bottom-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a bottom-left corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr bottomLeftCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** This is the const version of bottomLeftCorner(Index, Index).*/
+/// This is the const version of bottomLeftCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size bottom-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-left corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
@@ -351,7 +375,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>().*/
+/// This is the const version of bottomLeftCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
@@ -359,30 +383,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() c
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/** \returns an expression of a bottom-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a bottom-left corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
 {
@@ -391,41 +417,45 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Ind
 
 
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_topRows_int.cpp
-  * Output: \verbinclude MatrixBase_topRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the top rows of *this.
+///
+/// \param n the number of rows in the block
+///
+/// Example: \include MatrixBase_topRows_int.cpp
+/// Output: \verbinclude MatrixBase_topRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline RowsBlockXpr topRows(Index n)
 {
   return RowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/** This is the const version of topRows(Index).*/
+/// This is the const version of topRows(Index).
 EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr topRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_topRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_topRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the top rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_topRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_topRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
@@ -433,7 +463,7 @@ inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
   return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
-/** This is the const version of topRows<int>().*/
+/// This is the const version of topRows<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
@@ -443,41 +473,45 @@ inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 
 
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_bottomRows_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the bottom rows of *this.
+///
+/// \param n the number of rows in the block
+///
+/// Example: \include MatrixBase_bottomRows_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline RowsBlockXpr bottomRows(Index n)
 {
   return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/** This is the const version of bottomRows(Index).*/
+/// This is the const version of bottomRows(Index).
 EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr bottomRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_bottomRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_bottomRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the bottom rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_bottomRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_bottomRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
@@ -485,7 +519,7 @@ inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
   return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
-/** This is the const version of bottomRows<int>().*/
+/// This is the const version of bottomRows<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
@@ -495,43 +529,47 @@ inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 
 
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block
-  *
-  * Example: \include DenseBase_middleRows_int.cpp
-  * Output: \verbinclude DenseBase_middleRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of rows of *this.
+///
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block
+///
+/// Example: \include DenseBase_middleRows_int.cpp
+/// Output: \verbinclude DenseBase_middleRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline RowsBlockXpr middleRows(Index startRow, Index n)
 {
   return RowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/** This is the const version of middleRows(Index,Index).*/
+/// This is the const version of middleRows(Index,Index).
 EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
 {
   return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleRows.cpp
-  * Output: \verbinclude DenseBase_template_int_middleRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleRows.cpp
+/// Output: \verbinclude DenseBase_template_int_middleRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
@@ -539,7 +577,7 @@ inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
   return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
-/** This is the const version of middleRows<int>().*/
+/// This is the const version of middleRows<int>(Index,Index).
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
@@ -549,41 +587,45 @@ inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n =
 
 
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_leftCols_int.cpp
-  * Output: \verbinclude MatrixBase_leftCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the left columns of *this.
+///
+/// \param n the number of columns in the block
+///
+/// Example: \include MatrixBase_leftCols_int.cpp
+/// Output: \verbinclude MatrixBase_leftCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline ColsBlockXpr leftCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/** This is the const version of leftCols(Index).*/
+/// This is the const version of leftCols(Index).
 EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr leftCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_leftCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_leftCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the left columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_leftCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_leftCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
@@ -591,7 +633,7 @@ inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
   return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
-/** This is the const version of leftCols<int>().*/
+/// This is the const version of leftCols<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
@@ -601,41 +643,45 @@ inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 
 
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_rightCols_int.cpp
-  * Output: \verbinclude MatrixBase_rightCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the right columns of *this.
+///
+/// \param n the number of columns in the block
+///
+/// Example: \include MatrixBase_rightCols_int.cpp
+/// Output: \verbinclude MatrixBase_rightCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline ColsBlockXpr rightCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/** This is the const version of rightCols(Index).*/
+/// This is the const version of rightCols(Index).
 EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr rightCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_rightCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_rightCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the right columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_rightCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_rightCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
@@ -643,7 +689,7 @@ inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
   return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
-/** This is the const version of rightCols<int>().*/
+/// This is the const version of rightCols<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
@@ -653,43 +699,47 @@ inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 
 
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \param startCol the index of the first column in the block
-  * \param numCols the number of columns in the block
-  *
-  * Example: \include DenseBase_middleCols_int.cpp
-  * Output: \verbinclude DenseBase_middleCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of columns of *this.
+///
+/// \param startCol the index of the first column in the block
+/// \param numCols the number of columns in the block
+///
+/// Example: \include DenseBase_middleCols_int.cpp
+/// Output: \verbinclude DenseBase_middleCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline ColsBlockXpr middleCols(Index startCol, Index numCols)
 {
   return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/** This is the const version of middleCols(Index,Index).*/
+/// This is the const version of middleCols(Index,Index).
 EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
 {
   return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param startCol the index of the first column in the block
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleCols.cpp
-  * Output: \verbinclude DenseBase_template_int_middleCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param startCol the index of the first column in the block
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleCols.cpp
+/// Output: \verbinclude DenseBase_template_int_middleCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
@@ -697,7 +747,7 @@ inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
   return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
-/** This is the const version of middleCols<int>().*/
+/// This is the const version of middleCols<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
@@ -707,22 +757,24 @@ inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n =
 
 
 
-/** \returns a fixed-size expression of a block in *this.
-  *
-  * The template parameters \a NRows and \a NCols are the number of
-  * rows and columns in the block.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  *
-  * Example: \include MatrixBase_block_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int.out
-  *
-  * \note since block is a templated member, the keyword template has to be used
-  * if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a fixed-size expression of a block in *this.
+///
+/// The template parameters \a NRows and \a NCols are the number of
+/// rows and columns in the block.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+///
+/// Example: \include MatrixBase_block_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int.out
+///
+/// \note since block is a templated member, the keyword template has to be used
+/// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int NRows, int NCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
@@ -730,7 +782,7 @@ inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index sta
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/** This is the const version of block<>(Index, Index). */
+/// This is the const version of block<>(Index, Index).
 template<int NRows, int NCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
@@ -738,25 +790,27 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/** \returns an expression of a block in *this.
-  *
-  * \tparam NRows number of rows in block as specified at compile-time
-  * \tparam NCols number of columns in block as specified at compile-time
-  * \param  startRow  the first row in the block
-  * \param  startCol  the first column in the block
-  * \param  blockRows number of rows in block as specified at run-time
-  * \param  blockCols number of columns in block as specified at run-time
-  *
-  * This function is mainly useful for blocks where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a blockRows should equal \a NRows unless
-  * \a NRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a block in *this.
+///
+/// \tparam NRows number of rows in block as specified at compile-time
+/// \tparam NCols number of columns in block as specified at compile-time
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in block as specified at run-time
+/// \param  blockCols number of columns in block as specified at run-time
+///
+/// This function is mainly useful for blocks where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a blockRows should equal \a NRows unless
+/// \a NRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int NRows, int NCols>
 inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                   Index blockRows, Index blockCols)
@@ -764,7 +818,7 @@ inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index sta
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** This is the const version of block<>(Index, Index, Index, Index). */
+/// This is the const version of block<>(Index, Index, Index, Index).
 template<int NRows, int NCols>
 inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                               Index blockRows, Index blockCols) const
@@ -772,60 +826,64 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_col.cpp
-  * Output: \verbinclude MatrixBase_col.out
-  *
-  * \sa row(), class Block */
+/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_col.cpp
+/// Output: \verbinclude MatrixBase_col.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa row(), class Block
 EIGEN_DEVICE_FUNC
 inline ColXpr col(Index i)
 {
   return ColXpr(derived(), i);
 }
 
-/** This is the const version of col(). */
+/// This is the const version of col().
 EIGEN_DEVICE_FUNC
 inline ConstColXpr col(Index i) const
 {
   return ConstColXpr(derived(), i);
 }
 
-/** \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_row.cpp
-  * Output: \verbinclude MatrixBase_row.out
-  *
-  * \sa col(), class Block */
+/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_row.cpp
+/// Output: \verbinclude MatrixBase_row.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa col(), class Block
 EIGEN_DEVICE_FUNC
 inline RowXpr row(Index i)
 {
   return RowXpr(derived(), i);
 }
 
-/** This is the const version of row(). */
+/// This is the const version of row().
 EIGEN_DEVICE_FUNC
 inline ConstRowXpr row(Index i) const
 {
   return ConstRowXpr(derived(), i);
 }
 
-/** \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
-  *
-  * \only_for_vectors
-  *
-  * \param start the first coefficient in the segment
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_segment_int_int.cpp
-  * Output: \verbinclude MatrixBase_segment_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, segment(Index)
-  */
+/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
+///
+/// \only_for_vectors
+///
+/// \param start the first coefficient in the segment
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_segment_int_int.cpp
+/// Output: \verbinclude MatrixBase_segment_int_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, segment(Index)
+///
 EIGEN_DEVICE_FUNC
 inline SegmentReturnType segment(Index start, Index n)
 {
@@ -834,7 +892,7 @@ inline SegmentReturnType segment(Index start, Index n)
 }
 
 
-/** This is the const version of segment(Index,Index).*/
+/// This is the const version of segment(Index,Index).
 EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType segment(Index start, Index n) const
 {
@@ -842,21 +900,21 @@ inline ConstSegmentReturnType segment(Index start, Index n) const
   return ConstSegmentReturnType(derived(), start, n);
 }
 
-/** \returns a dynamic-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_start_int.cpp
-  * Output: \verbinclude MatrixBase_start_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of the first coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_start_int.cpp
+/// Output: \verbinclude MatrixBase_start_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline SegmentReturnType head(Index n)
 {
@@ -864,7 +922,7 @@ inline SegmentReturnType head(Index n)
   return SegmentReturnType(derived(), 0, n);
 }
 
-/** This is the const version of head(Index).*/
+/// This is the const version of head(Index).
 EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType head(Index n) const
 {
@@ -872,21 +930,21 @@ inline ConstSegmentReturnType head(Index n) const
   return ConstSegmentReturnType(derived(), 0, n);
 }
 
-/** \returns a dynamic-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_end_int.cpp
-  * Output: \verbinclude MatrixBase_end_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of the last coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_end_int.cpp
+/// Output: \verbinclude MatrixBase_end_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline SegmentReturnType tail(Index n)
 {
@@ -894,7 +952,7 @@ inline SegmentReturnType tail(Index n)
   return SegmentReturnType(derived(), this->size() - n, n);
 }
 
-/** This is the const version of tail(Index).*/
+/// This is the const version of tail(Index).
 EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType tail(Index n) const
 {
@@ -902,22 +960,22 @@ inline ConstSegmentReturnType tail(Index n) const
   return ConstSegmentReturnType(derived(), this->size() - n, n);
 }
 
-/** \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param start the index of the first element in the segment
-  * \param n the number of coefficients in the segment as specified at compile-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_segment.cpp
-  * Output: \verbinclude MatrixBase_template_int_segment.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param start the index of the first element in the segment
+/// \param n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_segment.cpp
+/// Output: \verbinclude MatrixBase_template_int_segment.out
+///
+/// \sa class Block
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
@@ -926,7 +984,7 @@ inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N
   return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** This is the const version of segment<int>(Index).*/
+/// This is the const version of segment<int>(Index).
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
@@ -935,21 +993,21 @@ inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** \returns a fixed-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_start.cpp
-  * Output: \verbinclude MatrixBase_template_int_start.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the first coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_start.cpp
+/// Output: \verbinclude MatrixBase_template_int_start.out
+///
+/// \sa class Block
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
@@ -958,7 +1016,7 @@ inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
   return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** This is the const version of head<int>().*/
+/// This is the const version of head<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
@@ -967,21 +1025,21 @@ inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** \returns a fixed-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_end.cpp
-  * Output: \verbinclude MatrixBase_template_int_end.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the last coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_end.cpp
+/// Output: \verbinclude MatrixBase_template_int_end.out
+///
+/// \sa class Block
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
@@ -990,7 +1048,7 @@ inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
   return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
 }
 
-/** This is the const version of tail<int>.*/
+/// This is the const version of tail<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
diff --git a/Eigen/src/plugins/CMakeLists.txt b/Eigen/src/plugins/CMakeLists.txt
deleted file mode 100644
index 1a1d3ffbd..000000000
--- a/Eigen/src/plugins/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_plugins_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_plugins_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/plugins COMPONENT Devel
-  )
diff --git a/Eigen/src/plugins/CommonCwiseBinaryOps.h b/Eigen/src/plugins/CommonCwiseBinaryOps.h
index a8fa287c9..b51ee9e4c 100644
--- a/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseBinaryOps.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -16,7 +16,7 @@
   *
   * \sa class CwiseBinaryOp, operator-=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator-,internal::scalar_difference_op)
+EIGEN_MAKE_CWISE_BINARY_OP(operator-,difference)
 
 /** \returns an expression of the sum of \c *this and \a other
   *
@@ -24,7 +24,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator-,internal::scalar_difference_op)
   *
   * \sa class CwiseBinaryOp, operator+=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator+,internal::scalar_sum_op)
+EIGEN_MAKE_CWISE_BINARY_OP(operator+,sum)
 
 /** \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
   *
@@ -45,3 +45,33 @@ binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other, const Cu
   return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
 }
 
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator*,product)
+#else
+/** \returns an expression of \c *this scaled by the scalar factor \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_product_op<Scalar,T>,Derived,Constant<T> > operator*(const T& scalar) const;
+/** \returns an expression of \a expr scaled by the scalar factor \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T> friend
+const CwiseBinaryOp<internal::scalar_product_op<T,Scalar>,Constant<T>,Derived> operator*(const T& scalar, const StorageBaseType& expr);
+#endif
+
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/,quotient)
+#else
+/** \returns an expression of \c *this divided by the scalar value \a scalar
+  *
+  * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+  */
+template<typename T>
+const CwiseBinaryOp<internal::scalar_quotient_op<Scalar,T>,Derived,Constant<T> > operator/(const T& scalar) const;
+#endif
diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 050bce03c..89f4faaac 100644
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -12,12 +12,6 @@
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-/** \internal Represents a scalar multiple of an expression */
-typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> ScalarMultipleReturnType;
-typedef CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived> ScalarComplexMultipleReturnType;
-
-/** \internal Represents a quotient of an expression by a scalar*/
-typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> ScalarQuotient1ReturnType;
 /** \internal the return type of conjugate() */
 typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                     const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
@@ -39,65 +33,29 @@ typedef CwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> ImagReturn
 typedef CwiseUnaryView<internal::scalar_imag_ref_op<Scalar>, Derived> NonConstImagReturnType;
 
 typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> NegativeReturnType;
-//typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
 
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns an expression of the opposite of \c *this
-  */
+/// \returns an expression of the opposite of \c *this
+///
+EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
+///
 EIGEN_DEVICE_FUNC
 inline const NegativeReturnType
 operator-() const { return NegativeReturnType(derived()); }
 
 
-/** \returns an expression of \c *this scaled by the scalar factor \a scalar */
-EIGEN_DEVICE_FUNC
-inline const ScalarMultipleReturnType
-operator*(const Scalar& scalar) const
-{
-  return ScalarMultipleReturnType(derived(), internal::scalar_multiple_op<Scalar>(scalar));
-}
-
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-const ScalarMultipleReturnType operator*(const RealScalar& scalar) const;
-#endif
-
-/** \returns an expression of \c *this divided by the scalar value \a scalar */
-EIGEN_DEVICE_FUNC
-inline const ScalarQuotient1ReturnType
-operator/(const Scalar& scalar) const
-{
-  return ScalarQuotient1ReturnType(derived(), internal::scalar_quotient1_op<Scalar>(scalar));
-}
-
-/** Overloaded for efficient real matrix times complex scalar value */
-EIGEN_DEVICE_FUNC
-inline const ScalarComplexMultipleReturnType
-operator*(const std::complex<Scalar>& scalar) const
-{
-  return ScalarComplexMultipleReturnType(derived(), internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >(scalar));
-}
-
-EIGEN_DEVICE_FUNC
-inline friend const ScalarMultipleReturnType
-operator*(const Scalar& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-EIGEN_DEVICE_FUNC
-inline friend const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-operator*(const std::complex<Scalar>& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-
 template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
 
-/** \returns an expression of *this with the \a Scalar type casted to
-  * \a NewScalar.
-  *
-  * The template parameter \a NewScalar is the type we are casting the scalars to.
-  *
-  * \sa class CwiseUnaryOp
-  */
+/// \returns an expression of \c *this with the \a Scalar type cast to
+/// \a NewType.
+///
+/// The template parameter \a NewType is the type we are casting the scalars to.
+///
+EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
+///
+/// \sa class CwiseUnaryOp
+///
 template<typename NewType>
 EIGEN_DEVICE_FUNC
 typename CastXpr<NewType>::Type
@@ -106,9 +64,11 @@ cast() const
   return typename CastXpr<NewType>::Type(derived());
 }
 
-/** \returns an expression of the complex conjugate of \c *this.
-  *
-  * \sa adjoint() */
+/// \returns an expression of the complex conjugate of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
 EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
@@ -116,39 +76,45 @@ conjugate() const
   return ConjugateReturnType(derived());
 }
 
-/** \returns a read-only expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a read-only expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline RealReturnType
 real() const { return RealReturnType(derived()); }
 
-/** \returns an read-only expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns a read-only expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline const ImagReturnType
 imag() const { return ImagReturnType(derived()); }
 
-/** \brief Apply a unary operator coefficient-wise
-  * \param[in]  func  Functor implementing the unary operator
-  * \tparam  CustomUnaryOp Type of \a func  
-  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp_ptrfun.cpp
-  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
-  *
-  * Genuine functors allow for more possibilities, for instance it may contain a state.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \brief Apply a unary operator coefficient-wise
+/// \param[in]  func  Functor implementing the unary operator
+/// \tparam  CustomUnaryOp Type of \a func
+/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+///
+/// Example:
+/// \include class_CwiseUnaryOp_ptrfun.cpp
+/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+///
+/// Genuine functors allow for more possibilities; for instance, they may contain state.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
+///
+/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomUnaryOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
@@ -157,17 +123,19 @@ unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
   return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
 }
 
-/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The template parameter \a CustomUnaryOp is the type of the functor
-  * of the custom unary operator.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
+///
+/// \sa unaryExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomViewOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryView<CustomViewOp, const Derived>
@@ -176,16 +144,20 @@ unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
   return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
 }
 
-/** \returns a non const expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a non const expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline NonConstRealReturnType
 real() { return NonConstRealReturnType(derived()); }
 
-/** \returns a non const expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns a non const expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline NonConstImagReturnType
 imag() { return NonConstImagReturnType(derived()); }
diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
index 6dd2e1192..f1084abef 100644
--- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -19,10 +19,10 @@
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
+EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)
 cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise == operator of *this and \a other
@@ -74,10 +74,10 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const OtherDerived>
 cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of *this and scalar \a other
@@ -85,7 +85,7 @@ cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa class CwiseBinaryOp, min()
   */
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const ConstantReturnType>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar>, const Derived, const ConstantReturnType>
 cwiseMin(const Scalar &other) const
 {
   return cwiseMin(Derived::Constant(rows(), cols(), other));
@@ -100,10 +100,10 @@ cwiseMin(const Scalar &other) const
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const OtherDerived>
 cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise max of *this and scalar \a other
@@ -111,7 +111,7 @@ cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa class CwiseBinaryOp, min()
   */
 EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const ConstantReturnType>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar>, const Derived, const ConstantReturnType>
 cwiseMax(const Scalar &other) const
 {
   return cwiseMax(Derived::Constant(rows(), cols(), other));
@@ -133,7 +133,7 @@ cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
 
 /** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
   *
@@ -148,5 +148,5 @@ EIGEN_DEVICE_FUNC
 inline const CwiseScalarEqualReturnType
 cwiseEqual(const Scalar& s) const
 {
-  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,internal::cmp_EQ>());
+  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,Scalar,internal::cmp_EQ>());
 }
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index e16bb374b..b1be3d566 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -11,63 +11,75 @@
 // This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
 // This include MatrixBase and SparseMatrixBase.
 
+
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
 
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs.out
-  *
-  * \sa cwiseAbs2()
-  */
+/// \returns an expression of the coefficient-wise absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
+///
+/// \sa cwiseAbs2()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbsReturnType
 cwiseAbs() const { return CwiseAbsReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs2.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs2.out
-  *
-  * \sa cwiseAbs()
-  */
+/// \returns an expression of the coefficient-wise squared absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs2.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs2.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
+///
+/// \sa cwiseAbs()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
 cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include MatrixBase_cwiseSqrt.cpp
-  * Output: \verbinclude MatrixBase_cwiseSqrt.out
-  *
-  * \sa cwisePow(), cwiseSquare()
-  */
+/// \returns an expression of the coefficient-wise square root of *this.
+///
+/// Example: \include MatrixBase_cwiseSqrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseSqrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
+///
+/// \sa cwisePow(), cwiseSquare()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSqrtReturnType
 cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise signum of *this.
-  *
-  * Example: \include MatrixBase_cwiseSign.cpp
-  * Output: \verbinclude MatrixBase_cwiseSign.out
-  *
-  */
+/// \returns an expression of the coefficient-wise signum of *this.
+///
+/// Example: \include MatrixBase_cwiseSign.cpp
+/// Output: \verbinclude MatrixBase_cwiseSign.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSignReturnType
 cwiseSign() const { return CwiseSignReturnType(derived()); }
 
 
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
-  *
-  * \sa cwiseProduct()
-  */
+/// \returns an expression of the coefficient-wise inverse of *this.
+///
+/// Example: \include MatrixBase_cwiseInverse.cpp
+/// Output: \verbinclude MatrixBase_cwiseInverse.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
+///
+/// \sa cwiseProduct()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseInverseReturnType
 cwiseInverse() const { return CwiseInverseReturnType(derived()); }
+
+
diff --git a/bench/benchCholesky.cpp b/bench/benchCholesky.cpp
index 42b3e1285..9a8e7cf63 100644
--- a/bench/benchCholesky.cpp
+++ b/bench/benchCholesky.cpp
@@ -31,7 +31,7 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
   int rows = m.rows();
   int cols = m.cols();
 
-  int cost = 0;
+  double cost = 0;
   for (int j=0; j<rows; ++j)
   {
     int r = std::max(rows - j -1,0);
@@ -78,10 +78,10 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
   else
     std::cout << "fixed ";
   std::cout << covMat.rows() << " \t"
-            << (timerNoSqrt.value() * REPEAT) / repeats << "s "
-            << "(" << 1e-6 * cost*repeats/timerNoSqrt.value() << " MFLOPS)\t"
-            << (timerSqrt.value() * REPEAT) / repeats << "s "
-            << "(" << 1e-6 * cost*repeats/timerSqrt.value() << " MFLOPS)\n";
+            << (timerNoSqrt.best()) / repeats << "s "
+            << "(" << 1e-9 * cost*repeats/timerNoSqrt.best() << " GFLOPS)\t"
+            << (timerSqrt.best()) / repeats << "s "
+            << "(" << 1e-9 * cost*repeats/timerSqrt.best() << " GFLOPS)\n";
 
 
   #ifdef BENCH_GSL
@@ -119,13 +119,13 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m)
 
 int main(int argc, char* argv[])
 {
-  const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,0};
-  std::cout << "size            no sqrt                           standard";
+  const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,1500,0};
+  std::cout << "size            LDLT                            LLT";
 //   #ifdef BENCH_GSL
 //   std::cout << "       GSL (standard + double + ATLAS)  ";
 //   #endif
   std::cout << "\n";
-  for (uint i=0; dynsizes[i]>0; ++i)
+  for (int i=0; dynsizes[i]>0; ++i)
     benchLLT(Matrix<Scalar,Dynamic,Dynamic>(dynsizes[i],dynsizes[i]));
 
   benchLLT(Matrix<Scalar,2,2>());
diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt
index f8b1b2ec3..e99a0855c 100644
--- a/bench/btl/libs/blaze/CMakeLists.txt
+++ b/bench/btl/libs/blaze/CMakeLists.txt
@@ -1,10 +1,13 @@
 
 find_package(BLAZE)
-find_package(Boost)
+find_package(Boost COMPONENTS system)
 if (BLAZE_FOUND AND Boost_FOUND)
   include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
   btl_add_bench(btl_blaze main.cpp)
+  # Note: The newest Blaze version requires C++14.
+  # Ideally, we should set this depending on the version of Blaze we found.
+  set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
   if(BUILD_btl_blaze)
-    target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
+    target_link_libraries(btl_blaze ${Boost_LIBRARIES})
   endif()
 endif ()
diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp
index aa4ff011f..24343dcd8 100644
--- a/bench/dense_solvers.cpp
+++ b/bench/dense_solvers.cpp
@@ -2,47 +2,74 @@
 #include "BenchTimer.h"
 #include <Eigen/Dense>
 #include <map>
+#include <vector>
 #include <string>
+#include <sstream>
 using namespace Eigen;
 
-std::map<std::string,Array<float,1,4> > results;
+std::map<std::string,Array<float,1,8,DontAlign|RowMajor> > results;
+std::vector<std::string> labels;
+std::vector<Array2i> sizes;
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute_norm_equation(Solver &solver, const MatrixType &A) {
+  if(A.rows()!=A.cols())
+    solver.compute(A.transpose()*A);
+  else
+    solver.compute(A);
+}
+
+template<typename Solver,typename MatrixType>
+EIGEN_DONT_INLINE
+void compute(Solver &solver, const MatrixType &A) {
+  solver.compute(A);
+}
 
 template<typename Scalar,int Size>
-void bench(int id, int size = Size)
+void bench(int id, int rows, int size = Size)
 {
-  typedef Matrix<Scalar,Size,Size> Mat;
-  Mat A(size,size);
+  typedef Matrix<Scalar,Dynamic,Size> Mat;
+  typedef Matrix<Scalar,Dynamic,Dynamic> MatDyn;
+  typedef Matrix<Scalar,Size,Size> MatSquare;
+  Mat A(rows,size);
   A.setRandom();
-  A = A*A.adjoint();
+  if(rows==size)
+    A = A*A.adjoint();
   BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd;
+
+  int svd_opt = ComputeThinU|ComputeThinV;
   
-  int tries = 3;
+  int tries = 5;
   int rep = 1000/size;
   if(rep==0) rep = 1;
 //   rep = rep*rep;
   
-  LLT<Mat> llt(A);
-  LDLT<Mat> ldlt(A);
-  PartialPivLU<Mat> lu(A);
-  FullPivLU<Mat> fplu(A);
-  HouseholderQR<Mat> qr(A);
-  ColPivHouseholderQR<Mat> cpqr(A);
-  CompleteOrthogonalDecomposition<Mat> cod(A);
-  FullPivHouseholderQR<Mat> fpqr(A);
-  JacobiSVD<Mat> jsvd(A.rows(),A.cols());
-  BDCSVD<Mat> bdcsvd(A.rows(),A.cols());
+  LLT<MatSquare> llt(size);
+  LDLT<MatSquare> ldlt(size);
+  PartialPivLU<MatSquare> lu(size);
+  FullPivLU<MatSquare> fplu(size,size);
+  HouseholderQR<Mat> qr(A.rows(),A.cols());
+  ColPivHouseholderQR<Mat> cpqr(A.rows(),A.cols());
+  CompleteOrthogonalDecomposition<Mat> cod(A.rows(),A.cols());
+  FullPivHouseholderQR<Mat> fpqr(A.rows(),A.cols());
+  JacobiSVD<MatDyn> jsvd(A.rows(),A.cols());
+  BDCSVD<MatDyn> bdcsvd(A.rows(),A.cols());
   
-  BENCH(t_llt, tries, rep, llt.compute(A));
-  BENCH(t_ldlt, tries, rep, ldlt.compute(A));
-  BENCH(t_lu, tries, rep, lu.compute(A));
-  BENCH(t_fplu, tries, rep, fplu.compute(A));
-  BENCH(t_qr, tries, rep, qr.compute(A));
-  BENCH(t_cpqr, tries, rep, cpqr.compute(A));
-  BENCH(t_cod, tries, rep, cod.compute(A));
-  BENCH(t_fpqr, tries, rep, fpqr.compute(A));
+  BENCH(t_llt, tries, rep, compute_norm_equation(llt,A));
+  BENCH(t_ldlt, tries, rep, compute_norm_equation(ldlt,A));
+  BENCH(t_lu, tries, rep, compute_norm_equation(lu,A));
+  if(size<=1000)
+    BENCH(t_fplu, tries, rep, compute_norm_equation(fplu,A));
+  BENCH(t_qr, tries, rep, compute(qr,A));
+  BENCH(t_cpqr, tries, rep, compute(cpqr,A));
+  BENCH(t_cod, tries, rep, compute(cod,A));
+  if(size*rows<=10000000)
+    BENCH(t_fpqr, tries, rep, compute(fpqr,A));
   if(size<500) // JacobiSVD is really too slow for too large matrices
-    BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV));
-  BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,ComputeFullU|ComputeFullV));
+    BENCH(t_jsvd, tries, rep, jsvd.compute(A,svd_opt));
+//   if(size*rows<=20000000)
+    BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,svd_opt));
   
   results["LLT"][id] = t_llt.best();
   results["LDLT"][id] = t_ldlt.best();
@@ -52,33 +79,108 @@ void bench(int id, int size = Size)
   results["ColPivHouseholderQR"][id] = t_cpqr.best();
   results["CompleteOrthogonalDecomposition"][id] = t_cod.best();
   results["FullPivHouseholderQR"][id] = t_fpqr.best();
-  results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0;
+  results["JacobiSVD"][id] = t_jsvd.best();
   results["BDCSVD"][id] = t_bdcsvd.best();
 }
 
+
 int main()
 {
+  labels.push_back("LLT");
+  labels.push_back("LDLT");
+  labels.push_back("PartialPivLU");
+  labels.push_back("FullPivLU");
+  labels.push_back("HouseholderQR");
+  labels.push_back("ColPivHouseholderQR");
+  labels.push_back("CompleteOrthogonalDecomposition");
+  labels.push_back("FullPivHouseholderQR");
+  labels.push_back("JacobiSVD");
+  labels.push_back("BDCSVD");
+
+  for(int i=0; i<labels.size(); ++i)
+    results[labels[i]].fill(-1);
+
   const int small = 8;
-  const int medium = 100;
-  const int large = 1000;
-  const int xl = 4000;
-  
-  bench<float,small>(0);
-  bench<float,Dynamic>(1,medium);
-  bench<float,Dynamic>(2,large);
-  bench<float,Dynamic>(3,xl);
-  
-  IOFormat fmt(3, 0, " \t", "\n", "", "");
-  
-  std::cout << "solver/size                           " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
-  std::cout << "LLT                             (ms)  " << (results["LLT"]/1000.).format(fmt) << "\n";
-  std::cout << "LDLT                             (%)  " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "PartialPivLU                     (%)  " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "FullPivLU                        (%)  " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "HouseholderQR                    (%)  " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "ColPivHouseholderQR              (%)  " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "CompleteOrthogonalDecomposition  (%)  " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "FullPivHouseholderQR             (%)  " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "JacobiSVD                        (%)  " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
-  std::cout << "BDCSVD                           (%)  " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
+  sizes.push_back(Array2i(small,small));
+  sizes.push_back(Array2i(100,100));
+  sizes.push_back(Array2i(1000,1000));
+  sizes.push_back(Array2i(4000,4000));
+  sizes.push_back(Array2i(10000,small));
+  sizes.push_back(Array2i(10000,100));
+  sizes.push_back(Array2i(10000,1000));
+  sizes.push_back(Array2i(10000,4000));
+
+  using namespace std;
+
+  for(int k=0; k<sizes.size(); ++k)
+  {
+    cout << sizes[k](0) << "x" << sizes[k](1) << "...\n";
+    bench<float,Dynamic>(k,sizes[k](0),sizes[k](1));
+  }
+
+  cout.width(32);
+  cout << "solver/size";
+  cout << "  ";
+  for(int k=0; k<sizes.size(); ++k)
+  {
+    std::stringstream ss;
+    ss << sizes[k](0) << "x" << sizes[k](1);
+    cout.width(10); cout << ss.str(); cout << " ";
+  }
+  cout << endl;
+
+
+  for(int i=0; i<labels.size(); ++i)
+  {
+    cout.width(32); cout << labels[i]; cout << "  ";
+    ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+    for(int k=0; k<sizes.size(); ++k)
+    {
+      cout.width(10);
+      if(r(k)>=1e6)  cout << "-";
+      else           cout << r(k);
+      cout << " ";
+    }
+    cout << endl;
+  }
+
+  // HTML output
+  cout << "<table class=\"manual\">" << endl;
+  cout << "<tr><th>solver/size</th>" << endl;
+  for(int k=0; k<sizes.size(); ++k)
+    cout << "  <th>" << sizes[k](0) << "x" << sizes[k](1) << "</th>";
+  cout << "</tr>" << endl;
+  for(int i=0; i<labels.size(); ++i)
+  {
+    cout << "<tr";
+    if(i%2==1) cout << " class=\"alt\"";
+    cout << "><td>" << labels[i] << "</td>";
+    ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f;
+    for(int k=0; k<sizes.size(); ++k)
+    {
+      if(r(k)>=1e6) cout << "<td>-</td>";
+      else
+      {
+        cout << "<td>" << r(k);
+        if(i>0)
+          cout << " (x" << numext::round(10.f*results[labels[i]](k)/results["LLT"](k))/10.f << ")";
+        if(i<4 && sizes[k](0)!=sizes[k](1))
+          cout << " <sup><a href=\"#note_ls\">*</a></sup>";
+        cout << "</td>";
+      }
+    }
+    cout << "</tr>" << endl;
+  }
+  cout << "</table>" << endl;
+
+//   cout << "LLT                             (ms)  " << (results["LLT"]*1000.).format(fmt) << "\n";
+//   cout << "LDLT                             (%)  " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "PartialPivLU                     (%)  " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "FullPivLU                        (%)  " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "HouseholderQR                    (%)  " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "ColPivHouseholderQR              (%)  " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "CompleteOrthogonalDecomposition  (%)  " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "FullPivHouseholderQR             (%)  " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "JacobiSVD                        (%)  " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+//   cout << "BDCSVD                           (%)  " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n";
 }
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
index fb3e48e99..af8eb9b8f 100644
--- a/bench/perf_monitoring/gemm/changesets.txt
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -42,6 +42,20 @@ before-evaluators
 6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
 6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
 7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+7015:8aad8f35c955   # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
+7016:a58d253e8c91   # Polish lookup tables generation
+7018:9b27294a8186   # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
+7019:c758b1e2c073   # Provide an empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
+7085:627e039fba68   # Bug 986: add support for coefficient-based product with 0 depth.
+7098:b6f1db9cf9ec   # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
 7591:09a8e2186610   # 3.3-alpha1
 7650:b0f3c8f43025   # help clang inlining
-
+#8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
+8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
+8985:d935df21a082   # Remove the rotating kernel.
+8988:6c2dc56e73b3   # Bug 256: enable vectorization with unaligned loads/stores.
+9148:b8b8c421e36c   # Relax mixing-type constraints for binary coefficient-wise operators
+9174:d228bc282ac9   # merge
+9212:c90098affa7b   # Fix performance regression introduced in changeset 8aad8f35c955
+9213:9f1c14e4694b   # Fix performance regression in dgemm introduced by changeset 81d53c711775
diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp
index b443218d7..6dc370155 100644
--- a/bench/perf_monitoring/gemm/lazy_gemm.cpp
+++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp
@@ -12,12 +12,13 @@ using namespace Eigen;
 typedef SCALAR Scalar;
 
 template<typename MatA, typename MatB, typename MatC>
-inline void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
+EIGEN_DONT_INLINE
+void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
 {
-  escape((void*)A.data());
-  escape((void*)B.data());
+//   escape((void*)A.data());
+//   escape((void*)B.data());
   C.noalias() += A.lazyProduct(B);
-  escape((void*)C.data());
+//   escape((void*)C.data());
 }
 
 template<int m, int n, int k, int TA>
diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh
index 4d6053501..cd3214ac9 100755
--- a/bench/perf_monitoring/gemm/make_plot.sh
+++ b/bench/perf_monitoring/gemm/make_plot.sh
@@ -25,7 +25,7 @@ echo "set xtics rotate 1" >> $WHAT.gnuplot
 echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
 echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
 
-col=`cat settings.txt | wc -l`
+col=`cat $bench"_settings.txt" | wc -l`
 echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
 echo " " >>  $WHAT.gnuplot
 
diff --git a/bench/perf_monitoring/gemm/run.sh b/bench/perf_monitoring/gemm/run.sh
index bfb4ecfac..9d6ee40bc 100755
--- a/bench/perf_monitoring/gemm/run.sh
+++ b/bench/perf_monitoring/gemm/run.sh
@@ -138,15 +138,15 @@ do
 done
 
 echo "Float:"
-cat $PREFIX"s"$bench.out"
-echo ""
+cat $PREFIX"s""$bench.out"
+echo " "
 
 echo "Double:"
-cat $PREFIX"d"$bench.out"
+cat $PREFIX"d""$bench.out"
 echo ""
 
 echo "Complex:"
-cat $PREFIX"c"$bench.out"
+cat $PREFIX"c""$bench.out"
 echo ""
 
 ./make_plot.sh $PREFIX"s"$bench $bench
diff --git a/bench/tensors/README b/bench/tensors/README
index 4398aa81b..803cb8ef8 100644
--- a/bench/tensors/README
+++ b/bench/tensors/README
@@ -1,12 +1,15 @@
-Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
+The tensor benchmark suite is made of several parts.
+
+The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
 
 To compile the floating point CPU benchmarks, simply call:
 g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
 
 To compile the floating point GPU benchmarks, simply call:
-nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu
+nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu
 
+We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
+nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
 
-To compile the half float GPU benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
-nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu
-
+Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call:
+g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
diff --git a/bench/tensors/contraction_benchmarks_cpu.cc b/bench/tensors/contraction_benchmarks_cpu.cc
new file mode 100644
index 000000000..f9e57ad47
--- /dev/null
+++ b/bench/tensors/contraction_benchmarks_cpu.cc
@@ -0,0 +1,39 @@
+#define EIGEN_USE_THREADS
+
+#include <string>
+
+#include "tensor_benchmarks.h"
+
+#define CREATE_THREAD_POOL(threads)             \
+Eigen::ThreadPool pool(threads);                \
+Eigen::ThreadPoolDevice device(&pool, threads);
+
+
+// Contractions for number of threads ranging from 1 to 32
+// Dimensions are Rows, Cols, Depth
+#define BM_ContractionCPU(D1, D2, D3)                                         \
+  static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \
+    StopBenchmarkTiming();                                                    \
+    CREATE_THREAD_POOL(Threads);                                              \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
+    suite.contraction(iters);                                                 \
+  }                                                                           \
+  BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32);
+
+
+// Vector Matrix and Matrix Vector products
+BM_ContractionCPU(1, 2000, 500);
+BM_ContractionCPU(2000, 1, 500);
+
+// Various skinny matrices
+BM_ContractionCPU(250, 3, 512);
+BM_ContractionCPU(1500, 3, 512);
+
+BM_ContractionCPU(512, 800, 4);
+BM_ContractionCPU(512, 80, 800);
+BM_ContractionCPU(512, 80, 13522);
+BM_ContractionCPU(1, 80, 13522);
+
+BM_ContractionCPU(3200, 512, 4);
+BM_ContractionCPU(3200, 512, 80);
+BM_ContractionCPU(3200, 80, 512);
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 62533a608..c2fb3dede 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -178,9 +178,14 @@ template <typename Device, typename T> class BenchmarkSuite {
     size_b[1] = m_;
     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
 
+#if defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
+                         Eigen::type2indexpair<2, 1> > paddings;
+#else
     Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
     paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
+#endif
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -368,7 +373,7 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
         b_, input_size);
     Eigen::array<TensorIndex, 0> output_size;
-    TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C(
+    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
         c_, output_size);
 
     StartBenchmarkTiming();
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 14876556e..65784d0d6 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -33,6 +33,7 @@ BM_FuncGPU(algebraicFunc);
 BM_FuncGPU(transcendentalFunc);
 BM_FuncGPU(rowReduction);
 BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
 
 
 // Contractions
diff --git a/blas/PackedTriangularMatrixVector.h b/blas/PackedTriangularMatrixVector.h
index e9886d56f..0039536a8 100644
--- a/blas/PackedTriangularMatrixVector.h
+++ b/blas/PackedTriangularMatrixVector.h
@@ -18,7 +18,7 @@ struct packed_triangular_matrix_vector_product;
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs>
 struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower     = (Mode & Lower)   ==Lower,
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
@@ -47,7 +47,7 @@ struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsS
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs>
 struct packed_triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor>
 {
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   enum {
     IsLower     = (Mode & Lower)   ==Lower,
     HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
diff --git a/blas/single.cpp b/blas/single.cpp
index 836e3eee2..20ea57d5c 100644
--- a/blas/single.cpp
+++ b/blas/single.cpp
@@ -19,4 +19,4 @@
 #include "level3_impl.h"
 
 float BLASFUNC(sdsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
-{ return *alpha + BLASFUNC(dsdot)(n, x, incx, y, incy); }
+{ return double(*alpha) + BLASFUNC(dsdot)(n, x, incx, y, incy); }
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 206f2d93d..f53f46087 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -440,6 +440,8 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
     set(${CNAME} "llvm-g++")
   elseif((ei_has_llvm) AND (ei_has_clang))
     set(${CNAME} "llvm-clang++")
+  elseif(ei_has_clang)
+    set(${CNAME} "clang++")
   elseif(ei_has_icpc)
     set(${CNAME} "icpc")
   elseif(ei_has_gpp OR ei_has_gcc)
diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake
index cea1afeab..9e9697860 100644
--- a/cmake/FindEigen3.cmake
+++ b/cmake/FindEigen3.cmake
@@ -66,16 +66,23 @@ if (EIGEN3_INCLUDE_DIR)
   set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
 
 else (EIGEN3_INCLUDE_DIR)
+  
+  # First check whether an Eigen3Config.cmake is available on the system;
+  # if one is found, this sets EIGEN3_INCLUDE_DIR and the rest of
+  # this script works as usual.
+  find_package(Eigen3 ${Eigen3_FIND_VERSION} NO_MODULE QUIET)
 
-  find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
-      HINTS
-      ENV EIGEN3_ROOT 
-      ENV EIGEN3_ROOT_DIR
-      PATHS
-      ${CMAKE_INSTALL_PREFIX}/include
-      ${KDE4_INCLUDE_DIR}
-      PATH_SUFFIXES eigen3 eigen
-    )
+  if(NOT EIGEN3_INCLUDE_DIR)
+    find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
+        HINTS
+        ENV EIGEN3_ROOT 
+        ENV EIGEN3_ROOT_DIR
+        PATHS
+        ${CMAKE_INSTALL_PREFIX}/include
+        ${KDE4_INCLUDE_DIR}
+        PATH_SUFFIXES eigen3 eigen
+      )
+  endif(NOT EIGEN3_INCLUDE_DIR)
 
   if(EIGEN3_INCLUDE_DIR)
     _eigen3_check_version()
diff --git a/cmake/FindSuperLU.cmake b/cmake/FindSuperLU.cmake
index e4142fe4d..f38146e06 100644
--- a/cmake/FindSuperLU.cmake
+++ b/cmake/FindSuperLU.cmake
@@ -17,7 +17,10 @@ find_path(SUPERLU_INCLUDES
   SRC
 )
 
-find_library(SUPERLU_LIBRARIES NAMES "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu" PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR} PATH_SUFFIXES lib)
+find_library(SUPERLU_LIBRARIES
+  NAMES "superlu_5.2.1" "superlu_5.2" "superlu_5.1.1" "superlu_5.1" "superlu_5.0" "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu"
+  PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR}
+  PATH_SUFFIXES lib)
 
 if(SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)
 
@@ -48,11 +51,25 @@ int main() {
 }"
 SUPERLU_HAS_CLEAN_ENUMS)
 
-if(SUPERLU_HAS_CLEAN_ENUMS)
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <slu_util.h>
+int main(void)
+{
+  GlobalLU_t glu;
+  return 0;
+}"
+SUPERLU_HAS_GLOBALLU_T)
+
+if(SUPERLU_HAS_GLOBALLU_T)
+  # at least 5.0
+  set(SUPERLU_VERSION_VAR "5.0")
+elseif(SUPERLU_HAS_CLEAN_ENUMS)
   # at least 4.3
   set(SUPERLU_VERSION_VAR "4.3")
 elseif(SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
-  # at least 4.3
+  # at least 4.0
   set(SUPERLU_VERSION_VAR "4.0")
 else()
   set(SUPERLU_VERSION_VAR "3.0")
diff --git a/doc/A05_PortingFrom2To3.dox b/doc/A05_PortingFrom2To3.dox
index 0dbddb976..51555f996 100644
--- a/doc/A05_PortingFrom2To3.dox
+++ b/doc/A05_PortingFrom2To3.dox
@@ -261,7 +261,7 @@ use it unless you are sure of what you are doing, i.e., you have rigourosly meas
 
 The EIGEN_ALIGN_128 macro has been renamed to EIGEN_ALIGN16. Don't be surprised, it's just that we switched to counting in bytes ;-)
 
-The EIGEN_DONT_ALIGN option still exists in Eigen 3, but it has a new cousin: EIGEN_DONT_ALIGN_STATICALLY. It allows to get rid of all static alignment issues while keeping alignment of dynamic-size heap-allocated arrays, thus keeping vectorization for dynamic-size objects.
+The \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN \endlink option still exists in Eigen 3, but it has a new cousin: \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN_STATICALLY\endlink. It allows you to get rid of all static alignment issues while keeping alignment of dynamic-size heap-allocated arrays. Vectorization of statically allocated arrays is still preserved (unless you define \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink =0), at the cost of unaligned memory stores.
 
 \section AlignedMap Aligned Map objects
 
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 4d01a0424..db413bc65 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -78,6 +78,8 @@ add_custom_target(
   COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/html/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/AsciiQuickReference.txt          ${CMAKE_CURRENT_BINARY_DIR}/html/
   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
@@ -88,6 +90,8 @@ add_custom_target(
   COMMAND ${CMAKE_COMMAND} -E make_directory ${Eigen_BINARY_DIR}/doc/html/unsupported
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/eigen_navtree_hacks.js           ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/Eigen_Silly_Professor_64x64.png  ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2pnode.png                    ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
+  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ftv2node.png                     ${CMAKE_CURRENT_BINARY_DIR}/html/unsupported/
   WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc
 )
 
diff --git a/doc/CoeffwiseMathFunctionsTable.dox b/doc/CoeffwiseMathFunctionsTable.dox
new file mode 100644
index 000000000..ac6e0bd31
--- /dev/null
+++ b/doc/CoeffwiseMathFunctionsTable.dox
@@ -0,0 +1,525 @@
+namespace Eigen {
+
+/** \eigenManualPage CoeffwiseMathFunctions Catalog of coefficient-wise math functions
+
+
+<!-- <span style="font-size:300%; color:red; font-weight: 900;">!WORK IN PROGRESS!</span> -->
+
+This table presents a catalog of the coefficient-wise math functions supported by %Eigen.
+In this table, \c a, \c b, refer to Array objects or expressions, and \c m refers to a linear algebra Matrix/Vector object. Standard scalar types are abbreviated as follows:
+  - \c int: \c i32
+  - \c float: \c f
+  - \c double: \c d
+  - \c std::complex<float>: \c cf
+  - \c std::complex<double>: \c cd
+
+For each row, the first column lists the equivalent calls for arrays, and for matrices when supported. Of course, all functions are available for matrices by first casting them as an array: \c m.array().
+
+The third column gives some hints on the underlying scalar implementation. In most cases, %Eigen does not implement the math function itself but relies on the STL for standard scalar types, or user-provided functions for custom scalar types.
+For instance, some simply call the respective function of the STL while preserving <a href="http://en.cppreference.com/w/cpp/language/adl">argument-dependent lookup</a> for custom types.
+The following:
+\code
+using std::foo;
+foo(a[i]);
+\endcode
+means that the STL's function \c std::foo will be potentially called if it is compatible with the underlying scalar type. If not, then the user must ensure that an overload of the function foo is available for the given scalar type (usually defined in the same namespace as the given scalar type).
+This also means that, unless specified, if the function \c std::foo is available only in some recent c++ versions (e.g., c++11), then the respective %Eigen function/method will be usable on standard types only if the compiler supports the required c++ version.
+
+<table class="manual-hl">
+<tr>
+<th>API</th><th>Description</th><th>Default scalar implementation</th><th>SIMD</th>
+</tr>
+<tr><td colspan="4"></td></tr>
+<tr><th colspan="4">Basic operations</th></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_abs
+  a.\link ArrayBase::abs abs\endlink(); \n
+  \link Eigen::abs abs\endlink(a); \n
+  m.\link MatrixBase::cwiseAbs cwiseAbs\endlink();
+  </td>
+  <td>absolute value (\f$ |a_i| \f$) </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/fabs">std::abs</a>; \n
+  abs(a[i]);
+  </td>
+  <td>SSE2, AVX (i32,f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_inverse
+  a.\link ArrayBase::inverse inverse\endlink(); \n
+  \link Eigen::inverse inverse\endlink(a); \n
+  m.\link MatrixBase::cwiseInverse cwiseInverse\endlink();
+  </td>
+  <td>inverse value (\f$ 1/a_i \f$) </td>
+  <td class="code">
+  1/a[i];
+  </td>
+  <td>All engines (f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_conj
+  a.\link ArrayBase::conjugate conjugate\endlink(); \n
+  \link Eigen::conj conj\endlink(a); \n
+  m.\link MatrixBase::conjugate conjugate\endlink();
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Complex_conjugate">complex conjugate</a> (\f$ \bar{a_i} \f$),\n
+  no-op for real </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/complex/conj">std::conj</a>; \n
+  conj(a[i]);
+  </td>
+  <td>All engines (cf,cd)</td>
+</tr>
+<tr>
+<th colspan="4">Exponential functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_exp
+  a.\link ArrayBase::exp exp\endlink(); \n
+  \link Eigen::exp exp\endlink(a);
+  </td>
+  <td>\f$ e \f$ raised to the given power (\f$ e^{a_i} \f$) </td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/exp">std::exp</a>; \n
+  exp(a[i]);
+  </td>
+  <td>SSE2, AVX (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log
+  a.\link ArrayBase::log log\endlink(); \n
+  \link Eigen::log log\endlink(a);
+  </td>
+  <td>natural (base \f$ e \f$) logarithm (\f$ \ln({a_i}) \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log">std::log</a>; \n
+  log(a[i]);
+  </td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log1p
+  a.\link ArrayBase::log1p log1p\endlink(); \n
+  \link Eigen::log1p log1p\endlink(a);
+  </td>
+  <td>natural (base \f$ e \f$) logarithm of 1 plus \n the given number (\f$ \ln({1+a_i}) \f$)</td>
+  <td>built-in generic implementation based on \c log,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/log1p">\c std::log1p </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_log10
+  a.\link ArrayBase::log10 log10\endlink(); \n
+  \link Eigen::log10 log10\endlink(a);
+  </td>
+  <td>base 10 logarithm (\f$ \log_{10}({a_i}) \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/log10">std::log10</a>; \n
+  log10(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Power functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_pow
+  a.\link ArrayBase::pow pow\endlink(b); \n
+  \link Eigen::pow pow\endlink(a,b);
+  </td>
+  <td>raises a number to the given power (\f$ a_i ^ {b_i} \f$) \n \c a and \c b can be either an array or scalar.</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/pow">std::pow</a>; \n
+  pow(a[i],b[i]);\n
+  (plus builtin for integer types)</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sqrt
+  a.\link ArrayBase::sqrt sqrt\endlink(); \n
+  \link Eigen::sqrt sqrt\endlink(a);\n
+  m.\link MatrixBase::cwiseSqrt cwiseSqrt\endlink();
+  </td>
+  <td>computes square root (\f$ \sqrt{a_i} \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  sqrt(a[i]);</td>
+  <td>SSE2, AVX (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_rsqrt
+  a.\link ArrayBase::rsqrt rsqrt\endlink(); \n
+  \link Eigen::rsqrt rsqrt\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Fast_inverse_square_root">reciprocal square root</a> (\f$ 1/\sqrt{a_i} \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sqrt">std::sqrt</a>; \n
+  1/sqrt(a[i]); \n
+  </td>
+  <td>SSE2, AVX, AltiVec, ZVector (f,d)\n
+  (approx + 1 Newton iteration)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_square
+  a.\link ArrayBase::square square\endlink(); \n
+  \link Eigen::square square\endlink(a);
+  </td>
+  <td>computes square power (\f$ a_i^2 \f$)</td>
+  <td class="code">
+  a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cube
+  a.\link ArrayBase::cube cube\endlink(); \n
+  \link Eigen::cube cube\endlink(a);
+  </td>
+  <td>computes cubic power (\f$ a_i^3 \f$)</td>
+  <td class="code">
+  a[i]*a[i]*a[i]</td>
+  <td>All (i32,f,d,cf,cd)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_abs2
+  a.\link ArrayBase::abs2 abs2\endlink(); \n
+  \link Eigen::abs2 abs2\endlink(a);\n
+  m.\link MatrixBase::cwiseAbs2 cwiseAbs2\endlink();
+  </td>
+  <td>computes the squared absolute value (\f$ |a_i|^2 \f$)</td>
+  <td class="code">
+  real:    a[i]*a[i] \n
+  complex:  real(a[i])*real(a[i]) \n
+  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; + imag(a[i])*imag(a[i])</td>
+  <td>All (i32,f,d)</td>
+</tr>
+<tr>
+<th colspan="4">Trigonometric functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sin
+  a.\link ArrayBase::sin sin\endlink(); \n
+  \link Eigen::sin sin\endlink(a);
+  </td>
+  <td>computes sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sin">std::sin</a>; \n
+  sin(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cos
+  a.\link ArrayBase::cos cos\endlink(); \n
+  \link Eigen::cos cos\endlink(a);
+  </td>
+  <td>computes cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cos">std::cos</a>; \n
+  cos(a[i]);</td>
+  <td>SSE2, AVX (f)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tan
+  a.\link ArrayBase::tan tan\endlink(); \n
+  \link Eigen::tan tan\endlink(a);
+  </td>
+  <td>computes tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tan">std::tan</a>; \n
+  tan(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_asin
+  a.\link ArrayBase::asin asin\endlink(); \n
+  \link Eigen::asin asin\endlink(a);
+  </td>
+  <td>computes arc sine (\f$ \sin^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/asin">std::asin</a>; \n
+  asin(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_acos
+  a.\link ArrayBase::acos acos\endlink(); \n
+  \link Eigen::acos acos\endlink(a);
+  </td>
+  <td>computes arc cosine  (\f$ \cos^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/acos">std::acos</a>; \n
+  acos(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_atan
+  a.\link ArrayBase::atan atan\endlink(); \n
+  \link Eigen::atan atan\endlink(a);
+  </td>
+  <td>computes arc tangent (\f$ \tan^{-1} a_i \f$)</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/atan">std::atan</a>; \n
+  atan(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Hyperbolic functions</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_sinh
+  a.\link ArrayBase::sinh sinh\endlink(); \n
+  \link Eigen::sinh sinh\endlink(a);
+  </td>
+  <td>computes hyperbolic sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/sinh">std::sinh</a>; \n
+  sinh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_cosh
+  a.\link ArrayBase::cosh cosh\endlink(); \n
+  \link Eigen::cosh cosh\endlink(a);
+  </td>
+  <td>computes hyperbolic cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/cosh">std::cosh</a>; \n
+  cosh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_tanh
+  a.\link ArrayBase::tanh tanh\endlink(); \n
+  \link Eigen::tanh tanh\endlink(a);
+  </td>
+  <td>computes hyperbolic tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/tanh">std::tanh</a>; \n
+  tanh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Nearest integer floating point operations</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_ceil
+  a.\link ArrayBase::ceil ceil\endlink(); \n
+  \link Eigen::ceil ceil\endlink(a);
+  </td>
+  <td>nearest integer not less than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/ceil">std::ceil</a>; \n
+  ceil(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_floor
+  a.\link ArrayBase::floor floor\endlink(); \n
+  \link Eigen::floor floor\endlink(a);
+  </td>
+  <td>nearest integer not greater than the given value</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/floor">std::floor</a>; \n
+  floor(a[i]);</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_round
+  a.\link ArrayBase::round round\endlink(); \n
+  \link Eigen::round round\endlink(a);
+  </td>
+  <td>nearest integer, \n rounding away from zero in halfway cases</td>
+  <td>built-in generic implementation \n based on \c floor and \c ceil,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/round">\c std::round </a>; \cpp11</td>
+  <td>SSE4,AVX,ZVector (f,d)</td>
+</tr>
+<tr>
+<th colspan="4">Floating point manipulation functions</th>
+</tr>
+<tr>
+<th colspan="4">Classification and comparison</th>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isfinite
+  a.\link ArrayBase::isfinite isfinite\endlink(); \n
+  \link Eigen::isfinite isfinite\endlink(a);
+  </td>
+  <td>checks if the given number has finite value</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isfinite">\c std::isfinite </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isinf
+  a.\link ArrayBase::isinf isinf\endlink(); \n
+  \link Eigen::isinf isinf\endlink(a);
+  </td>
+  <td>checks if the given number is infinite</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isinf">\c std::isinf </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_isnan
+  a.\link ArrayBase::isnan isnan\endlink(); \n
+  \link Eigen::isnan isnan\endlink(a);
+  </td>
+  <td>checks if the given number is not a number</td>
+  <td>built-in generic implementation,\n
+  plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/isnan">\c std::isnan </a>; \cpp11</td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Error and gamma functions</th>
+</tr>
+<tr> <td colspan="4">  Require \c #include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_erf
+  a.\link ArrayBase::erf erf\endlink(); \n
+  \link Eigen::erf erf\endlink(a);
+  </td>
+  <td>error function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/erf">std::erf</a>; \cpp11 \n
+  erf(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_erfc
+  a.\link ArrayBase::erfc erfc\endlink(); \n
+  \link Eigen::erfc erfc\endlink(a);
+  </td>
+  <td>complementary error function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/erfc">std::erfc</a>; \cpp11 \n
+  erfc(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_lgamma
+  a.\link ArrayBase::lgamma lgamma\endlink(); \n
+  \link Eigen::lgamma lgamma\endlink(a);
+  </td>
+  <td>natural logarithm of the gamma function</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/lgamma">std::lgamma</a>; \cpp11 \n
+  lgamma(a[i]);
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_digamma
+  a.\link ArrayBase::digamma digamma\endlink(); \n
+  \link Eigen::digamma digamma\endlink(a);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Digamma_function">logarithmic derivative of the gamma function</a></td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_igamma
+  \link Eigen::igamma igamma\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">lower incomplete gamma integral</a>
+  \n \f$ \gamma(a_i,x_i)= \frac{1}{|a_i|} \int_{0}^{x_i}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_igammac
+  \link Eigen::igammac igammac\endlink(a,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Incomplete_gamma_function">upper incomplete gamma integral</a>
+  \n \f$ \Gamma(a_i,x_i) = \frac{1}{|a_i|} \int_{x_i}^{\infty}e^{\text{-}t} t^{a_i-1} \mathrm{d} t \f$</td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+<th colspan="4">Special functions</th>
+</tr>
+<tr> <td colspan="4">  Require \c #include \c <unsupported/Eigen/SpecialFunctions> </td></tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_polygamma
+  \link Eigen::polygamma polygamma\endlink(n,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Polygamma_function">n-th derivative of digamma at x</a></td>
+  <td>
+  built-in generic based on\n <a href="#cwisetable_lgamma">\c lgamma </a>,
+  <a href="#cwisetable_digamma"> \c digamma </a>
+  and <a href="#cwisetable_zeta">\c zeta </a>.
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_betainc
+  \link Eigen::betainc betainc\endlink(a,b,x);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function">Incomplete beta function</a></td>
+  <td>
+  built-in for float and double,\n but requires \cpp11
+  </td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_zeta
+  \link Eigen::zeta zeta\endlink(a,b);
+  </td>
+  <td><a href="https://en.wikipedia.org/wiki/Hurwitz_zeta_function">Hurwitz zeta function</a>
+  \n \f$ \zeta(a_i,b_i)=\sum_{k=0}^{\infty}(b_i+k)^{\text{-}a_i} \f$</td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
+<tr><td colspan="4"></td></tr>
+</table>
+
+\n
+
+*/
+
+}
\ No newline at end of file
diff --git a/doc/CustomizingEigen.dox b/doc/CustomizingEigen.dox
deleted file mode 100644
index cb25f4ec9..000000000
--- a/doc/CustomizingEigen.dox
+++ /dev/null
@@ -1,226 +0,0 @@
-namespace Eigen {
-
-/** \page TopicCustomizingEigen Customizing/Extending Eigen
-
-Eigen can be extended in several ways, for instance, by defining global methods, \ref ExtendingMatrixBase "by adding custom methods to MatrixBase", adding support to \ref CustomScalarType "custom types" etc.
-
-\eigenAutoToc
-
-\section ExtendingMatrixBase Extending MatrixBase (and other classes)
-
-In this section we will see how to add custom methods to MatrixBase. Since all expressions and matrix types inherit MatrixBase, adding a method to MatrixBase make it immediately available to all expressions ! A typical use case is, for instance, to make Eigen compatible with another API.
-
-You certainly know that in C++ it is not possible to add methods to an existing class. So how that's possible ? Here the trick is to include in the declaration of MatrixBase a file defined by the preprocessor token \c EIGEN_MATRIXBASE_PLUGIN:
-\code
-class MatrixBase {
-  // ...
-  #ifdef EIGEN_MATRIXBASE_PLUGIN
-  #include EIGEN_MATRIXBASE_PLUGIN
-  #endif
-};
-\endcode
-Therefore to extend MatrixBase with your own methods you just have to create a file with your method declaration and define EIGEN_MATRIXBASE_PLUGIN before you include any Eigen's header file.
-
-You can extend many of the other classes used in Eigen by defining similarly named preprocessor symbols. For instance, define \c EIGEN_ARRAYBASE_PLUGIN if you want to extend the ArrayBase class. A full list of classes that can be extended in this way and the corresponding preprocessor symbols can be found on our page \ref TopicPreprocessorDirectives.
-
-Here is an example of an extension file for adding methods to MatrixBase: \n
-\b MatrixBaseAddons.h
-\code
-inline Scalar at(uint i, uint j) const { return this->operator()(i,j); }
-inline Scalar& at(uint i, uint j) { return this->operator()(i,j); }
-inline Scalar at(uint i) const { return this->operator[](i); }
-inline Scalar& at(uint i) { return this->operator[](i); }
-
-inline RealScalar squaredLength() const { return squaredNorm(); }
-inline RealScalar length() const { return norm(); }
-inline RealScalar invLength(void) const { return fast_inv_sqrt(squaredNorm()); }
-
-template<typename OtherDerived>
-inline Scalar squaredDistanceTo(const MatrixBase<OtherDerived>& other) const
-{ return (derived() - other.derived()).squaredNorm(); }
-
-template<typename OtherDerived>
-inline RealScalar distanceTo(const MatrixBase<OtherDerived>& other) const
-{ return internal::sqrt(derived().squaredDistanceTo(other)); }
-
-inline void scaleTo(RealScalar l) { RealScalar vl = norm(); if (vl>1e-9) derived() *= (l/vl); }
-
-inline Transpose<Derived> transposed() {return this->transpose();}
-inline const Transpose<Derived> transposed() const {return this->transpose();}
-
-inline uint minComponentId(void) const  { int i; this->minCoeff(&i); return i; }
-inline uint maxComponentId(void) const  { int i; this->maxCoeff(&i); return i; }
-
-template<typename OtherDerived>
-void makeFloor(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMin(other.derived()); }
-template<typename OtherDerived>
-void makeCeil(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMax(other.derived()); }
-
-const CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>
-operator+(const Scalar& scalar) const
-{ return CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>(derived(), internal::scalar_add_op<Scalar>(scalar)); }
-
-friend const CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>
-operator+(const Scalar& scalar, const MatrixBase<Derived>& mat)
-{ return CwiseUnaryOp<internal::scalar_add_op<Scalar>, Derived>(mat.derived(), internal::scalar_add_op<Scalar>(scalar)); }
-\endcode
-
-Then one can the following declaration in the config.h or whatever prerequisites header file of his project:
-\code
-#define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h"
-\endcode
-
-\section InheritingFromMatrix Inheriting from Matrix
-
-Before inheriting from Matrix, be really, I mean REALLY, sure that using
-EIGEN_MATRIX_PLUGIN is not what you really want (see previous section).
-If you just need to add few members to Matrix, this is the way to go.
-
-An example of when you actually need to inherit Matrix, is when you
-have several layers of heritage such as 
-MyVerySpecificVector1, MyVerySpecificVector2 -> MyVector1 -> Matrix and
-MyVerySpecificVector3, MyVerySpecificVector4 -> MyVector2 -> Matrix.
-
-In order for your object to work within the %Eigen framework, you need to
-define a few members in your inherited class.
-
-Here is a minimalistic example:
-
-\include CustomizingEigen_Inheritance.cpp
-
-Output: \verbinclude CustomizingEigen_Inheritance.out
-
-This is the kind of error you can get if you don't provide those methods
-\verbatim
-error: no match for ‘operator=’ in ‘v = Eigen::operator*(
-const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1, 0, -0x000000001, 1> >::Scalar&, 
-const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
-(((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
-((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType*)(& v))))’
-\endverbatim
-
-\anchor user_defined_scalars \section CustomScalarType Using custom scalar types
-
-By default, Eigen currently supports standard floating-point types (\c float, \c double, \c std::complex<float>, \c std::complex<double>, \c long \c double), as well as all native integer types (e.g., \c int, \c unsigned \c int, \c short, etc.), and \c bool.
-On x86-64 systems, \c long \c double permits to locally enforces the use of x87 registers with extended accuracy (in comparison to SSE).
-
-In order to add support for a custom type \c T you need:
--# make sure the common operator (+,-,*,/,etc.) are supported by the type \c T
--# add a specialization of struct Eigen::NumTraits<T> (see \ref NumTraits)
--# define the math functions that makes sense for your type. This includes standard ones like sqrt, pow, sin, tan, conj, real, imag, etc, as well as abs2 which is Eigen specific.
-     (see the file Eigen/src/Core/MathFunctions.h)
-
-The math function should be defined in the same namespace than \c T, or in the \c std namespace though that second approach is not recommended.
-
-Here is a concrete example adding support for the Adolc's \c adouble type. <a href="https://projects.coin-or.org/ADOL-C">Adolc</a> is an automatic differentiation library. The type \c adouble is basically a real value tracking the values of any number of partial derivatives.
-
-\code
-#ifndef ADOLCSUPPORT_H
-#define ADOLCSUPPORT_H
-
-#define ADOLC_TAPELESS
-#include <adolc/adouble.h>
-#include <Eigen/Core>
-
-namespace Eigen {
-
-template<> struct NumTraits<adtl::adouble>
- : NumTraits<double> // permits to get the epsilon, dummy_precision, lowest, highest functions
-{
-  typedef adtl::adouble Real;
-  typedef adtl::adouble NonInteger;
-  typedef adtl::adouble Nested;
-
-  enum {
-    IsComplex = 0,
-    IsInteger = 0,
-    IsSigned = 1,
-    RequireInitialization = 1,
-    ReadCost = 1,
-    AddCost = 3,
-    MulCost = 3
-  };
-};
-
-}
-
-namespace adtl {
-
-inline const adouble& conj(const adouble& x)  { return x; }
-inline const adouble& real(const adouble& x)  { return x; }
-inline adouble imag(const adouble&)    { return 0.; }
-inline adouble abs(const adouble&  x)  { return fabs(x); }
-inline adouble abs2(const adouble& x)  { return x*x; }
-
-}
-
-#endif // ADOLCSUPPORT_H
-\endcode
-
-This other example adds support for the \c mpq_class type from <a href="https://gmplib.org/">GMP</a>. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types.
-
-\code
-#include <gmpxx.h>
-#include <Eigen/Core>
-#include <boost/operators.hpp>
-
-namespace Eigen {
-  template<class> struct NumTraits;
-  template<> struct NumTraits<mpq_class>
-  {
-    typedef mpq_class Real;
-    typedef mpq_class NonInteger;
-    typedef mpq_class Nested;
-
-    static inline Real epsilon() { return 0; }
-    static inline Real dummy_precision() { return 0; }
-
-    enum {
-      IsInteger = 0,
-      IsSigned = 1,
-      IsComplex = 0,
-      RequireInitialization = 1,
-      ReadCost = 6,
-      AddCost = 150,
-      MulCost = 100
-    };
-  };
-
-  namespace internal {
-    template<>
-      struct significant_decimals_impl<mpq_class>
-      {
-	// Infinite precision when printing
-	static inline int run() { return 0; }
-      };
-
-    template<> struct scalar_score_coeff_op<mpq_class> {
-      struct result_type : boost::totally_ordered1<result_type> {
-	std::size_t len;
-	result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
-	result_type(mpq_class const& q) :
-	  len(mpz_size(q.get_num_mpz_t())+
-	      mpz_size(q.get_den_mpz_t())-1) {}
-	friend bool operator<(result_type x, result_type y) {
-	  // 0 is the worst possible pivot
-	  if (x.len == 0) return y.len > 0;
-	  if (y.len == 0) return false;
-	  // Prefer a pivot with a small representation
-	  return x.len > y.len;
-	}
-	friend bool operator==(result_type x, result_type y) {
-	  // Only used to test if the score is 0
-	  return x.len == y.len;
-	}
-      };
-      result_type operator()(mpq_class const& x) const { return x; }
-    };
-  }
-}
-\endcode
-
-\sa \ref TopicPreprocessorDirectives
-
-*/
-
-}
diff --git a/doc/CustomizingEigen_CustomScalar.dox b/doc/CustomizingEigen_CustomScalar.dox
new file mode 100644
index 000000000..1ee78cbe5
--- /dev/null
+++ b/doc/CustomizingEigen_CustomScalar.dox
@@ -0,0 +1,120 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_CustomScalar Using custom scalar types
+\anchor user_defined_scalars
+
+By default, Eigen currently supports standard floating-point types (\c float, \c double, \c std::complex<float>, \c std::complex<double>, \c long \c double), as well as all native integer types (e.g., \c int, \c unsigned \c int, \c short, etc.), and \c bool.
+On x86-64 systems, \c long \c double makes it possible to locally enforce the use of x87 registers with extended accuracy (in comparison to SSE).
+
+In order to add support for a custom type \c T you need to:
+-# make sure the common operators (+,-,*,/,etc.) are supported by the type \c T
+-# add a specialization of struct Eigen::NumTraits<T> (see \ref NumTraits)
+-# define the math functions that make sense for your type. This includes standard ones like sqrt, pow, sin, tan, conj, real, imag, etc, as well as abs2 which is Eigen specific.
+     (see the file Eigen/src/Core/MathFunctions.h)
+
+The math functions should be defined in the same namespace as \c T, or in the \c std namespace, though that second approach is not recommended.
+
+Here is a concrete example adding support for Adolc's \c adouble type. <a href="https://projects.coin-or.org/ADOL-C">Adolc</a> is an automatic differentiation library. The type \c adouble is basically a real value tracking the values of any number of partial derivatives.
+
+\code
+#ifndef ADOLCSUPPORT_H
+#define ADOLCSUPPORT_H
+
+#define ADOLC_TAPELESS
+#include <adolc/adouble.h>
+#include <Eigen/Core>
+
+namespace Eigen {
+
+template<> struct NumTraits<adtl::adouble>
+ : NumTraits<double> // permits to get the epsilon, dummy_precision, lowest, highest functions
+{
+  typedef adtl::adouble Real;
+  typedef adtl::adouble NonInteger;
+  typedef adtl::adouble Nested;
+
+  enum {
+    IsComplex = 0,
+    IsInteger = 0,
+    IsSigned = 1,
+    RequireInitialization = 1,
+    ReadCost = 1,
+    AddCost = 3,
+    MulCost = 3
+  };
+};
+
+}
+
+namespace adtl {
+
+inline const adouble& conj(const adouble& x)  { return x; }
+inline const adouble& real(const adouble& x)  { return x; }
+inline adouble imag(const adouble&)    { return 0.; }
+inline adouble abs(const adouble&  x)  { return fabs(x); }
+inline adouble abs2(const adouble& x)  { return x*x; }
+
+}
+
+#endif // ADOLCSUPPORT_H
+\endcode
+
+This other example adds support for the \c mpq_class type from <a href="https://gmplib.org/">GMP</a>. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types.
+
+\code
+#include <gmpxx.h>
+#include <Eigen/Core>
+#include <boost/operators.hpp>
+
+namespace Eigen {
+  template<> struct NumTraits<mpq_class> : GenericNumTraits<mpq_class>
+  {
+    typedef mpq_class Real;
+    typedef mpq_class NonInteger;
+    typedef mpq_class Nested;
+
+    static inline Real epsilon() { return 0; }
+    static inline Real dummy_precision() { return 0; }
+    static inline Real digits10() { return 0; }
+
+    enum {
+      IsInteger = 0,
+      IsSigned = 1,
+      IsComplex = 0,
+      RequireInitialization = 1,
+      ReadCost = 6,
+      AddCost = 150,
+      MulCost = 100
+    };
+  };
+
+  namespace internal {
+
+    template<> struct scalar_score_coeff_op<mpq_class> {
+      struct result_type : boost::totally_ordered1<result_type> {
+        std::size_t len;
+        result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
+        result_type(mpq_class const& q) :
+          len(mpz_size(q.get_num_mpz_t())+
+              mpz_size(q.get_den_mpz_t())-1) {}
+        friend bool operator<(result_type x, result_type y) {
+          // 0 is the worst possible pivot
+          if (x.len == 0) return y.len > 0;
+          if (y.len == 0) return false;
+          // Prefer a pivot with a small representation
+          return x.len > y.len;
+        }
+        friend bool operator==(result_type x, result_type y) {
+          // Only used to test if the score is 0
+          return x.len == y.len;
+        }
+      };
+      result_type operator()(mpq_class const& x) const { return x; }
+    };
+  }
+}
+\endcode
+
+*/
+
+}
diff --git a/doc/CustomizingEigen_InheritingMatrix.dox b/doc/CustomizingEigen_InheritingMatrix.dox
new file mode 100644
index 000000000..b21e55433
--- /dev/null
+++ b/doc/CustomizingEigen_InheritingMatrix.dox
@@ -0,0 +1,34 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_InheritingMatrix Inheriting from Matrix
+
+Before inheriting from Matrix, be really, I mean REALLY, sure that using
+EIGEN_MATRIX_PLUGIN is not what you really want (see previous section).
+If you just need to add a few members to Matrix, the plugin mechanism is the way to go.
+
+An example of when you actually need to inherit from Matrix is when you
+have several layers of inheritance such as
+MyVerySpecificVector1, MyVerySpecificVector2 -> MyVector1 -> Matrix and
+MyVerySpecificVector3, MyVerySpecificVector4 -> MyVector2 -> Matrix.
+
+In order for your object to work within the %Eigen framework, you need to
+define a few members in your inherited class.
+
+Here is a minimalistic example:
+
+\include CustomizingEigen_Inheritance.cpp
+
+Output: \verbinclude CustomizingEigen_Inheritance.out
+
+This is the kind of error you can get if you don't provide those methods
+\verbatim
+error: no match for ‘operator=’ in ‘v = Eigen::operator*(
+const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1, 0, -0x000000001, 1> >::Scalar&, 
+const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
+(((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType&)
+((const Eigen::MatrixBase<Eigen::Matrix<double, -0x000000001, 1> >::StorageBaseType*)(& v))))’
+\endverbatim
+
+*/
+
+}
diff --git a/doc/CustomizingEigen_NullaryExpr.dox b/doc/CustomizingEigen_NullaryExpr.dox
new file mode 100644
index 000000000..37c8dcd89
--- /dev/null
+++ b/doc/CustomizingEigen_NullaryExpr.dox
@@ -0,0 +1,86 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_NullaryExpr Matrix manipulation via nullary-expressions
+
+
+The main purpose of the class CwiseNullaryOp is to define \em procedural matrices such as constant or random matrices as returned by the Ones(), Zero(), Constant(), Identity() and Random() methods.
+Nevertheless, with some imagination it is possible to accomplish very sophisticated matrix manipulation with minimal effort, such that \ref TopicNewExpressionType "implementing a new expression" is rarely needed.
+
+\section NullaryExpr_Circulant Example 1: circulant matrix
+
+To explore these possibilities let us start with the  \em circulant example of the \ref TopicNewExpressionType "implementing new expression" topic.
+Let us recall that a circulant matrix is a matrix where each column is the same as the
+column to the left, except that it is cyclically shifted downwards.
+For example, here is a 4-by-4 circulant matrix:
+\f[ \begin{bmatrix}
+    1 & 8 & 4 & 2 \\
+    2 & 1 & 8 & 4 \\
+    4 & 2 & 1 & 8 \\
+    8 & 4 & 2 & 1
+\end{bmatrix} \f]
+A circulant matrix is uniquely determined by its first column. We wish
+to write a function \c makeCirculant which, given the first column,
+returns an expression representing the circulant matrix.
+
+For this exercise, the return type of \c makeCirculant will be a CwiseNullaryOp that we need to instantiate with:
+-# a proper \c circulant_functor storing the input vector and implementing the adequate coefficient accessor \c operator()(i,j)
+-# a template instantiation of class Matrix conveying compile-time information such as the scalar type, sizes, and preferred storage layout.
+
+Calling \c ArgType the type of the input vector, we can construct the equivalent square Matrix type as follows:
+
+\snippet make_circulant2.cpp square
+
+This little helper structure will help us to implement our \c makeCirculant function as follows:
+
+\snippet make_circulant2.cpp makeCirculant
+
+As usual, our function takes as argument a \c MatrixBase (see this \ref TopicFunctionTakingEigenTypes "page" for more details).
+Then, the CwiseNullaryOp object is constructed through the DenseBase::NullaryExpr static method with the adequate runtime sizes.
+
+Then, we need to implement our \c circulant_functor, which is a straightforward exercise:
+
+\snippet make_circulant2.cpp circulant_func
+
+We are now all set to try our new feature:
+
+\snippet make_circulant2.cpp main
+
+
+If all the fragments are combined, the following output is produced,
+showing that the program works as expected:
+
+\include make_circulant2.out
+
+This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
+
+
+\section NullaryExpr_Indexing Example 2: indexing rows and columns
+
+The goal here is to mimic MatLab's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
+
+\snippet nullary_indexing.out main1
+
+To this end, let us first write a nullary-functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
+
+\snippet nullary_indexing.cpp functor
+
+Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:
+
+\snippet nullary_indexing.cpp function
+
+Finally, here is an example of how this function can be used:
+
+\snippet nullary_indexing.cpp main1
+
+This straightforward implementation is already quite powerful as the row or column index arrays can also be expressions to perform offsetting, modulo, striding, reverse, etc.
+
+\snippet nullary_indexing.cpp main2
+
+and the output is:
+
+\snippet nullary_indexing.out main2
+
+*/
+
+}
+
diff --git a/doc/CustomizingEigen_Plugins.dox b/doc/CustomizingEigen_Plugins.dox
new file mode 100644
index 000000000..d88f2409b
--- /dev/null
+++ b/doc/CustomizingEigen_Plugins.dox
@@ -0,0 +1,69 @@
+namespace Eigen {
+
+/** \page TopicCustomizing_Plugins Extending MatrixBase (and other classes)
+
+In this section we will see how to add custom methods to MatrixBase. Since all expressions and matrix types inherit MatrixBase, adding a method to MatrixBase makes it immediately available to all expressions! A typical use case is, for instance, to make Eigen compatible with another API.
+
+You certainly know that in C++ it is not possible to add methods to an existing class. So how is that possible? Here the trick is to include in the declaration of MatrixBase a file defined by the preprocessor token \c EIGEN_MATRIXBASE_PLUGIN:
+\code
+class MatrixBase {
+  // ...
+  #ifdef EIGEN_MATRIXBASE_PLUGIN
+  #include EIGEN_MATRIXBASE_PLUGIN
+  #endif
+};
+\endcode
+Therefore, to extend MatrixBase with your own methods you just have to create a file with your method declarations and define EIGEN_MATRIXBASE_PLUGIN before you include any Eigen header file.
+
+You can extend many of the other classes used in Eigen by defining similarly named preprocessor symbols. For instance, define \c EIGEN_ARRAYBASE_PLUGIN if you want to extend the ArrayBase class. A full list of classes that can be extended in this way and the corresponding preprocessor symbols can be found on our page \ref TopicPreprocessorDirectives.
+
+Here is an example of an extension file for adding methods to MatrixBase: \n
+\b MatrixBaseAddons.h
+\code
+inline Scalar at(uint i, uint j) const { return this->operator()(i,j); }
+inline Scalar& at(uint i, uint j) { return this->operator()(i,j); }
+inline Scalar at(uint i) const { return this->operator[](i); }
+inline Scalar& at(uint i) { return this->operator[](i); }
+
+inline RealScalar squaredLength() const { return squaredNorm(); }
+inline RealScalar length() const { return norm(); }
+inline RealScalar invLength(void) const { return fast_inv_sqrt(squaredNorm()); }
+
+template<typename OtherDerived>
+inline Scalar squaredDistanceTo(const MatrixBase<OtherDerived>& other) const
+{ return (derived() - other.derived()).squaredNorm(); }
+
+template<typename OtherDerived>
+inline RealScalar distanceTo(const MatrixBase<OtherDerived>& other) const
+{ return internal::sqrt(derived().squaredDistanceTo(other)); }
+
+inline void scaleTo(RealScalar l) { RealScalar vl = norm(); if (vl>1e-9) derived() *= (l/vl); }
+
+inline Transpose<Derived> transposed() {return this->transpose();}
+inline const Transpose<Derived> transposed() const {return this->transpose();}
+
+inline uint minComponentId(void) const  { int i; this->minCoeff(&i); return i; }
+inline uint maxComponentId(void) const  { int i; this->maxCoeff(&i); return i; }
+
+template<typename OtherDerived>
+void makeFloor(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMin(other.derived()); }
+template<typename OtherDerived>
+void makeCeil(const MatrixBase<OtherDerived>& other) { derived() = derived().cwiseMax(other.derived()); }
+
+const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const ConstantReturnType>
+operator+(const Scalar& scalar) const
+{ return CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const ConstantReturnType>(derived(), Constant(rows(),cols(),scalar)); }
+
+friend const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ConstantReturnType, Derived>
+operator+(const Scalar& scalar, const MatrixBase<Derived>& mat)
+{ return CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ConstantReturnType, Derived>(Constant(rows(),cols(),scalar), mat.derived()); }
+\endcode
+
+Then one can add the following declaration in the config.h or whatever prerequisites header file of their project:
+\code
+#define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h"
+\endcode
+
+*/
+
+}
diff --git a/doc/DenseDecompositionBenchmark.dox b/doc/DenseDecompositionBenchmark.dox
new file mode 100644
index 000000000..7be9c70cd
--- /dev/null
+++ b/doc/DenseDecompositionBenchmark.dox
@@ -0,0 +1,42 @@
+namespace Eigen {
+
+/** \eigenManualPage DenseDecompositionBenchmark Benchmark of dense decompositions
+
+This page presents a speed comparison of the dense matrix decompositions offered by %Eigen for a wide range of square matrices and overconstrained problems.
+
+For a more general overview on the features and numerical robustness of linear solvers and decompositions, check this \link TopicLinearAlgebraDecompositions table \endlink.
+
+This benchmark has been run on a laptop equipped with an Intel Core i7 \@ 2.6 GHz, and compiled with clang with \b AVX and \b FMA instruction sets enabled but without multi-threading.
+It uses \b single \b precision \b float numbers. For double, you can get a good estimate by multiplying the timings by a factor of 2.
+
+The square matrices are symmetric, and for the overconstrained matrices, the reported timings include the cost to compute the symmetric covariance matrix \f$ A^T A \f$ for the first four solvers based on Cholesky and LU, as denoted by the \b * symbol (top-right corner part of the table).
+Timings are in \b milliseconds, and factors are relative to the LLT decomposition which is the fastest but also the least general and robust.
+
+<table class="manual">
+<tr><th>solver/size</th>
+  <th>8x8</th>  <th>100x100</th>  <th>1000x1000</th>  <th>4000x4000</th>  <th>10000x8</th>  <th>10000x100</th>  <th>10000x1000</th>  <th>10000x4000</th></tr>
+<tr><td>LLT</td><td>0.05</td><td>0.42</td><td>5.83</td><td>374.55</td><td>6.79 <sup><a href="#note_ls">*</a></sup></td><td>30.15 <sup><a href="#note_ls">*</a></sup></td><td>236.34 <sup><a href="#note_ls">*</a></sup></td><td>3847.17 <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>LDLT</td><td>0.07 (x1.3)</td><td>0.65 (x1.5)</td><td>26.86 (x4.6)</td><td>2361.18 (x6.3)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.91 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>252.61 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>5807.66 (x1.5) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr><td>PartialPivLU</td><td>0.08 (x1.5)</td><td>0.69 (x1.6)</td><td>15.63 (x2.7)</td><td>709.32 (x1.9)</td><td>6.81 (x1) <sup><a href="#note_ls">*</a></sup></td><td>31.32 (x1) <sup><a href="#note_ls">*</a></sup></td><td>241.68 (x1) <sup><a href="#note_ls">*</a></sup></td><td>4270.48 (x1.1) <sup><a href="#note_ls">*</a></sup></td></tr>
+<tr class="alt"><td>FullPivLU</td><td>0.1 (x1.9)</td><td>4.48 (x10.6)</td><td>281.33 (x48.2)</td><td>-</td><td>6.83 (x1) <sup><a href="#note_ls">*</a></sup></td><td>32.67 (x1.1) <sup><a href="#note_ls">*</a></sup></td><td>498.25 (x2.1) <sup><a href="#note_ls">*</a></sup></td><td>-</td></tr>
+<tr><td>HouseholderQR</td><td>0.19 (x3.5)</td><td>2.18 (x5.2)</td><td>23.42 (x4)</td><td>1337.52 (x3.6)</td><td>34.26 (x5)</td><td>129.01 (x4.3)</td><td>377.37 (x1.6)</td><td>4839.1 (x1.3)</td></tr>
+<tr class="alt"><td>ColPivHouseholderQR</td><td>0.23 (x4.3)</td><td>2.23 (x5.3)</td><td>103.34 (x17.7)</td><td>9987.16 (x26.7)</td><td>36.05 (x5.3)</td><td>163.18 (x5.4)</td><td>2354.08 (x10)</td><td>37860.5 (x9.8)</td></tr>
+<tr><td>CompleteOrthogonalDecomposition</td><td>0.23 (x4.3)</td><td>2.22 (x5.2)</td><td>99.44 (x17.1)</td><td>10555.3 (x28.2)</td><td>35.75 (x5.3)</td><td>169.39 (x5.6)</td><td>2150.56 (x9.1)</td><td>36981.8 (x9.6)</td></tr>
+<tr class="alt"><td>FullPivHouseholderQR</td><td>0.23 (x4.3)</td><td>4.64 (x11)</td><td>289.1 (x49.6)</td><td>-</td><td>69.38 (x10.2)</td><td>446.73 (x14.8)</td><td>4852.12 (x20.5)</td><td>-</td></tr>
+<tr><td>JacobiSVD</td><td>1.01 (x18.6)</td><td>71.43 (x168.4)</td><td>-</td><td>-</td><td>113.81 (x16.7)</td><td>1179.66 (x39.1)</td><td>-</td><td>-</td></tr>
+<tr class="alt"><td>BDCSVD</td><td>1.07 (x19.7)</td><td>21.83 (x51.5)</td><td>331.77 (x56.9)</td><td>18587.9 (x49.6)</td><td>110.53 (x16.3)</td><td>397.67 (x13.2)</td><td>2975 (x12.6)</td><td>48593.2 (x12.6)</td></tr>
+</table>
+
+<a name="note_ls">\b *: </a> These decompositions do not support direct least-squares solving for over-constrained problems, and the reported timings include the cost to form the symmetric covariance matrix \f$ A^T A \f$.
+
+\b Observations:
+ + LLT is always the fastest solver.
+ + For largely over-constrained problems, the cost of Cholesky/LU decompositions is dominated by the computation of the symmetric covariance matrix.
+ + For large problem sizes, only the decompositions implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explains why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
+ + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
+
+The above table has been generated by the <a href="https://bitbucket.org/eigen/eigen/raw/default/bench/dense_solvers.cpp">bench/dense_solvers.cpp</a> file, feel free to hack it to generate a table matching your hardware, compiler, and favorite problem sizes.
+
+*/
+
+}
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 0a43c7c4e..e9b116d28 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -125,7 +125,7 @@ ALWAYS_DETAILED_SEC    = NO
 # members were ordinary class members. Constructors, destructors and assignment
 # operators of the base classes will not be shown.
 
-INLINE_INHERITED_MEMB  = YES
+INLINE_INHERITED_MEMB  = NO
 
 # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
 # path before files name in the file list and in the header files. If set
@@ -216,6 +216,7 @@ ALIASES                = "only_for_vectors=This is only for vectors (either row-
                          "lu_module=This is defined in the %LU module. \code #include <Eigen/LU> \endcode" \
                          "qr_module=This is defined in the %QR module. \code #include <Eigen/QR> \endcode" \
                          "svd_module=This is defined in the %SVD module. \code #include <Eigen/SVD> \endcode" \
+                         "specialfunctions_module=This is defined in the \b unsupported SpecialFunctions module. \code #include <unsupported/Eigen/SpecialFunctions> \endcode" \
                          "label=\bug" \
                          "matrixworld=<a href='#matrixonly' style='color:green;text-decoration: none;'>*</a>" \
                          "arrayworld=<a href='#arrayonly' style='color:blue;text-decoration: none;'>*</a>" \
@@ -225,7 +226,10 @@ ALIASES                = "only_for_vectors=This is only for vectors (either row-
                          "note_try_to_help_rvo=This function returns the result by value. In order to make that efficient, it is implemented as just a return statement using a special constructor, hopefully allowing the compiler to perform a RVO (return value optimization)." \
                          "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\"" \
                          "implsparsesolverconcept=This class follows the \link TutorialSparseSolverConcept sparse solver concept \endlink." \
-                         "blank= "
+                         "blank= " \
+                         "cpp11=<span class='cpp11'>[c++11]</span>" \
+                         "cpp14=<span class='cpp14'>[c++14]</span>" \
+                         "cpp17=<span class='cpp17'>[c++17]</span>"
                          
 
 ALIASES += "eigenAutoToc=  "
@@ -1587,7 +1591,8 @@ PREDEFINED             = EIGEN_EMPTY_STRUCT \
                          EIGEN_STRONG_INLINE=inline \
                          EIGEN_DEVICE_FUNC= \
                          "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template<typename OtherDerived> const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const;" \
-                         "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<typename LHS::Scalar, typename RHS::Scalar >, const LHS, const RHS>"
+                         "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<typename LHS::Scalar, typename RHS::Scalar >, const LHS, const RHS>"\
+                         DOXCOMMA=,
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
 # this tag can be used to specify a list of macro names that should be expanded.
@@ -1602,7 +1607,15 @@ EXPAND_AS_DEFINED      = EIGEN_MAKE_TYPEDEFS \
                          EIGEN_CWISE_BINOP_RETURN_TYPE \
                          EIGEN_CURRENT_STORAGE_BASE_CLASS \
                          EIGEN_MATHFUNC_IMPL \
-                         _EIGEN_GENERIC_PUBLIC_INTERFACE
+                         _EIGEN_GENERIC_PUBLIC_INTERFACE \
+                         EIGEN_ARRAY_DECLARE_GLOBAL_UNARY \
+                         EIGEN_EMPTY \
+                         EIGEN_EULER_ANGLES_TYPEDEFS \
+                         EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
+                         EIGEN_EULER_SYSTEM_TYPEDEF \
+                         EIGEN_DOC_UNARY_ADDONS \
+                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
+                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
 # doxygen's preprocessor will remove all references to function-like macros
diff --git a/doc/InplaceDecomposition.dox b/doc/InplaceDecomposition.dox
new file mode 100644
index 000000000..cb1c6d413
--- /dev/null
+++ b/doc/InplaceDecomposition.dox
@@ -0,0 +1,115 @@
+namespace Eigen {
+
+/** \eigenManualPage InplaceDecomposition Inplace matrix decompositions
+
+Starting from %Eigen 3.3, the LU, Cholesky, and QR decompositions can operate \em inplace, that is, directly within the given input matrix.
+This feature is especially useful when dealing with huge matrices, and/or when the available memory is very limited (embedded systems).
+
+To this end, the respective decomposition class must be instantiated with a Ref<> matrix type, and the decomposition object must be constructed with the input matrix as argument. As an example, let us consider an inplace LU decomposition with partial pivoting.
+
+Let's start with the basic inclusions, and declaration of a 2x2 matrix \c A:
+
+<table class="example">
+<tr><th>code</th><th>output</th></tr>
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp init
+  </td>
+  <td>\snippet TutorialInplaceLU.out init
+  </td>
+</tr>
+</table>
+
+No surprise here! Then, let's declare our inplace LU object \c lu, and check the content of the matrix \c A:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp declaration
+  </td>
+  <td>\snippet TutorialInplaceLU.out declaration
+  </td>
+</tr>
+</table>
+
+Here, the \c lu object computes and stores the \c L and \c U factors within the memory held by the matrix \c A.
+The coefficients of \c A have thus been destroyed during the factorization, and replaced by the L and U factors as one can verify:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp matrixLU
+  </td>
+  <td>\snippet TutorialInplaceLU.out matrixLU
+  </td>
+</tr>
+</table>
+
+Then, one can use the \c lu object as usual, for instance to solve the Ax=b problem:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp solve
+  </td>
+  <td>\snippet TutorialInplaceLU.out solve
+  </td>
+</tr>
+</table>
+
+Here, since the content of the original matrix \c A has been lost, we had to declare a new matrix \c A0 to verify the result.
+
+Since the memory is shared between \c A and \c lu, modifying the matrix \c A will make \c lu invalid.
+This can easily be verified by modifying the content of \c A and trying to solve the initial problem again:
+
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp modifyA
+  </td>
+  <td>\snippet TutorialInplaceLU.out modifyA
+  </td>
+</tr>
+</table>
+
+Note that there is no shared pointer under the hood, it is the \b responsibility \b of \b the \b user to keep the input matrix \c A alive as long as \c lu is in use.
+
+If one wants to update the factorization with the modified A, one has to call the compute method as usual:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute
+  </td>
+</tr>
+</table>
+
+Note that calling compute does not change the memory which is referenced by the \c lu object. Therefore, if the compute method is called with another matrix \c A1 different from \c A, then the content of \c A1 won't be modified. It is still the memory of \c A that will be used to store the L and U factors of the matrix \c A1.
+This can easily be verified as follows:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis0
+ </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis0
+ </td>
+</tr>
+</table>
+The matrix \c A1 is unchanged, and one can thus solve A1*x=b, and directly check the residual without any copy of \c A1:
+<table class="example">
+<tr>
+  <td>\snippet TutorialInplaceLU.cpp recompute_bis1
+  </td>
+  <td>\snippet TutorialInplaceLU.out recompute_bis1
+ </td>
+</tr>
+</table>
+
+
+Here is the list of matrix decompositions supporting this inplace mechanism:
+
+- class LLT
+- class LDLT
+- class PartialPivLU
+- class FullPivLU
+- class HouseholderQR
+- class ColPivHouseholderQR
+- class FullPivHouseholderQR
+- class CompleteOrthogonalDecomposition
+
+*/
+
+}
\ No newline at end of file
diff --git a/doc/Manual.dox b/doc/Manual.dox
index 70aaa9a42..a08609ad7 100644
--- a/doc/Manual.dox
+++ b/doc/Manual.dox
@@ -3,21 +3,31 @@
 
 namespace Eigen {
 
+/** \page UserManual_CustomizingEigen Extending/Customizing Eigen
+  %Eigen can be extended in several ways, for instance, by defining global methods, by inserting custom methods within %Eigen's main classes through the \ref TopicCustomizing_Plugins "plugin" mechanism, by adding support for \ref TopicCustomizing_CustomScalar "custom scalar types", etc. See below for the respective sub-topics.
+  - \subpage TopicCustomizing_Plugins
+  - \subpage TopicCustomizing_InheritingMatrix
+  - \subpage TopicCustomizing_CustomScalar
+  - \subpage TopicCustomizing_NullaryExpr
+  - \subpage TopicNewExpressionType
+  \sa \ref TopicPreprocessorDirectives
+*/
+
+
 /** \page UserManual_Generalities General topics
   - \subpage Eigen2ToEigen3
   - \subpage TopicFunctionTakingEigenTypes
   - \subpage TopicPreprocessorDirectives
   - \subpage TopicAssertions
-  - \subpage TopicCustomizingEigen
   - \subpage TopicMultiThreading
+  - \subpage TopicUsingBlasLapack
   - \subpage TopicUsingIntelMKL
   - \subpage TopicCUDA
   - \subpage TopicPitfalls
   - \subpage TopicTemplateKeyword
-  - \subpage TopicNewExpressionType
   - \subpage UserManual_UnderstandingEigen
 */
-  
+
 /** \page UserManual_UnderstandingEigen Understanding Eigen
   - \subpage TopicInsideEigenExample
   - \subpage TopicClassHierarchy
@@ -90,6 +100,9 @@ namespace Eigen {
 /** \addtogroup Householder_Module
     \ingroup DenseMatrixManipulation_Reference */ 
 
+/** \addtogroup CoeffwiseMathFunctions
+    \ingroup DenseMatrixManipulation_chapter */
+
 /** \addtogroup QuickRefPage
     \ingroup DenseMatrixManipulation_chapter */
 
@@ -103,6 +116,10 @@ namespace Eigen {
     \ingroup DenseLinearSolvers_chapter */
 /** \addtogroup LeastSquares 
     \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup InplaceDecomposition
+    \ingroup DenseLinearSolvers_chapter */
+/** \addtogroup DenseDecompositionBenchmark
+    \ingroup DenseLinearSolvers_chapter */
 
 /** \addtogroup DenseLinearSolvers_Reference
     \ingroup DenseLinearSolvers_chapter */
diff --git a/doc/MatrixfreeSolverExample.dox b/doc/MatrixfreeSolverExample.dox
index 000cb0bbe..3efa292b5 100644
--- a/doc/MatrixfreeSolverExample.dox
+++ b/doc/MatrixfreeSolverExample.dox
@@ -6,12 +6,12 @@ namespace Eigen {
 \eigenManualPage MatrixfreeSolverExample Matrix-free solvers
 
 Iterative solvers such as ConjugateGradient and BiCGSTAB can be used in a matrix free context. To this end, user must provide a wrapper class inheriting EigenBase<> and implementing the following methods:
- - Index rows() and Index cols(): returns number of rows and columns respectively
- - operator* with and %Eigen dense column vector (its actual implementation goes in a specialization of the internal::generic_product_impl class)
+ - \c Index \c rows() and \c Index \c cols(): returns number of rows and columns respectively
+ - \c operator* with your type and an %Eigen dense column vector (its actual implementation goes in a specialization of the internal::generic_product_impl class)
 
-Eigen::internal::traits<> must also be specialized for the wrapper type.
+\c Eigen::internal::traits<> must also be specialized for the wrapper type.
 
-Here is a complete example wrapping a Eigen::SparseMatrix:
+Here is a complete example wrapping an Eigen::SparseMatrix:
 \include matrixfree_cg.cpp
 Output: \verbinclude matrixfree_cg.out
 
diff --git a/doc/NewExpressionType.dox b/doc/NewExpressionType.dox
index ad8b7f86b..c2f243312 100644
--- a/doc/NewExpressionType.dox
+++ b/doc/NewExpressionType.dox
@@ -2,6 +2,12 @@ namespace Eigen {
 
 /** \page TopicNewExpressionType Adding a new expression type
 
+<!--<span style="font-size:130%; color:red; font-weight: 900;"></span>-->
+\warning
+Disclaimer: this page is tailored to very advanced users who are not afraid of dealing with some of %Eigen's internal aspects.
+In most cases, a custom expression can be avoided by either using custom \ref MatrixBase::unaryExpr "unary" or \ref MatrixBase::binaryExpr "binary" functors,
+while extremely complex matrix manipulations can be achieved by nullary functors as described in the \ref TopicCustomizing_NullaryExpr "previous page".
+
 This page describes with the help of an example how to implement a new
 light-weight expression type in %Eigen. This consists of three parts:
 the expression type itself, a traits class containing compile-time
@@ -130,7 +136,7 @@ function can be called.
 If all the fragments are combined, the following output is produced,
 showing that the program works as expected:
 
-\verbinclude make_circulant.out
+\include make_circulant.out
 
 */
 }
diff --git a/doc/Overview.dox b/doc/Overview.dox
index 9ab96233a..dbb49bd21 100644
--- a/doc/Overview.dox
+++ b/doc/Overview.dox
@@ -17,7 +17,9 @@ You're a MatLab user? There is also a <a href="AsciiQuickReference.txt">short AS
 The \b main \b documentation is organized into \em chapters covering different domains of features.
 They are themselves composed of \em user \em manual pages describing the different features in a comprehensive way, and \em reference pages that gives you access to the API documentation through the related Eigen's \em modules and \em classes.
 
-Under the  \subpage UserManual_Generalities section, you will find documentation on more general topics such as preprocessor directives, controlling assertions, multi-threading, MKL support, some Eigen's internal insights, and much more...
+Under the \subpage UserManual_CustomizingEigen section, you will find discussions and examples on extending %Eigen's features and supporting custom scalar types.
+
+Under the \subpage UserManual_Generalities section, you will find documentation on more general topics such as preprocessor directives, controlling assertions, multi-threading, MKL support, some Eigen's internal insights, and much more...
 
 Finally, do not miss the search engine, useful to quickly get to the documentation of a given class or function.
 
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index 14e84bc20..2f9c4c370 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -49,6 +49,36 @@ are doing.
    the correct size. Not defined by default.
 
 
+\section TopicPreprocessorDirectivesCppVersion C++ standard features
+
+By default, %Eigen strives to automatically detect and enable language features at compile-time based on
+the information provided by the compiler.
+
+ - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER.
+   Possible values are: 03, 11, 14, 17, etc. If not defined (the default), %Eigen enables all features supported
+   by the compiler.
+
+Individual features can be explicitly enabled or disabled by defining the following tokens to 1 or 0 respectively.
+For instance, one might limit the C++ version to C++03 by defining EIGEN_MAX_CPP_VER=03, but still enable C99 math
+functions by defining EIGEN_HAS_C99_MATH=1.
+
+ - \b EIGEN_HAS_C99_MATH - controls the usage of C99 math functions such as erf, erfc, lgamma, etc.
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CXX11_MATH - controls the implementation of some functions such as round, log1p, isinf, isnan, etc.
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_RVALUE_REFERENCES - defines whether rvalue references are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_STD_RESULT_OF - defines whether std::result_of is supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_VARIADIC_TEMPLATES - defines whether variadic templates are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CONSTEXPR - defines whether relaxed constant expressions are supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<14.
+ - \b EIGEN_HAS_CXX11_CONTAINERS - defines whether STL's containers follow C++11 specifications
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_HAS_CXX11_NOEXCEPT - defines whether noexcept is supported
+   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+
 \section TopicPreprocessorDirectivesAssertions Assertions
 
 The %Eigen library contains many assertions to guard against programming errors, both at compile time and at
@@ -78,6 +108,9 @@ run time. However, these assertions do cost time and can thus be turned off.
    See \ref TopicMultiThreading for details.
  - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
    alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
+ - \b EIGEN_UNALIGNED_VECTORIZE - disables/enables vectorization with unaligned stores. Default is 1 (enabled).
+   If set to 0 (disabled), then expressions for which the destination cannot be aligned are not vectorized (e.g., unaligned
+   small fixed size vectors or matrices).
  - \b EIGEN_FAST_MATH - enables some optimizations which might affect the accuracy of the result. This currently
    enables the SSE vectorization of sin() and cos(), and speedups sqrt() for single precision. Defined to 1 by default.
    Define it to 0 to disable.
diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox
index ee4f53a4e..fc33b93e7 100644
--- a/doc/SparseLinearSystems.dox
+++ b/doc/SparseLinearSystems.dox
@@ -76,6 +76,9 @@ They are summarized in the following tables:
 <tr><td>SPQR</td><td>\link SPQRSupport_Module SPQRSupport \endlink  </td> <td> QR factorization </td> 
     <td> Any, rectangular</td><td>fill-in reducing, multithreaded, fast dense algebra</td>
     <td> requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
+<tr><td>PardisoLLT \n PardisoLDLT \n PardisoLU</td><td>\link PardisoSupport_Module PardisoSupport \endlink</td><td>Direct LLt, LDLt, LU factorizations</td><td>SPD \n SPD \n Square</td><td>Fill-in reducing, Leverage fast dense algebra, Multithreading</td>
+    <td>Requires the <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel MKL</a> package, \b Proprietary </td>
+    <td>optimized for tough problem patterns, see also \link TopicUsingIntelMKL using MKL with Eigen \endlink</td></tr>
 </table>
 
 Here \c SPD means symmetric positive definite.
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index e0a30edcc..a25622e80 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -206,7 +206,7 @@ See \ref TutorialSparse_SubMatrices and below for read-write sub-matrices.
   sm1.innerVectors(start, size);    // RW
   sm1.leftCols(size);               // RW
   sm2.rightCols(size);              // RO because sm2 is row-major
-  sm1.middleRows(start, numRows);   // RO becasue sm1 is column-major
+  sm1.middleRows(start, numRows);   // RO because sm1 is column-major
   sm1.middleCols(start, numCols);   // RW
   sm1.col(j);                       // RW
 \endcode
@@ -253,6 +253,20 @@ If the matrix is not in compressed form, makeCompressed() should be called befor
 Note that these functions are mostly provided for interoperability purposes with external libraries.\n
 A better access to the values of the matrix is done by using the InnerIterator class as described in \link TutorialSparse the Tutorial Sparse \endlink section</td>
 </tr>
+<tr class="alt"><td colspan="2">Mapping external buffers</td></tr>
+<tr class="alt">
+<td>
+\code
+int outerIndexPtr[cols+1];
+int innerIndices[nnz];
+double values[nnz];
+Map<SparseMatrix<double> > sm1(rows,cols,nnz,outerIndexPtr, // read-write
+                               innerIndices,values);
+Map<const SparseMatrix<double> > sm2(...);                  // read-only
+\endcode
+</td>
+<td>As for dense matrices, class Map<SparseMatrixType> can be used to see external buffers as an %Eigen SparseMatrix object. </td>
+</tr>
 </table>
 */
 }
diff --git a/doc/TopicAssertions.dox b/doc/TopicAssertions.dox
index 4ead40174..c8b4d84f2 100644
--- a/doc/TopicAssertions.dox
+++ b/doc/TopicAssertions.dox
@@ -16,7 +16,7 @@ Both eigen_assert and eigen_plain_assert are defined in Macros.h. Defining eigen
 #include <stdexcept>
 #undef eigen_assert
 #define eigen_assert(x) \
-  if (!x) { throw (std::runtime_error("Put your message here")); }
+  if (!(x)) { throw (std::runtime_error("Put your message here")); }
 \endcode
 
 \subsection DisableAssert Disabling assertions
diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox
index 5bcff2c96..491470627 100644
--- a/doc/TopicLinearAlgebraDecompositions.dox
+++ b/doc/TopicLinearAlgebraDecompositions.dox
@@ -4,6 +4,7 @@ namespace Eigen {
 
 This page presents a catalogue of the dense matrix decompositions offered by Eigen.
 For an introduction on linear solvers and decompositions, check this \link TutorialLinearAlgebra page \endlink.
+To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
 
 \section TopicLinAlgBigTable Catalogue of decompositions offered by Eigen
 
@@ -256,6 +257,7 @@ For an introduction on linear solvers and decompositions, check this \link Tutor
     <dd></dd>
 </dl>
 
+
 */
 
 }
diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox
index f0f84d25f..95d95a2d5 100644
--- a/doc/UnalignedArrayAssert.dox
+++ b/doc/UnalignedArrayAssert.dox
@@ -92,27 +92,28 @@ Note that here, Eigen::Quaternionf is only used as an example, more generally th
 
 \section explanation General explanation of this assertion
 
-\ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" must absolutely be created at 16-byte-aligned locations, otherwise SIMD instructions adressing them will crash.
+\ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" must absolutely be created at 16-byte-aligned locations, otherwise SIMD instructions addressing them will crash.
 
 Eigen normally takes care of these alignment issues for you, by setting an alignment attribute on them and by overloading their "operator new".
 
 However there are a few corner cases where these alignment settings get overridden: they are the possible causes for this assertion.
 
-\section getrid I don't care about vectorization, how do I get rid of that stuff?
+\section getrid I don't care about optimal vectorization, how do I get rid of that stuff?
 
-Two possibilities:
+Three possibilities:
 <ul>
-  <li>Define EIGEN_DONT_ALIGN_STATICALLY. That disables all 128-bit static alignment code, while keeping 128-bit heap alignment. This has the effect of
-      disabling vectorization for fixed-size objects (like Matrix4d) while keeping vectorization of dynamic-size objects
-      (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of 128-bit static alignment.</li>
-  <li>Or define both EIGEN_DONT_VECTORIZE and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the
-      128-bit alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
+  <li>Use the \c DontAlign option to Matrix, Array, Quaternion, etc. objects that give you trouble. This way Eigen won't try to align them, and thus won't assume any special alignment. On the down side, you will pay the cost of unaligned loads/stores for them, but on modern CPUs, the overhead is either null or marginal. See \link StructHavingEigenMembers_othersolutions here \endlink for an example.</li>
+  <li>Define \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN_STATICALLY \endlink. That disables all 16-byte (and above) static alignment code, while keeping 16-byte (or above) heap alignment. This has the effect of
+      vectorizing fixed-size objects (like Matrix4d) through unaligned stores (as controlled by \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink), while keeping unchanged the vectorization of dynamic-size objects
+      (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of static alignment.</li>
+  <li>Or define both \link TopicPreprocessorDirectivesPerformance  EIGEN_DONT_VECTORIZE \endlink and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the
+      16-byte alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
 </ul>
 
-If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable 128-bit alignment and the assertion, here's the explanation:
+If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable 16-byte alignment and the assertion, here's the explanation:
 
 It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization.
-It doesn't disable 128bit alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.
+It doesn't disable 16-byte alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.
 
 */
 
diff --git a/doc/UsingBlasLapackBackends.dox b/doc/UsingBlasLapackBackends.dox
new file mode 100644
index 000000000..caa597122
--- /dev/null
+++ b/doc/UsingBlasLapackBackends.dox
@@ -0,0 +1,133 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2011-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Documentation on the use of BLAS/LAPACK libraries through Eigen
+ ********************************************************************************
+*/
+
+namespace Eigen {
+
+/** \page TopicUsingBlasLapack Using BLAS/LAPACK from %Eigen
+
+
+Since %Eigen version 3.3 and later, any F77 compatible BLAS or LAPACK libraries can be used as backends for dense matrix products and dense matrix decompositions.
+For instance, one can use <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php">Intel® MKL</a>, Apple's Accelerate framework on OSX, <a href="http://www.openblas.net/">OpenBLAS</a>, <a href="http://www.netlib.org/lapack">Netlib LAPACK</a>, etc.
+
+Do not miss this \link TopicUsingIntelMKL page \endlink for further discussions on the specific use of Intel® MKL (also includes VML, PARDISO, etc.)
+
+In order to use an external BLAS and/or LAPACK library, you must link your own application to the respective libraries and their dependencies.
+For LAPACK, you must also link to the standard <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> library, which is used as a convenient thin layer between %Eigen's C++ code and LAPACK F77 interface. Then you must activate their usage by defining one or multiple of the following macros (\b before including any %Eigen's header):
+
+\note For Mac users, in order to use the lapack version shipped with the Accelerate framework, you also need the lapacke library.
+Using <a href="https://www.macports.org/">MacPorts</a>, this is as easy as:
+\code
+sudo port install lapack
+\endcode
+and then use the following link flags: \c -framework \c Accelerate \c /opt/local/lib/lapack/liblapacke.dylib
+
+<table class="manual">
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface)</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack (compatible with any F77 LAPACK interface)</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower numerical robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+</table>
+
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to BLAS or LAPACK routines.
+These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
+Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
+
+The breadth of %Eigen functionality that can be substituted is listed in the table below.
+<table class="manual">
+<tr><th>Functional domain</th><th>Code example</th><th>BLAS/LAPACK routines</th></tr>
+<tr><td>Matrix-matrix operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1*m2.transpose();
+m1.selfadjointView<Lower>()*m2;
+m1*m2.triangularView<Upper>();
+m1.selfadjointView<Lower>().rankUpdate(m2,1.0);
+\endcode</td><td>\code
+?gemm
+?symm/?hemm
+?trmm
+dsyrk/ssyrk
+\endcode</td></tr>
+<tr class="alt"><td>Matrix-vector operations \n \c EIGEN_USE_BLAS </td><td>\code
+m1.adjoint()*b;
+m1.selfadjointView<Lower>()*b;
+m1.triangularView<Upper>()*b;
+\endcode</td><td>\code
+?gemv
+?symv/?hemv
+?trmv
+\endcode</td></tr>
+<tr><td>LU decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m1.lu().solve(v2);
+\endcode</td><td>\code
+?getrf
+\endcode</td></tr>
+<tr class="alt"><td>Cholesky decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+v1 = m2.selfadjointView<Upper>().llt().solve(v2);
+\endcode</td><td>\code
+?potrf
+\endcode</td></tr>
+<tr><td>QR decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+m1.householderQr();
+m1.colPivHouseholderQr();
+\endcode</td><td>\code
+?geqrf
+?geqp3
+\endcode</td></tr>
+<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE </td><td>\code
+JacobiSVD<MatrixXd> svd;
+svd.compute(m1, ComputeThinV);
+\endcode</td><td>\code
+?gesvd
+\endcode</td></tr>
+<tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+EigenSolver<MatrixXd> es(m1);
+ComplexEigenSolver<MatrixXcd> ces(m1);
+SelfAdjointEigenSolver<MatrixXd> saes(m1+m1.transpose());
+GeneralizedSelfAdjointEigenSolver<MatrixXd>
+    gsaes(m1+m1.transpose(),m2+m2.transpose());
+\endcode</td><td>\code
+?gees
+?gees
+?syev/?heev
+?syev/?heev,
+?potrf
+\endcode</td></tr>
+<tr class="alt"><td>Schur decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
+RealSchur<MatrixXd> schurR(m1);
+ComplexSchur<MatrixXcd> schurC(m1);
+\endcode</td><td>\code
+?gees
+\endcode</td></tr>
+</table>
+In the examples, m1 and m2 are dense matrices and v1 and v2 are dense vectors.
+
+*/
+
+}
diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox
index dbe559e53..a1a3a18f2 100644
--- a/doc/UsingIntelMKL.dox
+++ b/doc/UsingIntelMKL.dox
@@ -32,107 +32,45 @@
 
 namespace Eigen {
 
-/** \page TopicUsingIntelMKL Using Intel® Math Kernel Library from Eigen
+/** \page TopicUsingIntelMKL Using Intel® MKL from %Eigen
 
-\section TopicUsingIntelMKL_Intro Eigen and Intel® Math Kernel Library (Intel® MKL)
+<!-- \section TopicUsingIntelMKL_Intro Eigen and Intel® Math Kernel Library (Intel® MKL) -->
+
+Since %Eigen version 3.1 and later, users can benefit from built-in Intel® Math Kernel Library (MKL) optimizations with an installed copy of Intel MKL 10.3 (or later).
 
-Since Eigen version 3.1 and later, users can benefit from built-in Intel MKL optimizations with an installed copy of Intel MKL 10.3 (or later).
 <a href="http://eigen.tuxfamily.org/Counter/redirect_to_mkl.php"> Intel MKL </a> provides highly optimized multi-threaded mathematical routines for x86-compatible architectures.
 Intel MKL is available on Linux, Mac and Windows for both Intel64 and IA32 architectures.
 
 \note
 Intel® MKL is a proprietary software and it is the responsibility of users to buy or register for community (free) Intel MKL licenses for their products. Moreover, the license of the user product has to allow linking to proprietary software that excludes any unmodified versions of the GPL.
 
-Using Intel MKL through Eigen is easy:
--# define the \c EIGEN_USE_MKL_ALL macro before including any Eigen's header
+Using Intel MKL through %Eigen is easy:
+-# define the \c EIGEN_USE_MKL_ALL macro before including any %Eigen's header
 -# link your program to MKL libraries (see the <a href="http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/">MKL linking advisor</a>)
 -# on a 64bits system, you must use the LP64 interface (not the ILP64 one)
 
-When doing so, a number of Eigen's algorithms are silently substituted with calls to Intel MKL routines.
+When doing so, a number of %Eigen's algorithms are silently substituted with calls to Intel MKL routines.
 These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
 Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
 
 In addition you can choose which parts will be substituted by defining one or multiple of the following macros:
 
 <table class="manual">
-<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface, not only Intel MKL)</td></tr>
-<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Intel Lapacke</a> C interface to Lapack (currently works with Intel MKL only)</td></tr>
-<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithm of lower robustness are disabled. This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines</td></tr>
+<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Lapacke</a> C interface to Lapack</td></tr>
+<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower robustness are disabled. \n This currently concerns only JacobiSVD which otherwise would be replaced by \c gesvd that is less robust than Jacobi rotations.</td></tr>
 <tr class="alt"><td>\c EIGEN_USE_MKL_VML </td><td>Enables the use of Intel VML (vector operations)</td></tr>
 <tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
 </table>
 
+Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
+
 Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
 
-
-\section TopicUsingIntelMKL_SupportedFeatures List of supported features
-
-The breadth of Eigen functionality covered by Intel MKL is listed in the table below.
+The following table summarizes the list of functions covered by \c EIGEN_USE_MKL_VML:
 <table class="manual">
-<tr><th>Functional domain</th><th>Code example</th><th>MKL routines</th></tr>
-<tr><td>Matrix-matrix operations \n \c EIGEN_USE_BLAS </td><td>\code
-m1*m2.transpose();
-m1.selfadjointView<Lower>()*m2;
-m1*m2.triangularView<Upper>();
-m1.selfadjointView<Lower>().rankUpdate(m2,1.0);
-\endcode</td><td>\code
-?gemm
-?symm/?hemm
-?trmm
-dsyrk/ssyrk
-\endcode</td></tr>
-<tr class="alt"><td>Matrix-vector operations \n \c EIGEN_USE_BLAS </td><td>\code
-m1.adjoint()*b;
-m1.selfadjointView<Lower>()*b;
-m1.triangularView<Upper>()*b;
-\endcode</td><td>\code
-?gemv
-?symv/?hemv
-?trmv
-\endcode</td></tr>
-<tr><td>LU decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-v1 = m1.lu().solve(v2);
-\endcode</td><td>\code
-?getrf
-\endcode</td></tr>
-<tr class="alt"><td>Cholesky decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-v1 = m2.selfadjointView<Upper>().llt().solve(v2);
-\endcode</td><td>\code
-?potrf
-\endcode</td></tr>
-<tr><td>QR decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-m1.householderQr();
-m1.colPivHouseholderQr();
-\endcode</td><td>\code
-?geqrf
-?geqp3
-\endcode</td></tr>
-<tr class="alt"><td>Singular value decomposition \n \c EIGEN_USE_LAPACKE </td><td>\code
-JacobiSVD<MatrixXd> svd;
-svd.compute(m1, ComputeThinV);
-\endcode</td><td>\code
-?gesvd
-\endcode</td></tr>
-<tr><td>Eigen-value decompositions \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-EigenSolver<MatrixXd> es(m1);
-ComplexEigenSolver<MatrixXcd> ces(m1);
-SelfAdjointEigenSolver<MatrixXd> saes(m1+m1.transpose());
-GeneralizedSelfAdjointEigenSolver<MatrixXd>
-    gsaes(m1+m1.transpose(),m2+m2.transpose());
-\endcode</td><td>\code
-?gees
-?gees
-?syev/?heev
-?syev/?heev,
-?potrf
-\endcode</td></tr>
-<tr class="alt"><td>Schur decomposition \n \c EIGEN_USE_LAPACKE \n \c EIGEN_USE_LAPACKE_STRICT </td><td>\code
-RealSchur<MatrixXd> schurR(m1);
-ComplexSchur<MatrixXcd> schurC(m1);
-\endcode</td><td>\code
-?gees
-\endcode</td></tr>
-<tr><td>Vector Math \n \c EIGEN_USE_MKL_VML </td><td>\code
+<tr><th>Code example</th><th>MKL routines</th></tr>
+<tr><td>\code
 v2=v1.array().sin();
 v2=v1.array().asin();
 v2=v1.array().cos();
@@ -156,7 +94,7 @@ v?Sqr
 v?Powx
 \endcode</td></tr>
 </table>
-In the examples, m1 and m2 are dense matrices and v1 and v2 are dense vectors.
+In the examples, v1 and v2 are dense vectors.
 
 
 \section TopicUsingIntelMKL_Links Links
diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css
index 60243d870..6274e6c70 100644
--- a/doc/eigendoxy.css
+++ b/doc/eigendoxy.css
@@ -45,7 +45,7 @@ pre.fragment {
 
 /* Common style for all Eigen's tables */
 
-table.example, table.manual, table.manual-vl {
+table.example, table.manual, table.manual-vl, table.manual-hl {
     max-width:100%;
     border-collapse: collapse;
     border-style: solid;
@@ -58,7 +58,7 @@ table.example, table.manual, table.manual-vl {
     -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
 }
 
-table.example th, table.manual th, table.manual-vl th {
+table.example th, table.manual th, table.manual-vl th, table.manual-hl th {
   padding: 0.5em 0.5em 0.5em 0.5em;
   text-align: left;
   padding-right: 1em;
@@ -70,7 +70,7 @@ table.example th, table.manual th, table.manual-vl th {
   filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#FFFFFF', endColorstr='#F4F4E5');
 }
 
-table.example td, table.manual td, table.manual-vl td {
+table.example td, table.manual td, table.manual-vl td, table.manual-hl td {
   vertical-align:top;
   border-width: 1px;
   border-color: #cccccc;
@@ -108,15 +108,15 @@ table.example td {
 
 /* standard class for the manual */
 
-table.manual, table.manual-vl {
+table.manual, table.manual-vl, table.manual-hl {
     padding: 0.2em 0em 0.5em 0em;
 }
 
-table.manual th, table.manual-vl th {
+table.manual th, table.manual-vl th, table.manual-hl th {
   margin: 0em 0em 0.3em 0em;
 }
 
-table.manual td, table.manual-vl td {
+table.manual td, table.manual-vl td, table.manual-hl td {
   padding: 0.3em 0.5em 0.3em 0.5em;
   vertical-align:top;
   border-width: 1px;
@@ -136,6 +136,16 @@ table.manual-vl th.inter {
   border-style: solid solid solid solid;
 }
 
+table.manual-hl td {  /* cells of the "manual-hl" table variant */
+  border-color: #cccccc;
+  border-width: 1px;
+  border-style: solid none solid none;  /* top and bottom rules only; no left/right borders */
+}
+
+table td.code {  /* any table cell carrying the "code" class */
+  font-family: monospace;
+}
+
 h2 {
   margin-top:2em;
   border-style: none none solid none;
@@ -166,6 +176,11 @@ div.toc ul {
   margin: 0.2em 0 0.4em 0.5em;
 }
 
+span.cpp11,span.cpp14,span.cpp17 {  /* one shared look for the cpp11/cpp14/cpp17 span classes */
+  color: #119911;  /* green */
+  font-weight: bold;
+}
+
 /**** old Eigen's styles ****/
 
 
@@ -177,8 +192,8 @@ table.tutorial_code td {
 
 
 /* Whenever doxygen meets a '\n' or a '<BR/>', it will put 
- * the text containing the characted into a <p class="starttd">.
- * This little hack togehter with table.tutorial_code td.note
+ * the text containing the character into a <p class="starttd">.
+ * This little hack together with table.tutorial_code td.note
  * aims at fixing this issue. */
 table.tutorial_code td.note p.starttd {
   margin: 0px;
diff --git a/doc/examples/CMakeLists.txt b/doc/examples/CMakeLists.txt
index 08cf8efd7..f7a19055f 100644
--- a/doc/examples/CMakeLists.txt
+++ b/doc/examples/CMakeLists.txt
@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
   )
   add_dependencies(all_examples ${example})
 endforeach(example_src)
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)  # does the compiler accept -std=c++11?
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+  ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")  # applied to the nullary_indexing example only
+endif()
diff --git a/doc/examples/Cwise_erf.cpp b/doc/examples/Cwise_erf.cpp
new file mode 100644
index 000000000..e7cd2c1c0
--- /dev/null
+++ b/doc/examples/Cwise_erf.cpp
@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);             // four sample inputs
+  std::cout << v.erf() << std::endl;  // print v.erf(); presumably applied coefficient-wise (Cwise_* example) — confirm
+}
diff --git a/doc/examples/Cwise_erfc.cpp b/doc/examples/Cwise_erfc.cpp
new file mode 100644
index 000000000..d8bb04c30
--- /dev/null
+++ b/doc/examples/Cwise_erfc.cpp
@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(-0.5,2,0,-7);              // four sample inputs
+  std::cout << v.erfc() << std::endl;  // print v.erfc(); presumably applied coefficient-wise (Cwise_* example) — confirm
+}
diff --git a/doc/examples/Cwise_lgamma.cpp b/doc/examples/Cwise_lgamma.cpp
new file mode 100644
index 000000000..f1c4f503e
--- /dev/null
+++ b/doc/examples/Cwise_lgamma.cpp
@@ -0,0 +1,9 @@
+#include <Eigen/Core>
+#include <unsupported/Eigen/SpecialFunctions>
+#include <iostream>
+using namespace Eigen;
+int main()
+{
+  Array4d v(0.5,10,0,-1);                // four sample inputs
+  std::cout << v.lgamma() << std::endl;  // print v.lgamma(); presumably applied coefficient-wise (Cwise_* example) — confirm
+}
diff --git a/doc/examples/TutorialInplaceLU.cpp b/doc/examples/TutorialInplaceLU.cpp
new file mode 100644
index 000000000..cb9c59b60
--- /dev/null
+++ b/doc/examples/TutorialInplaceLU.cpp
@@ -0,0 +1,61 @@
+#include <iostream>
+struct init {  // helper whose constructor prints the opening tag of the first output section
+  init() { std::cout << "[" << "init" << "]" << std::endl; }  // tag is emitted in three pieces -- presumably so this source line does not itself contain the literal tag; TODO confirm
+};
+init init_obj;  // namespace-scope instance: its constructor runs during static initialization, before main()
+// [init]
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()  // walks through an in-place LU factorization; each step's output is wrapped in a pair of printed tags
+{
+  MatrixXd A(2,2);
+  A << 2, -1, 1, 3;
+  cout << "Here is the input matrix A before decomposition:\n" << A << endl;
+cout << "[init]" << endl;
+
+cout << "[declaration]" << endl;
+  PartialPivLU<Ref<MatrixXd> > lu(A);  // Ref-based matrix type: the factorization is stored in A's own memory, overwriting it
+  cout << "Here is the input matrix A after decomposition:\n" << A << endl;
+cout << "[declaration]" << endl;
+
+cout << "[matrixLU]" << endl;
+  cout << "Here is the matrix storing the L and U factors:\n" << lu.matrixLU() << endl;
+cout << "[matrixLU]" << endl;
+
+cout << "[solve]" << endl;
+  MatrixXd A0(2,2); A0 << 2, -1, 1, 3;  // separate copy of the original values, since A no longer holds them
+  VectorXd b(2);    b << 1, 2;
+  VectorXd x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[solve]" << endl;
+
+cout << "[modifyA]" << endl;
+  A << 3, 4, -2, 1;  // writes into the memory lu uses for its factors, so lu no longer describes A0
+  x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[modifyA]" << endl;
+
+cout << "[recompute]" << endl;
+  A0 = A; // save A
+  lu.compute(A);  // refactorize the new values in place; A0 keeps the unfactored copy for the residual
+  x = lu.solve(b);
+  cout << "Residual: " << (A0 * x - b).norm() << endl;
+cout << "[recompute]" << endl;
+
+cout << "[recompute_bis0]" << endl;
+  MatrixXd A1(2,2);
+  A1 << 5,-2,3,4;
+  lu.compute(A1);  // per the Eigen docs, A1 is left unmodified: the factors of A1 are still stored in A's memory
+  cout << "Here is the input matrix A1 after decomposition:\n" << A1 << endl;
+cout << "[recompute_bis0]" << endl;
+
+cout << "[recompute_bis1]" << endl;
+  x = lu.solve(b);
+  cout << "Residual: " << (A1 * x - b).norm() << endl;  // A1 can be used directly here because compute() did not alter it
+cout << "[recompute_bis1]" << endl;
+
+}
diff --git a/doc/examples/make_circulant2.cpp b/doc/examples/make_circulant2.cpp
new file mode 100644
index 000000000..95d3dd31a
--- /dev/null
+++ b/doc/examples/make_circulant2.cpp
@@ -0,0 +1,52 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [circulant_func]
+template<class ArgType>
+class circulant_functor {  // nullary functor yielding entry (row, col) of the circulant matrix defined by a vector
+  const ArgType &m_vec;  // held by reference: the vector must outlive every use of this functor
+public:
+  circulant_functor(const ArgType& arg) : m_vec(arg) {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    Index index = row - col;  // the entry depends only on the offset between row and column
+    if (index < 0) index += m_vec.size();  // wrap negative offsets around to the end of the vector
+    return m_vec(index);
+  }
+};
+// [circulant_func]
+
+// [square]
+template<class ArgType>
+struct circulant_helper {  // maps a vector type to the matching square, column-major matrix type
+  typedef Matrix<typename ArgType::Scalar,
+                 ArgType::SizeAtCompileTime,  // rows: the vector's compile-time size
+                 ArgType::SizeAtCompileTime,  // cols: same value, so the matrix is square
+                 ColMajor,
+                 ArgType::MaxSizeAtCompileTime,
+                 ArgType::MaxSizeAtCompileTime> MatrixType;
+};
+// [square]
+
+// [makeCirculant]
+template <class ArgType>
+CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
+makeCirculant(const Eigen::MatrixBase<ArgType>& arg)  // returns an expression object, not an evaluated matrix
+{
+  typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));  // size() x size() expression; the functor only references arg, so arg must stay alive until evaluation
+}
+// [makeCirculant]
+
+// [main]
+int main()
+{
+  Eigen::VectorXd vec(4);
+  vec << 1, 2, 4, 8;
+  Eigen::MatrixXd mat;
+  mat = makeCirculant(vec);  // assigning the nullary expression evaluates it into a 4x4 matrix
+  std::cout << mat << std::endl;
+}
+// [main]
diff --git a/doc/examples/nullary_indexing.cpp b/doc/examples/nullary_indexing.cpp
new file mode 100644
index 000000000..e27c3585a
--- /dev/null
+++ b/doc/examples/nullary_indexing.cpp
@@ -0,0 +1,66 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [functor]
+template<class ArgType, class RowIndexType, class ColIndexType>
+class indexing_functor {  // nullary functor that reads arg at positions chosen by a row-index and a column-index container
+  const ArgType &m_arg;  // all three members are references: the referenced objects must outlive the functor
+  const RowIndexType &m_rowIndices;
+  const ColIndexType &m_colIndices;
+public:
+  typedef Matrix<typename ArgType::Scalar,
+                 RowIndexType::SizeAtCompileTime,  // result has one row per row index
+                 ColIndexType::SizeAtCompileTime,  // and one column per column index
+                 ArgType::Flags&RowMajorBit?RowMajor:ColMajor,  // keep the storage order of the indexed argument
+                 RowIndexType::MaxSizeAtCompileTime,
+                 ColIndexType::MaxSizeAtCompileTime> MatrixType;
+
+  indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+    : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
+  {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    return m_arg(m_rowIndices[row], m_colIndices[col]);  // entry (row, col) of the result is arg(rowIndices[row], colIndices[col])
+  }
+};
+// [functor]
+
+// [function]
+template <class ArgType, class RowIndexType, class ColIndexType>
+CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
+indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)  // returns an expression object, not an evaluated matrix
+{
+  typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
+  typedef typename Func::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));  // the functor stores references only, so all three arguments must stay alive until the expression is evaluated
+}
+// [function]
+
+
+int main()
+{
+  std::cout << "[main1]\n";
+  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
+  Array3i ri(1,2,1);  // row indices; an index may appear more than once
+  ArrayXi ci(6); ci << 3,2,1,0,0,2;  // column indices; there may be more of them than A has columns
+  Eigen::MatrixXi B = indexing(A, ri, ci);  // evaluates the expression into a 3x6 matrix
+  std::cout << "A =" << std::endl;
+  std::cout << A << std::endl << std::endl;
+  std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
+  std::cout << B << std::endl;
+  std::cout << "[main1]\n";
+
+  std::cout << "[main2]\n";
+  B =  indexing(A, ri+1, ci);  // index arguments may be expressions; the temporary ri+1 lives until this statement ends, which covers the assignment
+  std::cout << "A(ri+1,ci) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#if __cplusplus >= 201103L  // the lambda below needs C++11
+  B =  indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));  // row indices 0..12 reduced modulo 4, columns 0..3
+  std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#endif
+  std::cout << "[main2]\n";
+}
+
diff --git a/doc/ftv2node.png b/doc/ftv2node.png
new file mode 100644
index 000000000..63c605bb4
Binary files /dev/null and b/doc/ftv2node.png differ
diff --git a/doc/ftv2pnode.png b/doc/ftv2pnode.png
new file mode 100644
index 000000000..c6ee22f93
Binary files /dev/null and b/doc/ftv2pnode.png differ
diff --git a/doc/snippets/CMakeLists.txt b/doc/snippets/CMakeLists.txt
index 1135900cf..1baf32fba 100644
--- a/doc/snippets/CMakeLists.txt
+++ b/doc/snippets/CMakeLists.txt
@@ -24,5 +24,3 @@ foreach(snippet_src ${snippets_SRCS})
   set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
                               PROPERTIES OBJECT_DEPENDS ${snippet_src})
 endforeach(snippet_src)
-
-ei_add_target_property(compile_tut_arithmetic_transpose_aliasing COMPILE_FLAGS -DEIGEN_NO_DEBUG)
diff --git a/doc/snippets/Cwise_erf.cpp b/doc/snippets/Cwise_erf.cpp
deleted file mode 100644
index 7f51c1b6a..000000000
--- a/doc/snippets/Cwise_erf.cpp
+++ /dev/null
@@ -1,2 +0,0 @@
-Array4d v(-0.5,2,0,-7);
-cout << v.erf() << endl;
diff --git a/doc/snippets/Cwise_erfc.cpp b/doc/snippets/Cwise_erfc.cpp
deleted file mode 100644
index f0453d4b1..000000000
--- a/doc/snippets/Cwise_erfc.cpp
+++ /dev/null
@@ -1,2 +0,0 @@
-Array4d v(-0.5,2,0,-7);
-cout << v.erfc() << endl;
diff --git a/doc/snippets/Cwise_lgamma.cpp b/doc/snippets/Cwise_lgamma.cpp
deleted file mode 100644
index cbc69b989..000000000
--- a/doc/snippets/Cwise_lgamma.cpp
+++ /dev/null
@@ -1,2 +0,0 @@
-Array4d v(0.5,10,0,-1);
-cout << v.lgamma() << endl;
\ No newline at end of file
diff --git a/doc/snippets/SparseMatrix_coeffs.cpp b/doc/snippets/SparseMatrix_coeffs.cpp
new file mode 100644
index 000000000..f71a69b07
--- /dev/null
+++ b/doc/snippets/SparseMatrix_coeffs.cpp
@@ -0,0 +1,9 @@
+SparseMatrix<double> A(3,3);
+A.insert(1,2) = 0;  // an explicitly inserted entry whose value happens to be zero
+A.insert(0,1) = 1;
+A.insert(2,0) = 2;
+A.makeCompressed();  // NOTE(review): coeffs() is documented for compressed storage only -- confirm against the Eigen reference
+cout << "The matrix A is:" << endl << MatrixXd(A) << endl;
+cout << "it has " << A.nonZeros() << " stored non zero coefficients that are: " << A.coeffs().transpose() << endl;
+A.coeffs() += 10;  // coeffs() is writable: this updates the stored entries in place
+cout << "After adding 10 to every stored non zero coefficient, the matrix A is:" << endl << MatrixXd(A) << endl;
diff --git a/doc/snippets/compile_snippet.cpp.in b/doc/snippets/compile_snippet.cpp.in
index fdae39bcf..d63f371a3 100644
--- a/doc/snippets/compile_snippet.cpp.in
+++ b/doc/snippets/compile_snippet.cpp.in
@@ -1,5 +1,8 @@
-#include <Eigen/Eigen>
+static bool eigen_did_assert = false;  // set once the first failed assertion has been reported, so later failures stay silent
+#define eigen_assert(X) if(!eigen_did_assert && !(X)){ std::cout << "### Assertion raised in " << __FILE__ << ":" << __LINE__ << ":\n" #X << "\n### The following would happen without assertions:\n"; eigen_did_assert = true;}  // defined ahead of the Eigen include; prints a notice and lets execution continue
+
 #include <iostream>
+#include <Eigen/Eigen>
 
 #ifndef M_PI
 #define M_PI 3.1415926535897932384626433832795
diff --git a/doc/special_examples/random_cpp11.cpp b/doc/special_examples/random_cpp11.cpp
index adc3c110c..33744c051 100644
--- a/doc/special_examples/random_cpp11.cpp
+++ b/doc/special_examples/random_cpp11.cpp
@@ -7,7 +7,7 @@ using namespace Eigen;
 int main() {
   std::default_random_engine generator;
   std::poisson_distribution<int> distribution(4.1);
-  auto poisson = [&] (Eigen::Index) {return distribution(generator);};
+  auto poisson = [&] () {return distribution(generator);};
 
   RowVectorXi v = RowVectorXi::NullaryExpr(10, poisson );
   std::cout << v << "\n";
diff --git a/lapack/svd.cpp b/lapack/svd.cpp
index df77a371c..77b302b6b 100644
--- a/lapack/svd.cpp
+++ b/lapack/svd.cpp
@@ -124,14 +124,15 @@ EIGEN_LAPACK_FUNC(gesvd,(char *jobu, char *jobv, int *m, int* n, Scalar* a, int
   JacobiSVD<PlainMatrixType> svd(mat,option);
   
   make_vector(s,diag_size) = svd.singularValues().head(diag_size);
-  
+  {
         if(*jobu=='A') matrix(u,*m,*m,*ldu)           = svd.matrixU();
   else  if(*jobu=='S') matrix(u,*m,diag_size,*ldu)    = svd.matrixU();
-  else  if(*jobu=='O') matrix(a,*m,diag_size,*lda)           = svd.matrixU();
-  
+  else  if(*jobu=='O') matrix(a,*m,diag_size,*lda)    = svd.matrixU();
+  }
+  {
         if(*jobv=='A') matrix(vt,*n,*n,*ldvt)         = svd.matrixV().adjoint();
   else  if(*jobv=='S') matrix(vt,diag_size,*n,*ldvt)  = svd.matrixV().adjoint();
   else  if(*jobv=='O') matrix(a,diag_size,*n,*lda)    = svd.matrixV().adjoint();
-    
+  }
   return 0;
 }
diff --git a/scripts/buildtests.in b/scripts/buildtests.in
index d2fd10276..526d5b74b 100755
--- a/scripts/buildtests.in
+++ b/scripts/buildtests.in
@@ -2,7 +2,7 @@
 
 if [[ $# != 1 || $1 == *help ]]
 then
-  echo "usage: ./check regexp"
+  echo "usage: $0 regexp"
   echo "  Builds tests matching the regexp."
   echo "  The EIGEN_MAKE_ARGS environment variable allows to pass args to 'make'."
   echo "    For example, to launch 5 concurrent builds, use EIGEN_MAKE_ARGS='-j5'"
diff --git a/scripts/check.in b/scripts/check.in
index a90061a57..7717e2d93 100755
--- a/scripts/check.in
+++ b/scripts/check.in
@@ -3,7 +3,7 @@
 
 if [[ $# != 1 || $1 == *help ]]
 then
-  echo "usage: ./check regexp"
+  echo "usage: $0 regexp"
   echo "  Builds and runs tests matching the regexp."
   echo "  The EIGEN_MAKE_ARGS environment variable allows to pass args to 'make'."
   echo "    For example, to launch 5 concurrent builds, use EIGEN_MAKE_ARGS='-j5'"
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7bed6a45c..e17985107 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -147,7 +147,7 @@ ei_add_test(nomalloc)
 ei_add_test(first_aligned)
 ei_add_test(nullary)
 ei_add_test(mixingtypes)
-ei_add_test(packetmath)
+ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
 ei_add_test(unalignedassert)
 ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
@@ -258,6 +258,11 @@ ei_add_test(rvalue_types)
 ei_add_test(dense_storage)
 ei_add_test(ctorleak)
 ei_add_test(mpl2only)
+ei_add_test(inplace_decomposition)
+ei_add_test(half_float)
+ei_add_test(array_of_string)
+
+add_executable(bug1213 bug1213.cpp bug1213_main.cpp)  # plain two-source executable; unlike the entries above it is not registered through ei_add_test
 
 check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH)
 if(COMPILER_SUPPORT_FASTMATH)
@@ -324,6 +329,16 @@ if(EIGEN_TEST_EIGEN2)
   message(WARNING "The Eigen2 test suite has been removed")
 endif()
 
+# boost MP unit test
+find_package(Boost)  # optional (no REQUIRED): the else() branch below records the backend as missing
+if(Boost_FOUND)
+  include_directories(${Boost_INCLUDE_DIRS})
+  ei_add_test(boostmultiprec "" "${Boost_LIBRARIES}")  # presumably: empty extra compile flags, then libraries to link -- TODO confirm against ei_add_test
+  ei_add_property(EIGEN_TESTED_BACKENDS "Boost.Multiprecision, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "Boost.Multiprecision, ")
+endif()
+
 
 # CUDA unit tests
 option(EIGEN_TEST_CUDA "Enable CUDA support in unit tests" OFF)
@@ -340,7 +355,7 @@ if(CUDA_FOUND)
   
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 
-    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index 9c895e0ac..bdea51c10 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -169,7 +169,7 @@ void test_adjoint()
   // test a large static matrix only once
   CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) );
 
-#ifdef EIGEN_TEST_PART_4
+#ifdef EIGEN_TEST_PART_13
   {
     MatrixXcf a(10,10), b(10,10);
     VERIFY_RAISES_ASSERT(a = a.transpose());
@@ -187,6 +187,13 @@ void test_adjoint()
     a.transpose() = a.adjoint();
     a.transpose() += a.adjoint();
     a.transpose() += a.adjoint() + b;
+
+    // regression tests for check_for_aliasing
+    MatrixXd c(10,10);
+    c = 1.0 * MatrixXd::Ones(10,10) + c;
+    c = MatrixXd::Ones(10,10) * 1.0 + c;
+    c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
+    c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
   }
 #endif
 }
diff --git a/test/array.cpp b/test/array.cpp
index beaa62221..15c3266a9 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -13,6 +13,7 @@ template<typename ArrayType> void array(const ArrayType& m)
 {
   typedef typename ArrayType::Index Index;
   typedef typename ArrayType::Scalar Scalar;
+  typedef typename ArrayType::RealScalar RealScalar;
   typedef Array<Scalar, ArrayType::RowsAtCompileTime, 1> ColVectorType;
   typedef Array<Scalar, 1, ArrayType::ColsAtCompileTime> RowVectorType;
 
@@ -72,7 +73,7 @@ template<typename ArrayType> void array(const ArrayType& m)
   VERIFY_IS_MUCH_SMALLER_THAN(abs(m1.rowwise().sum().sum() - m1.sum()), m1.abs().sum());
   if (!internal::isMuchSmallerThan(abs(m1.sum() - (m1+m2).sum()), m1.abs().sum(), test_precision<Scalar>()))
       VERIFY_IS_NOT_APPROX(((m1+m2).rowwise().sum()).sum(), m1.sum());
-  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar>()));
+  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar,Scalar>()));
 
   // vector-wise ops
   m3 = m1;
@@ -102,6 +103,22 @@ template<typename ArrayType> void array(const ArrayType& m)
   FixedArrayType f4(f1.data());
   VERIFY_IS_APPROX(f4, f1);
   
+  // pow
+  VERIFY_IS_APPROX(m1.pow(2), m1.square());
+  VERIFY_IS_APPROX(pow(m1,2), m1.square());
+  VERIFY_IS_APPROX(m1.pow(3), m1.cube());
+  VERIFY_IS_APPROX(pow(m1,3), m1.cube());
+  VERIFY_IS_APPROX((-m1).pow(3), -m1.cube());
+  VERIFY_IS_APPROX(pow(2*m1,3), 8*m1.cube());
+  ArrayType exponents = ArrayType::Constant(rows, cols, RealScalar(2));
+  VERIFY_IS_APPROX(Eigen::pow(m1,exponents), m1.square());
+  VERIFY_IS_APPROX(m1.pow(exponents), m1.square());
+  VERIFY_IS_APPROX(Eigen::pow(2*m1,exponents), 4*m1.square());
+  VERIFY_IS_APPROX((2*m1).pow(exponents), 4*m1.square());
+  VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square());
+  VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square());
+  VERIFY_IS_APPROX(Eigen::pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0)));
+
   // Check possible conflicts with 1D ctor
   typedef Array<Scalar, Dynamic, 1> OneDArrayType;
   OneDArrayType o1(rows);
@@ -217,12 +234,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
   VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
   VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
-#ifdef EIGEN_HAS_C99_MATH
-  VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));
-  VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
-  VERIFY_IS_APPROX(m1.erf(), erf(m1));
-  VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
-#endif  // EIGEN_HAS_C99_MATH
+
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
   VERIFY_IS_APPROX(m1.round(), round(m1));
   VERIFY_IS_APPROX(m1.floor(), floor(m1));
@@ -243,7 +255,9 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   m3 = m1.abs();
   VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m1)));
   VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m1)));
+  VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m1)));
   VERIFY_IS_APPROX(m3.log(), log(m3));
+  VERIFY_IS_APPROX(m3.log1p(), log1p(m3));
   VERIFY_IS_APPROX(m3.log10(), log10(m3));
 
 
@@ -275,27 +289,12 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   // shift argument of logarithm so that it is not zero
   Scalar smallNumber = NumTraits<Scalar>::dummy_precision();
   VERIFY_IS_APPROX((m3 + smallNumber).log() , log(abs(m1) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber + 1).log() , log1p(abs(m1) + smallNumber));
 
   VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2));
   VERIFY_IS_APPROX(m1.exp(), exp(m1));
   VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
 
-  VERIFY_IS_APPROX(m1.pow(2), m1.square());
-  VERIFY_IS_APPROX(pow(m1,2), m1.square());
-  VERIFY_IS_APPROX(m1.pow(3), m1.cube());
-  VERIFY_IS_APPROX(pow(m1,3), m1.cube());
-  VERIFY_IS_APPROX((-m1).pow(3), -m1.cube());
-  VERIFY_IS_APPROX(pow(2*m1,3), 8*m1.cube());
-
-  ArrayType exponents = ArrayType::Constant(rows, cols, RealScalar(2));
-  VERIFY_IS_APPROX(Eigen::pow(m1,exponents), m1.square());
-  VERIFY_IS_APPROX(m1.pow(exponents), m1.square());
-  VERIFY_IS_APPROX(Eigen::pow(2*m1,exponents), 4*m1.square());
-  VERIFY_IS_APPROX((2*m1).pow(exponents), 4*m1.square());
-  VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square());
-  VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square());
-  VERIFY_IS_APPROX(pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0)));
-
   VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
 
@@ -310,122 +309,6 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
   VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
 
-#ifdef EIGEN_HAS_C99_MATH
-  // check special functions (comparing against numpy implementation)
-  if (!NumTraits<Scalar>::IsComplex) {
-    VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329));
-    VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645));
-    VERIFY_IS_APPROX(numext::digamma(Scalar(4)), RealScalar(1.2561176684318));
-    VERIFY_IS_APPROX(numext::digamma(Scalar(-10.5)), RealScalar(2.398239129535781));
-    VERIFY_IS_APPROX(numext::digamma(Scalar(10000.5)), RealScalar(9.210340372392849));
-    VERIFY_IS_EQUAL(numext::digamma(Scalar(0)),
-                    std::numeric_limits<RealScalar>::infinity());
-    VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)),
-                    std::numeric_limits<RealScalar>::infinity());
-    
-    // Check the zeta function against scipy.special.zeta
-    VERIFY_IS_APPROX(numext::zeta(Scalar(1.5), Scalar(2)), RealScalar(1.61237534869));
-    VERIFY_IS_APPROX(numext::zeta(Scalar(4), Scalar(1.5)), RealScalar(0.234848505667));
-    VERIFY_IS_APPROX(numext::zeta(Scalar(10.5), Scalar(3)), RealScalar(1.03086757337e-5));
-    VERIFY_IS_APPROX(numext::zeta(Scalar(10000.5), Scalar(1.0001)), RealScalar(0.367879440865));
-    VERIFY_IS_APPROX(numext::zeta(Scalar(3), Scalar(-2.5)), RealScalar(0.054102025820864097));
-    VERIFY_IS_EQUAL(numext::zeta(Scalar(1), Scalar(1.2345)), // The second scalar does not matter
-                    std::numeric_limits<RealScalar>::infinity());
-    VERIFY((numext::isnan)(numext::zeta(Scalar(0.9), Scalar(1.2345)))); // The second scalar does not matter
-    
-    // Check the polygamma against scipy.special.polygamma examples
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(2)), RealScalar(0.644934066848));
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(3)), RealScalar(0.394934066848));
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(25.5)), RealScalar(0.0399946696496));
-    VERIFY((numext::isnan)(numext::polygamma(Scalar(1.5), Scalar(1.2345)))); // The second scalar does not matter
-    
-    // Check the polygamma function over a larger range of values
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(17), Scalar(4.7)), RealScalar(293.334565435));
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(31), Scalar(11.8)), RealScalar(0.445487887616));
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(28), Scalar(17.7)), RealScalar(-2.47810300902e-07));
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(8), Scalar(30.2)), RealScalar(-8.29668781082e-09));
-    /* The following tests only pass for doubles because floats cannot handle the large values of
-       the gamma function.
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(42), Scalar(15.8)), RealScalar(-0.434562276666));
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(147), Scalar(54.1)), RealScalar(0.567742190178));
-    VERIFY_IS_APPROX(numext::polygamma(Scalar(170), Scalar(64)), RealScalar(-0.0108615497927));
-    */
-
-    {
-      // Test various propreties of igamma & igammac.  These are normalized
-      // gamma integrals where
-      //   igammac(a, x) = Gamma(a, x) / Gamma(a)
-      //   igamma(a, x) = gamma(a, x) / Gamma(a)
-      // where Gamma and gamma are considered the standard unnormalized
-      // upper and lower incomplete gamma functions, respectively.
-      ArrayType a = m1.abs() + 2;
-      ArrayType x = m2.abs() + 2;
-      ArrayType zero = ArrayType::Zero(rows, cols);
-      ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
-      ArrayType a_m1 = a - one;
-      ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp();
-      ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp();
-      ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
-      ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
-
-      // Gamma(a, 0) == Gamma(a)
-      VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
-
-      // Gamma(a, x) + gamma(a, x) == Gamma(a)
-      VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
-
-      // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
-      VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
-
-      // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
-      VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
-    }
-
-    // Check exact values of igamma and igammac against a third party calculation.
-    Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-    Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-
-    // location i*6+j corresponds to a_s[i], x_s[j].
-    Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-    Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
-                            {0.0, 0.6321205588285578, 0.7768698398515702,
-                             0.9816843611112658, 9.999500016666262e-05, 1.0},
-                            {0.0, 0.4275932955291202, 0.608374823728911,
-                             0.9539882943107686, 7.522076445089201e-07, 1.0},
-                            {0.0, 0.01898815687615381, 0.06564245437845008,
-                             0.5665298796332909, 4.166333347221828e-18, 1.0},
-                            {0.0, 0.9999780593618628, 0.9999899967080838,
-                             0.9999996219837988, 0.9991370418689945, 1.0},
-                            {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
-    Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
-                             {1.0, 0.36787944117144233, 0.22313016014842982,
-                              0.018315638888734182, 0.9999000049998333, 0.0},
-                             {1.0, 0.5724067044708798, 0.3916251762710878,
-                              0.04601170568923136, 0.9999992477923555, 0.0},
-                             {1.0, 0.9810118431238462, 0.9343575456215499,
-                              0.4334701203667089, 1.0, 0.0},
-                             {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
-                              3.7801620118431334e-07, 0.0008629581310054535,
-                              0.0},
-                             {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
-    for (int i = 0; i < 6; ++i) {
-      for (int j = 0; j < 6; ++j) {
-        if ((std::isnan)(igamma_s[i][j])) {
-          VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j])));
-        } else {
-          VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
-        }
-
-        if ((std::isnan)(igammac_s[i][j])) {
-          VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j])));
-        } else {
-          VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
-        }
-      }
-    }
-  }
-#endif  // EIGEN_HAS_C99_MATH
-
   // check inplace transpose
   m3 = m1;
   m3.transposeInPlace();
@@ -525,7 +408,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
 
   // scalar by array division
   Scalar  s1 = internal::random<Scalar>();
-  const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
+  const RealScalar tiny = std::sqrt(std::numeric_limits<RealScalar>::epsilon());
   s1 += Scalar(tiny);
   m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
   VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
@@ -605,7 +488,7 @@ void test_array()
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<int>::type, int >::value));
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<float>::type, float >::value));
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<Array2i>::type, ArrayBase<Array2i> >::value));
-  typedef CwiseUnaryOp<internal::scalar_multiple_op<double>, ArrayXd > Xpr;
+  typedef CwiseUnaryOp<internal::scalar_abs_op<double>, ArrayXd > Xpr;
   VERIFY((internal::is_same< internal::global_math_functions_filtering_base<Xpr>::type,
                            ArrayBase<Xpr>
                          >::value));
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index db5f3b34a..97e03be83 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -45,7 +45,7 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
   VERIFY_IS_MUCH_SMALLER_THAN(m1.rowwise().sum().sum() - m1.sum(), m1.squaredNorm());
   VERIFY_IS_MUCH_SMALLER_THAN(m1.colwise().sum() + m2.colwise().sum() - (m1+m2).colwise().sum(), (m1+m2).squaredNorm());
   VERIFY_IS_MUCH_SMALLER_THAN(m1.rowwise().sum() - m2.rowwise().sum() - (m1-m2).rowwise().sum(), (m1-m2).squaredNorm());
-  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar>()));
+  VERIFY_IS_APPROX(m1.colwise().sum(), m1.colwise().redux(internal::scalar_sum_op<Scalar,Scalar>()));
 
   // vector-wise ops
   m3 = m1;
@@ -144,9 +144,21 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
 template<typename VectorType> void lpNorm(const VectorType& v)
 {
   using std::sqrt;
+  typedef typename VectorType::RealScalar RealScalar;
   VectorType u = VectorType::Random(v.size());
 
-  VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), u.cwiseAbs().maxCoeff());
+  if(v.size()==0)
+  {
+    VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<1>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<2>(), RealScalar(0));
+    VERIFY_IS_APPROX(u.template lpNorm<5>(), RealScalar(0));
+  }
+  else
+  {
+    VERIFY_IS_APPROX(u.template lpNorm<Infinity>(), u.cwiseAbs().maxCoeff());
+  }
+
   VERIFY_IS_APPROX(u.template lpNorm<1>(), u.cwiseAbs().sum());
   VERIFY_IS_APPROX(u.template lpNorm<2>(), sqrt(u.array().abs().square().sum()));
   VERIFY_IS_APPROX(numext::pow(u.template lpNorm<5>(), typename VectorType::RealScalar(5)), u.array().abs().pow(5).sum());
@@ -255,6 +267,8 @@ void test_array_for_matrix()
     CALL_SUBTEST_5( lpNorm(VectorXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_4( lpNorm(VectorXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
+  CALL_SUBTEST_5( lpNorm(VectorXf(0)) );
+  CALL_SUBTEST_4( lpNorm(VectorXcf(0)) );
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_4( resize(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_5( resize(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/test/array_of_string.cpp b/test/array_of_string.cpp
new file mode 100644
index 000000000..e23b7c59e
--- /dev/null
+++ b/test/array_of_string.cpp
@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+void test_array_of_string()  // checks that Array works with std::string as its scalar type
+{
+  typedef Array<std::string,1,Dynamic> ArrayXs;  // row array of strings with a run-time length
+  ArrayXs a1(3), a2(3), a3(3), a3ref(3);
+  a1 << "one", "two", "three";
+  a2 << "1", "2", "3";
+  a3ref << "one (1)", "two (2)", "three (3)";  // expected result of the concatenations below
+  std::stringstream s1;
+  s1 << a1;
+  VERIFY_IS_EQUAL(s1.str(), std::string("  one    two  three"));  // pins the exact output of operator<< for a1
+  a3 = a1 + std::string(" (") + a2 + std::string(")");  // mixes array + scalar and array + array string concatenation
+  VERIFY((a3==a3ref).all());
+
+  a3 = a1;
+  a3 += std::string(" (") + a2 + std::string(")");  // same result built through the compound-assignment form
+  VERIFY((a3==a3ref).all());
+
+  a1.swap(a3);  // afterwards a1 holds the concatenated strings and a3 the original words
+  VERIFY((a1==a3ref).all());
+  VERIFY((a3!=a3ref).all());
+}
diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp
index a5c0d37f9..c9d9f90c3 100644
--- a/test/array_reverse.cpp
+++ b/test/array_reverse.cpp
@@ -117,13 +117,11 @@ template<typename MatrixType> void reverse(const MatrixType& m)
   m2.colwise().reverseInPlace();
   VERIFY_IS_APPROX(m2,m1.colwise().reverse().eval());
 
-  /*
   m1.colwise().reverse()(r, c) = x;
   VERIFY_IS_APPROX(x, m1(rows - 1 - r, c));
 
   m1.rowwise().reverse()(r, c) = x;
   VERIFY_IS_APPROX(x, m1(r, cols - 1 - c));
-  */
 }
 
 void test_array_reverse()
diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp
new file mode 100644
index 000000000..e06e9bdaf
--- /dev/null
+++ b/test/boostmultiprec.cpp
@@ -0,0 +1,201 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+
+#ifdef EIGEN_TEST_MAX_SIZE
+#undef EIGEN_TEST_MAX_SIZE
+#endif
+
+#define EIGEN_TEST_MAX_SIZE 50
+
+#ifdef EIGEN_TEST_PART_1
+#include "cholesky.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+#include "lu.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_3
+#include "qr.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_4
+#include "qr_colpivoting.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_5
+#include "qr_fullpivoting.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_6
+#include "eigensolver_selfadjoint.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_7
+#include "eigensolver_generic.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_8
+#include "eigensolver_generalized_real.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_9
+#include "jacobisvd.cpp"
+#endif
+
+#ifdef EIGEN_TEST_PART_10
+#include "bdcsvd.cpp"
+#endif
+
+#include <Eigen/Dense>
+
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+
+#include <boost/multiprecision/cpp_dec_float.hpp>
+#include <boost/multiprecision/number.hpp>
+#include <boost/math/special_functions.hpp>
+#include <boost/math/complex.hpp>
+
+namespace mp = boost::multiprecision;
+typedef mp::number<mp::cpp_dec_float<100>, mp::et_on> Real;
+
+namespace Eigen {
+  template<> struct NumTraits<Real> : GenericNumTraits<Real> {
+    static inline Real dummy_precision() { return 1e-50; }
+  };
+
+  template<typename T1,typename T2,typename T3,typename T4,typename T5>
+  struct NumTraits<boost::multiprecision::detail::expression<T1,T2,T3,T4,T5> > : NumTraits<Real> {};
+
+  template<>
+  Real test_precision<Real>() { return 1e-50; }
+
+  // needed in C++03 mode where number does not support explicit cast.
+  namespace internal {
+    template<typename NewType>
+    struct cast_impl<Real,NewType> {
+      static inline NewType run(const Real& x) {
+        return x.template convert_to<NewType>();
+      }
+    };
+
+    template<>
+    struct cast_impl<Real,std::complex<Real> > {
+      static inline std::complex<Real>  run(const Real& x) {
+        return std::complex<Real>(x);
+      }
+    };
+  }
+}
+
+namespace boost {
+namespace multiprecision {
+  // to make ADL work as expected:
+  using boost::math::isfinite;
+  using boost::math::isnan;
+  using boost::math::isinf;
+  using boost::math::copysign;
+  using boost::math::hypot;
+
+  // The following is needed for std::complex<Real>:
+  Real fabs(const Real& a) { return abs EIGEN_NOT_A_MACRO (a); }
+  Real fmax(const Real& a, const Real& b) { using std::max; return max(a,b); }
+
+  // some specialization for the unit tests:
+  inline bool test_isMuchSmallerThan(const Real& a, const Real& b) {
+    return internal::isMuchSmallerThan(a, b, test_precision<Real>());
+  }
+
+  inline bool test_isApprox(const Real& a, const Real& b) {
+    return internal::isApprox(a, b, test_precision<Real>());
+  }
+
+  inline bool test_isApproxOrLessThan(const Real& a, const Real& b) {
+    return internal::isApproxOrLessThan(a, b, test_precision<Real>());
+  }
+
+  Real get_test_precision(const Real&) {
+    return test_precision<Real>();
+  }
+
+  Real test_relative_error(const Real &a, const Real &b) {
+    using Eigen::numext::abs2;
+    return sqrt(abs2<Real>(a-b)/Eigen::numext::mini<Real>(abs2(a),abs2(b)));
+  }
+}
+}
+
+namespace Eigen {
+
+}
+
+void test_boostmultiprec()
+{
+  typedef Matrix<Real,Dynamic,Dynamic> Mat;
+  typedef Matrix<std::complex<Real>,Dynamic,Dynamic> MatC;
+
+  std::cout << "NumTraits<Real>::epsilon()         = " << NumTraits<Real>::epsilon() << std::endl;
+  std::cout << "NumTraits<Real>::dummy_precision() = " << NumTraits<Real>::dummy_precision() << std::endl;
+  std::cout << "NumTraits<Real>::lowest()          = " << NumTraits<Real>::lowest() << std::endl;
+  std::cout << "NumTraits<Real>::highest()         = " << NumTraits<Real>::highest() << std::endl;
+  std::cout << "NumTraits<Real>::digits10()        = " << NumTraits<Real>::digits10() << std::endl;
+
+  // check stream output
+  {
+    Mat A(10,10);
+    A.setRandom();
+    std::stringstream ss;
+    ss << A;
+  }
+  {
+    MatC A(10,10);
+    A.setRandom();
+    std::stringstream ss;
+    ss << A;
+  }
+
+  for(int i = 0; i < g_repeat; i++) {
+    int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+
+    CALL_SUBTEST_1( cholesky(Mat(s,s)) );
+
+    CALL_SUBTEST_2( lu_non_invertible<Mat>() );
+    CALL_SUBTEST_2( lu_invertible<Mat>() );
+    CALL_SUBTEST_2( lu_non_invertible<MatC>() );
+    CALL_SUBTEST_2( lu_invertible<MatC>() );
+
+    CALL_SUBTEST_3( qr(Mat(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_3( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_4( qr<Mat>() );
+    CALL_SUBTEST_4( cod<Mat>() );
+    CALL_SUBTEST_4( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_5( qr<Mat>() );
+    CALL_SUBTEST_5( qr_invertible<Mat>() );
+
+    CALL_SUBTEST_6( selfadjointeigensolver(Mat(s,s)) );
+
+    CALL_SUBTEST_7( eigensolver(Mat(s,s)) );
+
+    CALL_SUBTEST_8( generalized_eigensolver_real(Mat(s,s)) );
+
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+  }
+
+  CALL_SUBTEST_9(( jacobisvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+  CALL_SUBTEST_10(( bdcsvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+}
+
diff --git a/test/bug1213.cpp b/test/bug1213.cpp
new file mode 100644
index 000000000..581760c1a
--- /dev/null
+++ b/test/bug1213.cpp
@@ -0,0 +1,13 @@
+
+// This anonymous enum is essential to trigger the linking issue
+enum {
+  Foo
+};
+
+#include "bug1213.h"
+
+bool bug1213_1(const Eigen::Vector3f& x)
+{
+  return bug1213_2(x);
+}
+
diff --git a/test/bug1213.h b/test/bug1213.h
new file mode 100644
index 000000000..040e5a470
--- /dev/null
+++ b/test/bug1213.h
@@ -0,0 +1,8 @@
+
+#include <Eigen/Core>
+
+template<typename T, int dim>
+bool bug1213_2(const Eigen::Matrix<T,dim,1>& x);
+
+bool bug1213_1(const Eigen::Vector3f& x);
+
diff --git a/test/bug1213_main.cpp b/test/bug1213_main.cpp
new file mode 100644
index 000000000..4802c0003
--- /dev/null
+++ b/test/bug1213_main.cpp
@@ -0,0 +1,18 @@
+
+// This is a regression unit test for a weird linking issue with gcc.
+
+#include "bug1213.h"
+
+int main()
+{
+  return 0;
+}
+
+
+template<typename T, int dim>
+bool bug1213_2(const Eigen::Matrix<T,dim,1>& )
+{
+  return true;
+}
+
+template bool bug1213_2<float,3>(const Eigen::Vector3f&);
diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index b7abc230b..8ad5ac639 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -154,6 +154,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     SquareMatrixType symmLo = symm.template triangularView<Lower>();
 
     LDLT<SquareMatrixType,Lower> ldltlo(symmLo);
+    VERIFY(ldltlo.info()==Success);
     VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
     vecX = ldltlo.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
@@ -170,6 +171,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
 
 
     LDLT<SquareMatrixType,Upper> ldltup(symmUp);
+    VERIFY(ldltup.info()==Success);
     VERIFY_IS_APPROX(symm, ldltup.reconstructedMatrix());
     vecX = ldltup.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
@@ -243,11 +245,13 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     // check matrices with a wide spectrum
     if(rows>=3)
     {
+      using std::pow;
+      using std::sqrt;
       RealScalar s = (std::min)(16,std::numeric_limits<RealScalar>::max_exponent10/8);
       Matrix<Scalar,Dynamic,Dynamic> a = Matrix<Scalar,Dynamic,Dynamic>::Random(rows,rows);
       Matrix<RealScalar,Dynamic,1> d =  Matrix<RealScalar,Dynamic,1>::Random(rows);
       for(Index k=0; k<rows; ++k)
-        d(k) = d(k)*std::pow(RealScalar(10),internal::random<RealScalar>(-s,s));
+        d(k) = d(k)*pow(RealScalar(10),internal::random<RealScalar>(-s,s));
       SquareMatrixType A = a * d.asDiagonal() * a.adjoint();
       // Make sure a solution exists:
       vecX.setRandom();
@@ -263,7 +267,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
       }
       else
       {
-        RealScalar large_tol =  std::sqrt(test_precision<RealScalar>());
+        RealScalar large_tol =  sqrt(test_precision<RealScalar>());
         VERIFY((A * vecX).isApprox(vecB, large_tol));
 
         ++g_test_level;
@@ -329,6 +333,7 @@ template<typename MatrixType> void cholesky_cplx(const MatrixType& m)
     RealMatrixType symmLo = symm.template triangularView<Lower>();
 
     LDLT<RealMatrixType,Lower> ldltlo(symmLo);
+    VERIFY(ldltlo.info()==Success);
     VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
     vecX = ldltlo.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
@@ -365,35 +370,90 @@ template<typename MatrixType> void cholesky_definiteness(const MatrixType& m)
   {
     mat << 1, 0, 0, -1;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
   }
   {
     mat << 1, 2, 2, 1;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
   }
   {
     mat << 0, 0, 0, 0;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(ldlt.isNegative());
     VERIFY(ldlt.isPositive());
   }
   {
     mat << 0, 0, 0, 1;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(ldlt.isPositive());
   }
   {
     mat << -1, 0, 0, 0;
     ldlt.compute(mat);
+    VERIFY(ldlt.info()==Success);
     VERIFY(ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
   }
 }
 
+template<typename> // unnamed template parameter: the body works on MatrixXd only
+void cholesky_faillure_cases() // each case below expects LDLT to report NumericalIssue and not to reconstruct its input
+{
+  MatrixXd mat;
+  LDLT<MatrixXd> ldlt; // one solver object, recomputed for every case
+
+  {
+    mat.resize(2,2);
+    mat << 0, 1, 1, 0; // symmetric 2x2 with a zero diagonal
+    ldlt.compute(mat);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+    VERIFY(ldlt.info()==NumericalIssue);
+  }
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2) // NOTE(review): presumably skipped on non-SSE2 i386 because of extended-precision arithmetic - confirm
+  {
+    mat.resize(3,3);
+    mat << -1, -3, 3,
+           -3, -8.9999999999999999999, 1, // this literal has more digits than a double can hold
+            3, 1, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+#endif
+  {
+    mat.resize(3,3);
+    mat <<  1, 2, 3,
+            2, 4, 1, // second row starts as twice the first (2, 4, ...)
+            3, 1, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+
+  {
+    mat.resize(8,8); // larger symmetric case whose last two diagonal entries are zero
+    mat <<  0.1, 0, -0.1, 0, 0, 0, 1, 0,
+            0, 4.24667, 0, 2.00333, 0, 0, 0, 0,
+            -0.1, 0, 0.2, 0, -0.1, 0, 0, 0,
+            0, 2.00333, 0, 8.49333, 0, 2.00333, 0, 0,
+            0, 0, -0.1, 0, 0.1, 0, 0, 1,
+            0, 0, 0, 2.00333, 0, 4.24667, 0, 0,
+            1, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 1, 0, 0, 0;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
+}
+
 template<typename MatrixType> void cholesky_verify_assert()
 {
   MatrixType tmp;
@@ -443,5 +503,7 @@ void test_cholesky()
   CALL_SUBTEST_9( LLT<MatrixXf>(10) );
   CALL_SUBTEST_9( LDLT<MatrixXf>(10) );
 
+  CALL_SUBTEST_2( cholesky_faillure_cases<void>() );
+
   TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries)
 }
diff --git a/test/commainitializer.cpp b/test/commainitializer.cpp
index 99102b966..9844adbd2 100644
--- a/test/commainitializer.cpp
+++ b/test/commainitializer.cpp
@@ -9,6 +9,62 @@
 
 #include "main.h"
 
+
+template<int M1, int M2, int N1, int N2>
+void test_blocks()
+{
+  Matrix<int, M1+M2, N1+N2> m_fixed;
+  MatrixXi m_dynamic(M1+M2, N1+N2);
+
+  Matrix<int, M1, N1> mat11; mat11.setRandom();
+  Matrix<int, M1, N2> mat12; mat12.setRandom();
+  Matrix<int, M2, N1> mat21; mat21.setRandom();
+  Matrix<int, M2, N2> mat22; mat22.setRandom();
+
+  MatrixXi matx11 = mat11, matx12 = mat12, matx21 = mat21, matx22 = mat22;
+
+  {
+    VERIFY_IS_EQUAL((m_fixed << mat11, mat12, mat21, matx22).finished(), (m_dynamic << mat11, matx12, mat21, matx22).finished());
+    VERIFY_IS_EQUAL((m_fixed.template topLeftCorner<M1,N1>()), mat11);
+    VERIFY_IS_EQUAL((m_fixed.template topRightCorner<M1,N2>()), mat12);
+    VERIFY_IS_EQUAL((m_fixed.template bottomLeftCorner<M2,N1>()), mat21);
+    VERIFY_IS_EQUAL((m_fixed.template bottomRightCorner<M2,N2>()), mat22);
+    VERIFY_IS_EQUAL((m_fixed << mat12, mat11, matx21, mat22).finished(), (m_dynamic << mat12, matx11, matx21, mat22).finished());
+  }
+
+  if(N1 > 0)
+  {
+    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat11, mat21, mat22));
+    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat21, mat21, mat22));
+  }
+  else
+  {
+    // allow insertion of zero-column blocks:
+    VERIFY_IS_EQUAL((m_fixed << mat11, mat12, mat11, mat11, mat21, mat21, mat22).finished(), (m_dynamic << mat12, mat22).finished());
+  }
+  if(M1 != M2)
+  {
+    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat21, mat12, mat22));
+  }
+}
+
+
+template<int N>
+struct test_block_recursion // run() calls test_blocks for the sizes encoded in N, then recurses on N-1
+{
+  static void run()
+  {
+    test_blocks<(N>>6)&3, (N>>4)&3, (N>>2)&3, N & 3>(); // four 2-bit fields of N give M1, M2, N1, N2 (each in 0..3)
+    test_block_recursion<N-1>::run(); // recursion stops at the <-1> specialization
+  }
+};
+
+template<>
+struct test_block_recursion<-1>
+{
+  static void run() { }
+};
+
 void test_commainitializer()
 {
   Matrix3d m3;
@@ -43,4 +99,8 @@ void test_commainitializer()
         4, 5, 6,
         vec[2].transpose();
   VERIFY_IS_APPROX(m3, ref);
+
+
+  // FIXME: meant to recursively test all block-sizes from 0 to 3, but this only builds a temporary; ::run() is never called:
+  test_block_recursion<(1<<8) - 1>();
 }
diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu
index b36ed888d..cb2e4167a 100644
--- a/test/cuda_basic.cu
+++ b/test/cuda_basic.cu
@@ -1,4 +1,11 @@
-
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 // workaround issue between gcc >= 4.7 and cuda 5.5
 #if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
@@ -12,10 +19,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 
 #include <math_constants.h>
+#include <cuda.h>
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include "cuda_common.h"
 
-#include <Eigen/Eigenvalues>
+// Check that dense modules can be properly parsed by nvcc
+#include <Eigen/Dense>
 
 // struct Foo{
 //   EIGEN_DEVICE_FUNC
diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp
index 5f587007c..f1cc70bee 100644
--- a/test/dynalloc.cpp
+++ b/test/dynalloc.cpp
@@ -22,7 +22,7 @@ void check_handmade_aligned_malloc()
   for(int i = 1; i < 1000; i++)
   {
     char *p = (char*)internal::handmade_aligned_malloc(i);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
     internal::handmade_aligned_free(p);
@@ -34,7 +34,7 @@ void check_aligned_malloc()
   for(int i = ALIGNMENT; i < 1000; i++)
   {
     char *p = (char*)internal::aligned_malloc(i);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
     internal::aligned_free(p);
@@ -46,7 +46,7 @@ void check_aligned_new()
   for(int i = ALIGNMENT; i < 1000; i++)
   {
     float *p = internal::aligned_new<float>(i);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
     internal::aligned_delete(p,i);
@@ -58,7 +58,7 @@ void check_aligned_stack_alloc()
   for(int i = ALIGNMENT; i < 400; i++)
   {
     ei_declare_aligned_stack_constructed_variable(float,p,i,0);
-    VERIFY(size_t(p)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(p)%ALIGNMENT==0);
     // if the buffer is wrongly allocated this will give a bad write --> check with valgrind
     for(int j = 0; j < i; j++) p[j]=0;
   }
@@ -88,7 +88,7 @@ template<typename T> void check_dynaligned()
   {
     T* obj = new T;
     VERIFY(T::NeedsToAlign==1);
-    VERIFY(size_t(obj)%ALIGNMENT==0);
+    VERIFY(internal::UIntPtr(obj)%ALIGNMENT==0);
     delete obj;
   }
 }
@@ -148,15 +148,15 @@ void test_dynalloc()
   }
 
   {
-    MyStruct foo0;  VERIFY(size_t(foo0.avec.data())%ALIGNMENT==0);
-    MyClassA fooA;  VERIFY(size_t(fooA.avec.data())%ALIGNMENT==0);
+    MyStruct foo0;  VERIFY(internal::UIntPtr(foo0.avec.data())%ALIGNMENT==0);
+    MyClassA fooA;  VERIFY(internal::UIntPtr(fooA.avec.data())%ALIGNMENT==0);
   }
   
   // dynamic allocation, single object
   for (int i=0; i<g_repeat*100; ++i)
   {
-    MyStruct *foo0 = new MyStruct();  VERIFY(size_t(foo0->avec.data())%ALIGNMENT==0);
-    MyClassA *fooA = new MyClassA();  VERIFY(size_t(fooA->avec.data())%ALIGNMENT==0);
+    MyStruct *foo0 = new MyStruct();  VERIFY(internal::UIntPtr(foo0->avec.data())%ALIGNMENT==0);
+    MyClassA *fooA = new MyClassA();  VERIFY(internal::UIntPtr(fooA->avec.data())%ALIGNMENT==0);
     delete foo0;
     delete fooA;
   }
@@ -165,8 +165,8 @@ void test_dynalloc()
   const int N = 10;
   for (int i=0; i<g_repeat*100; ++i)
   {
-    MyStruct *foo0 = new MyStruct[N];  VERIFY(size_t(foo0->avec.data())%ALIGNMENT==0);
-    MyClassA *fooA = new MyClassA[N];  VERIFY(size_t(fooA->avec.data())%ALIGNMENT==0);
+    MyStruct *foo0 = new MyStruct[N];  VERIFY(internal::UIntPtr(foo0->avec.data())%ALIGNMENT==0);
+    MyClassA *fooA = new MyClassA[N];  VERIFY(internal::UIntPtr(fooA->avec.data())%ALIGNMENT==0);
     delete[] foo0;
     delete[] fooA;
   }
diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp
index a46a2e50e..9c0838ba4 100644
--- a/test/eigensolver_generalized_real.cpp
+++ b/test/eigensolver_generalized_real.cpp
@@ -1,15 +1,17 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_RUNTIME_NO_MALLOC
 #include "main.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
+#include <Eigen/LU>
 
 template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
 {
@@ -21,6 +23,7 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   Index cols = m.cols();
 
   typedef typename MatrixType::Scalar Scalar;
+  typedef std::complex<Scalar> ComplexScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
 
   MatrixType a = MatrixType::Random(rows,cols);
@@ -31,14 +34,41 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   MatrixType spdB =  b.adjoint() * b + b1.adjoint() * b1;
 
   // lets compare to GeneralizedSelfAdjointEigenSolver
-  GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
-  GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
+  {
+    GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
+    GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
 
-  VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
+    VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
 
-  VectorType realEigenvalues = eig.eigenvalues().real();
-  std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
-  VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+    VectorType realEigenvalues = eig.eigenvalues().real();
+    std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
+    VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+
+    // check eigenvectors
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType D = eig.eigenvalues().asDiagonal();
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType V = eig.eigenvectors();
+    VERIFY_IS_APPROX(spdA*V, spdB*V*D);
+  }
+
+  // non symmetric case:
+  {
+    GeneralizedEigenSolver<MatrixType> eig(rows);
+    // TODO enable full preallocation of required memory, this probably requires an in-place mode for HessenbergDecomposition
+    //Eigen::internal::set_is_malloc_allowed(false);
+    eig.compute(a,b);
+    //Eigen::internal::set_is_malloc_allowed(true);
+    for(Index k=0; k<cols; ++k)
+    {
+      Matrix<ComplexScalar,Dynamic,Dynamic> tmp = (eig.betas()(k)*a).template cast<ComplexScalar>() - eig.alphas()(k)*b;
+      if(tmp.size()>1 && tmp.norm()>(std::numeric_limits<Scalar>::min)())
+        tmp /= tmp.norm();
+      VERIFY_IS_MUCH_SMALLER_THAN( std::abs(tmp.determinant()), Scalar(1) );
+    }
+    // check eigenvectors
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType D = eig.eigenvalues().asDiagonal();
+    typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType V = eig.eigenvectors();
+    VERIFY_IS_APPROX(a*V, b*V*D);
+  }
 
   // regression test for bug 1098
   {
@@ -57,7 +87,7 @@ void test_eigensolver_generalized_real()
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(s,s)) );
 
-    // some trivial but implementation-wise tricky cases
+    // some trivial but implementation-wise special cases
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(1,1)) );
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) );
     CALL_SUBTEST_3( generalized_eigensolver_real(Matrix<double,1,1>()) );
diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp
index 566546310..e18fbf687 100644
--- a/test/eigensolver_generic.cpp
+++ b/test/eigensolver_generic.cpp
@@ -127,16 +127,29 @@ void test_eigensolver_generic()
   }
   );
   
-  // regression test for bug 793
 #ifdef EIGEN_TEST_PART_2
   {
-     MatrixXd a(3,3);
-     a << 0,  0,  1,
-          1,  1, 1,
-          1, 1e+200,  1;
-     Eigen::EigenSolver<MatrixXd> eig(a);
-     VERIFY_IS_APPROX(a * eig.pseudoEigenvectors(), eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix());
-     VERIFY_IS_APPROX(a * eig.eigenvectors(), eig.eigenvectors() * eig.eigenvalues().asDiagonal());
+    // regression test for bug 793
+    MatrixXd a(3,3);
+    a << 0,  0,  1,
+        1,  1, 1,
+        1, 1e+200,  1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    double scale = 1e-200; // scale to avoid overflow during the comparisons
+    VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
+    VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
+  }
+  {
+    // check a case where all eigenvalues are zero.
+    MatrixXd a(2,2);
+    a << 1,  1,
+        -1, -1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
+    VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
+    VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
   }
 #endif
   
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index f909761a1..4ed126116 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -12,18 +12,29 @@
 #include "svd_fill.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
+#include <Eigen/SparseCore>
 
 
 template<typename MatrixType> void selfadjointeigensolver_essential_check(const MatrixType& m)
 {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  RealScalar eival_eps = (std::min)(test_precision<RealScalar>(),  NumTraits<Scalar>::dummy_precision()*20000);
+  RealScalar eival_eps = numext::mini<RealScalar>(test_precision<RealScalar>(),  NumTraits<Scalar>::dummy_precision()*20000);
   
   SelfAdjointEigenSolver<MatrixType> eiSymm(m);
   VERIFY_IS_EQUAL(eiSymm.info(), Success);
-  VERIFY_IS_APPROX(m.template selfadjointView<Lower>() * eiSymm.eigenvectors(),
-                   eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal());
+
+  RealScalar scaling = m.cwiseAbs().maxCoeff();
+
+  if(scaling<(std::numeric_limits<RealScalar>::min)())
+  {
+    VERIFY(eiSymm.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+  }
+  else
+  {
+    VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiSymm.eigenvectors())/scaling,
+                     (eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal())/scaling);
+  }
   VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
   VERIFY_IS_UNITARY(eiSymm.eigenvectors());
 
@@ -32,7 +43,6 @@ template<typename MatrixType> void selfadjointeigensolver_essential_check(const
     SelfAdjointEigenSolver<MatrixType> eiDirect;
     eiDirect.computeDirect(m);  
     VERIFY_IS_EQUAL(eiDirect.info(), Success);
-    VERIFY_IS_APPROX(eiSymm.eigenvalues(), eiDirect.eigenvalues());
     if(! eiSymm.eigenvalues().isApprox(eiDirect.eigenvalues(), eival_eps) )
     {
       std::cerr << "reference eigenvalues: " << eiSymm.eigenvalues().transpose() << "\n"
@@ -40,10 +50,18 @@ template<typename MatrixType> void selfadjointeigensolver_essential_check(const
                 << "diff:                  " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).transpose() << "\n"
                 << "error (eps):           " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).norm() / eiSymm.eigenvalues().norm() << "  (" << eival_eps << ")\n";
     }
-    VERIFY(eiSymm.eigenvalues().isApprox(eiDirect.eigenvalues(), eival_eps));
-    VERIFY_IS_APPROX(m.template selfadjointView<Lower>() * eiDirect.eigenvectors(),
-                    eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal());
-    VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiDirect.eigenvalues());
+    if(scaling<(std::numeric_limits<RealScalar>::min)())
+    {
+      VERIFY(eiDirect.eigenvalues().cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+    }
+    else
+    {
+      VERIFY_IS_APPROX(eiSymm.eigenvalues()/scaling, eiDirect.eigenvalues()/scaling);
+      VERIFY_IS_APPROX((m.template selfadjointView<Lower>() * eiDirect.eigenvectors())/scaling,
+                       (eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal())/scaling);
+      VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues()/scaling, eiDirect.eigenvalues()/scaling);
+    }
+
     VERIFY_IS_UNITARY(eiDirect.eigenvectors());
   }
 }
@@ -164,6 +182,7 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
   }
 }
 
+template<int>
 void bug_854()
 {
   Matrix3d m;
@@ -173,6 +192,7 @@ void bug_854()
   selfadjointeigensolver_essential_check(m);
 }
 
+template<int>
 void bug_1014()
 {
   Matrix3d m;
@@ -182,6 +202,26 @@ void bug_1014()
   selfadjointeigensolver_essential_check(m);
 }
 
+template<int>
+void bug_1225() // eigenvalues from a selfadjointView<Upper> must match those from the full symmetric matrix
+{
+  Matrix3d m1, m2;
+  m1.setRandom();
+  m1 = m1*m1.transpose(); // make m1 symmetric
+  m2 = m1.triangularView<Upper>(); // m2 keeps only the upper triangle of m1
+  SelfAdjointEigenSolver<Matrix3d> eig1(m1);
+  SelfAdjointEigenSolver<Matrix3d> eig2(m2.selfadjointView<Upper>()); // solver constructed directly from the view
+  VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
+}
+
+template<int>
+void bug_1204() // no VERIFY here: presumably only checks that this compiles and runs - confirm against bug 1204
+{
+  SparseMatrix<double> A(2,2);
+  A.setIdentity();
+  SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A); // solver instantiated on a sparse matrix type
+}
+
 void test_eigensolver_selfadjoint()
 {
   int s = 0;
@@ -210,8 +250,10 @@ void test_eigensolver_selfadjoint()
     CALL_SUBTEST_7( selfadjointeigensolver(Matrix<double,2,2>()) );
   }
   
-  CALL_SUBTEST_13( bug_854() );
-  CALL_SUBTEST_13( bug_1014() );
+  CALL_SUBTEST_13( bug_854<0>() );
+  CALL_SUBTEST_13( bug_1014<0>() );
+  CALL_SUBTEST_13( bug_1204<0>() );
+  CALL_SUBTEST_13( bug_1225<0>() );
 
   // Test problem size constructors
   s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
diff --git a/test/evaluators.cpp b/test/evaluators.cpp
index 876dffe22..aed5a05a7 100644
--- a/test/evaluators.cpp
+++ b/test/evaluators.cpp
@@ -21,7 +21,7 @@ namespace Eigen {
   EIGEN_STRONG_INLINE
   DstXprType& copy_using_evaluator(const EigenBase<DstXprType> &dst, const SrcXprType &src)
   {
-    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
     return dst.const_cast_derived();
   }
   
@@ -29,7 +29,7 @@ namespace Eigen {
   EIGEN_STRONG_INLINE
   const DstXprType& copy_using_evaluator(const NoAlias<DstXprType, StorageBase>& dst, const SrcXprType &src)
   {
-    call_assignment(dst, src.derived(), internal::assign_op<typename DstXprType::Scalar>());
+    call_assignment(dst, src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
     return dst.expression();
   }
   
@@ -45,7 +45,7 @@ namespace Eigen {
     dst.const_cast_derived().resizeLike(src.derived());
   #endif
     
-    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar>());
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
     return dst.const_cast_derived();
   }
 
@@ -53,28 +53,28 @@ namespace Eigen {
   void add_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
   {
     typedef typename DstXprType::Scalar Scalar;
-    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::add_assign_op<Scalar>());
+    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::add_assign_op<Scalar,typename SrcXprType::Scalar>());
   }
 
   template<typename DstXprType, typename SrcXprType>
   void subtract_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
   {
     typedef typename DstXprType::Scalar Scalar;
-    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::sub_assign_op<Scalar>());
+    call_assignment(const_cast<DstXprType&>(dst), src.derived(), internal::sub_assign_op<Scalar,typename SrcXprType::Scalar>());
   }
 
   template<typename DstXprType, typename SrcXprType>
   void multiply_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
   {
     typedef typename DstXprType::Scalar Scalar;
-    call_assignment(dst.const_cast_derived(), src.derived(), internal::mul_assign_op<Scalar>());
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::mul_assign_op<Scalar,typename SrcXprType::Scalar>());
   }
 
   template<typename DstXprType, typename SrcXprType>
   void divide_assign_using_evaluator(const DstXprType& dst, const SrcXprType& src)
   {
     typedef typename DstXprType::Scalar Scalar;
-    call_assignment(dst.const_cast_derived(), src.derived(), internal::div_assign_op<Scalar>());
+    call_assignment(dst.const_cast_derived(), src.derived(), internal::div_assign_op<Scalar,typename SrcXprType::Scalar>());
   }
   
   template<typename DstXprType, typename SrcXprType>
diff --git a/test/fastmath.cpp b/test/fastmath.cpp
index efdd5b313..cc5db0746 100644
--- a/test/fastmath.cpp
+++ b/test/fastmath.cpp
@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
     VERIFY( !m.allFinite() );
     VERIFY(  m.hasNaN() );
   }
-  m(4) /= 0.0;
+  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
+  m(4) /= hidden_zero;
   if(dryrun)
   {
     std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp
index bf22f6b97..ae2d4bc42 100644
--- a/test/first_aligned.cpp
+++ b/test/first_aligned.cpp
@@ -41,7 +41,7 @@ void test_first_aligned()
   test_first_aligned_helper(array_double+1, 50);
   test_first_aligned_helper(array_double+2, 50);
   
-  double *array_double_plus_4_bytes = (double*)(size_t(array_double)+4);
+  double *array_double_plus_4_bytes = (double*)(internal::UIntPtr(array_double)+4);
   test_none_aligned_helper(array_double_plus_4_bytes, 50);
   test_none_aligned_helper(array_double_plus_4_bytes+1, 50);
   
diff --git a/test/geo_alignedbox.cpp b/test/geo_alignedbox.cpp
index 2bdb4b7f2..d2339a651 100644
--- a/test/geo_alignedbox.cpp
+++ b/test/geo_alignedbox.cpp
@@ -48,6 +48,8 @@ template<typename BoxType> void alignedbox(const BoxType& _box)
   b0.extend(p0);
   b0.extend(p1);
   VERIFY(b0.contains(p0*s1+(Scalar(1)-s1)*p1));
+  VERIFY(b0.contains(b0.center()));
+  VERIFY_IS_APPROX(b0.center(),(p0+p1)/Scalar(2));
 
   (b2 = b0).extend(b1);
   VERIFY(b2.contains(b0));
diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp
index bf63c69ec..2187c7bf9 100644
--- a/test/geo_homogeneous.cpp
+++ b/test/geo_homogeneous.cpp
@@ -58,6 +58,8 @@ template<typename Scalar,int Size> void homogeneous(void)
   T2MatrixType t2 = T2MatrixType::Random();
   VERIFY_IS_APPROX(t2 * (v0.homogeneous().eval()), t2 * v0.homogeneous());
   VERIFY_IS_APPROX(t2 * (m0.colwise().homogeneous().eval()), t2 * m0.colwise().homogeneous());
+  VERIFY_IS_APPROX(t2 * (v0.homogeneous().asDiagonal()), t2 * hv0.asDiagonal());
+  VERIFY_IS_APPROX((v0.homogeneous().asDiagonal()) * t2, hv0.asDiagonal() * t2);
 
   VERIFY_IS_APPROX((v0.transpose().rowwise().homogeneous().eval()) * t2,
                     v0.transpose().rowwise().homogeneous() * t2);
@@ -109,6 +111,8 @@ template<typename Scalar,int Size> void homogeneous(void)
   
   VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
   VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );
+
+  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
 }
 
 void test_geo_homogeneous()
diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp
index c1cc691c9..e77702bc7 100644
--- a/test/geo_hyperplane.cpp
+++ b/test/geo_hyperplane.cpp
@@ -97,9 +97,9 @@ template<typename Scalar> void lines()
     Vector u = Vector::Random();
     Vector v = Vector::Random();
     Scalar a = internal::random<Scalar>();
-    while (abs(a-1) < 1e-4) a = internal::random<Scalar>();
-    while (u.norm() < 1e-4) u = Vector::Random();
-    while (v.norm() < 1e-4) v = Vector::Random();
+    while (abs(a-1) < Scalar(1e-4)) a = internal::random<Scalar>();
+    while (u.norm() < Scalar(1e-4)) u = Vector::Random();
+    while (v.norm() < Scalar(1e-4)) v = Vector::Random();
 
     HLine line_u = HLine::Through(center + u, center + a*u);
     HLine line_v = HLine::Through(center + v, center + a*v);
@@ -111,14 +111,14 @@ template<typename Scalar> void lines()
     Vector result = line_u.intersection(line_v);
 
     // the lines should intersect at the point we called "center"
-    if(abs(a-1) > 1e-2 && abs(v.normalized().dot(u.normalized()))<0.9)
+    if(abs(a-1) > Scalar(1e-2) && abs(v.normalized().dot(u.normalized()))<Scalar(0.9))
       VERIFY_IS_APPROX(result, center);
 
     // check conversions between two types of lines
     PLine pl(line_u); // gcc 3.3 will commit suicide if we don't name this variable
     HLine line_u2(pl);
     CoeffsType converted_coeffs = line_u2.coeffs();
-    if(line_u2.normal().dot(line_u.normal())<0.)
+    if(line_u2.normal().dot(line_u.normal())<Scalar(0))
       converted_coeffs = -line_u2.coeffs();
     VERIFY(line_u.coeffs().isApprox(converted_coeffs));
   }
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index 761bb52b4..96889e722 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -30,8 +30,8 @@ template<typename QuatType> void check_slerp(const QuatType& q0, const QuatType&
   Scalar largeEps = test_precision<Scalar>();
 
   Scalar theta_tot = AA(q1*q0.inverse()).angle();
-  if(theta_tot>EIGEN_PI)
-    theta_tot = Scalar(2.*EIGEN_PI)-theta_tot;
+  if(theta_tot>Scalar(EIGEN_PI))
+    theta_tot = Scalar(2.)*Scalar(EIGEN_PI)-theta_tot;
   for(Scalar t=0; t<=Scalar(1.001); t+=Scalar(0.1))
   {
     QuatType q = q0.slerp(t,q1);
@@ -50,13 +50,12 @@ template<typename Scalar, int Options> void quaternion(void)
   using std::abs;
   typedef Matrix<Scalar,3,1> Vector3;
   typedef Matrix<Scalar,3,3> Matrix3;
-  typedef Matrix<Scalar,4,1> Vector4;
   typedef Quaternion<Scalar,Options> Quaternionx;
   typedef AngleAxis<Scalar> AngleAxisx;
 
   Scalar largeEps = test_precision<Scalar>();
   if (internal::is_same<Scalar,float>::value)
-    largeEps = 1e-3f;
+    largeEps = Scalar(1e-3);
 
   Scalar eps = internal::random<Scalar>() * Scalar(1e-2);
 
@@ -115,8 +114,8 @@ template<typename Scalar, int Options> void quaternion(void)
   // Do not execute the test if the rotation angle is almost zero, or
   // the rotation axis and v1 are almost parallel.
   if (abs(aa.angle()) > 5*test_precision<Scalar>()
-      && (aa.axis() - v1.normalized()).norm() < 1.99
-      && (aa.axis() + v1.normalized()).norm() < 1.99) 
+      && (aa.axis() - v1.normalized()).norm() < Scalar(1.99)
+      && (aa.axis() + v1.normalized()).norm() < Scalar(1.99))
   {
     VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1);
   }
@@ -157,8 +156,8 @@ template<typename Scalar, int Options> void quaternion(void)
   Quaternionx *q = new Quaternionx;
   delete q;
 
-  q1 = AngleAxisx(a, v0.normalized());
-  q2 = AngleAxisx(b, v1.normalized());
+  q1 = Quaternionx::UnitRandom();
+  q2 = Quaternionx::UnitRandom();
   check_slerp(q1,q2);
 
   q1 = AngleAxisx(b, v1.normalized());
@@ -169,7 +168,7 @@ template<typename Scalar, int Options> void quaternion(void)
   q2 = AngleAxisx(-b, -v1.normalized());
   check_slerp(q1,q2);
 
-  q1.coeffs() = Vector4::Random().normalized();
+  q1 = Quaternionx::UnitRandom();
   q2.coeffs() = -q1.coeffs();
   check_slerp(q1,q2);
 }
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index 51f90036d..278e527c2 100644
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -18,6 +18,11 @@ Matrix<T,2,1> angleToVec(T a)
   return Matrix<T,2,1>(std::cos(a), std::sin(a));
 }
 
+// Routes x(0) through a volatile temporary to work around a bug in clang/llvm code generation.
+template<typename T>
+EIGEN_DONT_INLINE
+void dont_over_optimize(T& x) { volatile typename T::Scalar tmp = x(0); x(0) = tmp; }
+
 template<typename Scalar, int Mode, int Options> void non_projective_only()
 {
     /* this test covers the following files:
@@ -224,12 +229,13 @@ template<typename Scalar, int Mode, int Options> void transformations()
 
   do {
     v3 = Vector3::Random();
+    dont_over_optimize(v3);
   } while (v3.cwiseAbs().minCoeff()<NumTraits<Scalar>::epsilon());
   Translation3 tv3(v3);
   Transform3 t5(tv3);
   t4 = tv3;
   VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
-  t4.translate(-v3);
+  t4.translate((-v3).eval());
   VERIFY_IS_APPROX(t4.matrix(), MatrixType::Identity());
   t4 *= tv3;
   VERIFY_IS_APPROX(t5.matrix(), t4.matrix());
@@ -328,6 +334,9 @@ template<typename Scalar, int Mode, int Options> void transformations()
   t0.scale(v0);
   t1 *= AlignedScaling3(v0);
   VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
+  t1 = t1 * v0.asDiagonal();
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
   // transformation * translation
   t0.translate(v0);
   t1 = t1 * Translation3(v0);
@@ -466,7 +475,7 @@ template<typename Scalar, int Mode, int Options> void transformations()
     Scalar a2 = R0.slerp(Scalar(k+1)/Scalar(path_steps), R1).angle();
     l += std::abs(a2-a1);
   }
-  VERIFY(l<=EIGEN_PI*(Scalar(1)+NumTraits<Scalar>::epsilon()*Scalar(path_steps/2)));
+  VERIFY(l<=Scalar(EIGEN_PI)*(Scalar(1)+NumTraits<Scalar>::epsilon()*Scalar(path_steps/2)));
   
   // check basic features
   {
@@ -476,6 +485,79 @@ template<typename Scalar, int Mode, int Options> void transformations()
     Rotation2D<Scalar> r2(r1);       // copy ctor
     VERIFY_IS_APPROX(r2.angle(),s0);
   }
+
+  {
+    Transform3 t32(Matrix4::Random()), t33, t34;
+    t34 = t33 = t32;
+    t32.scale(v0);
+    t33*=AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+    t33 = t34 * AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+  }
+
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity_left(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( q*(a1*v), (q*a1)*v );
+  VERIFY_IS_APPROX( q*(a2*v), (q*a2)*v );
+  VERIFY_IS_APPROX( q*(p*h).hnormalized(),  ((q*p)*h).hnormalized() );
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity2(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( a1*(q*v), (a1*q)*v );
+  VERIFY_IS_APPROX( a2*(q*v), (a2*q)*v );
+  VERIFY_IS_APPROX( p *(q*v).homogeneous(), (p *q)*v.homogeneous() );
+
+  transform_associativity_left(a1, a2,p, q, v, h);
+}
+
+// Checks associativity of products mixing compact/affine/projective transforms with scalings, a translation, a linear matrix and the rotation R.
+template<typename Scalar, int Dim, int Options,typename RotationType>
+void transform_associativity(const RotationType& R)
+{
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  typedef Matrix<Scalar,Dim+1,1> HVectorType;
+  typedef Matrix<Scalar,Dim,Dim> LinearType;
+  typedef Matrix<Scalar,Dim+1,Dim+1> MatrixType;
+  typedef Transform<Scalar,Dim,AffineCompact,Options> AffineCompactType;
+  typedef Transform<Scalar,Dim,Affine,Options> AffineType;
+  typedef Transform<Scalar,Dim,Projective,Options> ProjectiveType;
+  typedef DiagonalMatrix<Scalar,Dim> ScalingType;
+  typedef Translation<Scalar,Dim> TranslationType;
+
+  AffineCompactType A1c; A1c.matrix().setRandom();
+  AffineCompactType A2c; A2c.matrix().setRandom();
+  AffineType A1(A1c);
+  AffineType A2(A2c);
+  ProjectiveType P1; P1.matrix().setRandom();
+  VectorType v1 = VectorType::Random();
+  VectorType v2 = VectorType::Random();
+  HVectorType h1 = HVectorType::Random();
+  Scalar s1 = internal::random<Scalar>();
+  LinearType L = LinearType::Random();
+  MatrixType M = MatrixType::Random();
+  // The 4th argument is the operand that gets combined with the transforms A1c, A1 and P1:
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2c, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, v1.asDiagonal(), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, ScalingType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(s1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, TranslationType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity_left(A1c, A1, P1, L, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, R, v2, h1) );
+
+  VERIFY_IS_APPROX( A1*(M*h1), (A1*M)*h1 ); // full (Dim+1)x(Dim+1) matrix M on the right of each transform
+  VERIFY_IS_APPROX( A1c*(M*h1), (A1c*M)*h1 );
+  VERIFY_IS_APPROX( P1*(M*h1), (P1*M)*h1 );
+  VERIFY_IS_APPROX( M*(A1*h1), (M*A1)*h1 ); // M on the left of each transform
+  VERIFY_IS_APPROX( M*(A1c*h1), (M*A1c)*h1 );
+  VERIFY_IS_APPROX( M*(P1*h1),  ((M*P1)*h1) );
 }
 
 template<typename Scalar> void transform_alignment()
@@ -556,5 +638,8 @@ void test_geo_transformations()
 
     CALL_SUBTEST_7(( transform_products<double,3,RowMajor|AutoAlign>() ));
     CALL_SUBTEST_7(( transform_products<float,2,AutoAlign>() ));
+
+    CALL_SUBTEST_8(( transform_associativity<double,2,ColMajor>(Rotation2D<double>(internal::random<double>()*double(EIGEN_PI))) ));
+    CALL_SUBTEST_8(( transform_associativity<double,3,ColMajor>(Quaterniond::UnitRandom()) ));
   }
 }
diff --git a/unsupported/test/cxx11_float16.cpp b/test/half_float.cpp
similarity index 72%
rename from unsupported/test/cxx11_float16.cpp
rename to test/half_float.cpp
index 9a813653c..f8d438e2f 100644
--- a/unsupported/test/cxx11_float16.cpp
+++ b/test/half_float.cpp
@@ -5,17 +5,23 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_float16
+#include <sstream>
 
 #include "main.h"
+
 #include <Eigen/src/Core/arch/CUDA/Half.h>
 
+// Make sure it's possible to forward declare Eigen::half
+namespace Eigen {
+struct half;
+}
+
 using Eigen::half;
 
 void test_conversion()
 {
+  using Eigen::half_impl::__half;
+
   // Conversion from float.
   VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
   VERIFY_IS_EQUAL(half(0.5f).x, 0x3800);
@@ -34,8 +40,8 @@ void test_conversion()
   float val1 = float(half(__half(0x3c00)));
   float val2 = float(half(__half(0x3c01)));
   float val3 = float(half(__half(0x3c02)));
-  VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00);
-  VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02);
+  VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
+  VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
 
   // Conversion from int.
   VERIFY_IS_EQUAL(half(-1).x, 0xbc00);
@@ -88,6 +94,16 @@ void test_conversion()
 #endif
 }
 
+// Prints a few NumTraits<half> constants; nothing is asserted, the values are only streamed to std::cout.
+void test_numtraits()
+{
+  std::cout << "epsilon  = " << NumTraits<half>::epsilon() << std::endl;
+  std::cout << "highest  = " << NumTraits<half>::highest() << std::endl;
+  std::cout << "lowest   = " << NumTraits<half>::lowest() << std::endl;
+  std::cout << "infinity = " << NumTraits<half>::infinity() << std::endl;
+  std::cout << "nan      = " << NumTraits<half>::quiet_NaN() << std::endl;
+}
+
 void test_arithmetic()
 {
   VERIFY_IS_EQUAL(float(half(2) + half(2)), 4);
@@ -140,53 +156,97 @@ void test_comparison()
 void test_basic_functions()
 {
   VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(abs(half(3.5f))), 3.5f);
   VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(float(abs(half(-3.5f))), 3.5f);
 
   VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(float(floor(half(3.5f))), 3.0f);
   VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f);
+  VERIFY_IS_EQUAL(float(floor(half(-3.5f))), -4.0f);
 
   VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(float(ceil(half(3.5f))), 4.0f);
   VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f);
+  VERIFY_IS_EQUAL(float(ceil(half(-3.5f))), -3.0f);
 
   VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(sqrt(half(0.0f))), 0.0f);
   VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f);
+  VERIFY_IS_APPROX(float(sqrt(half(4.0f))), 2.0f);
 
   VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(pow(half(0.0f), half(1.0f))), 0.0f);
   VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f);
+  VERIFY_IS_APPROX(float(pow(half(2.0f), half(2.0f))), 4.0f);
 
   VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f);
-  VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI));
+  VERIFY_IS_EQUAL(float(exp(half(0.0f))), 1.0f);
+  VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
+  VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
 
   VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(log(half(1.0f))), 0.0f);
   VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f);
+  VERIFY_IS_APPROX(float(log(half(10.0f))), 2.30273f);
+
+  VERIFY_IS_EQUAL(float(numext::log1p(half(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(log1p(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::log1p(half(10.0f))), 2.3978953f);
+  VERIFY_IS_APPROX(float(log1p(half(10.0f))), 2.3978953f);
 }
 
 void test_trigonometric_functions()
 {
   VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f)));
+  VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f)));
   VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI)));
   //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
   //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
   VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f)));
 
   VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f)));
+  VERIFY_IS_APPROX(sin(half(0.0f)), half(sinf(0.0f)));
   //  VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI)));
   VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2)));
   VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2)));
   VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
 
   VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
+  VERIFY_IS_APPROX(tan(half(0.0f)), half(tanf(0.0f)));
   //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
   //  VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
   //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
   VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
 }
 
-void test_cxx11_float16()
+// Coefficient-wise sanity checks on a dynamic-size row array of half, followed by streaming it to a stringstream.
+void test_array()
+{
+  typedef Array<half,1,Dynamic> ArrayXh;
+  Index size = internal::random<Index>(1,10);
+  Index i = internal::random<Index>(0,size-1);
+  ArrayXh a1 = ArrayXh::Random(size), a2 = ArrayXh::Random(size);
+  VERIFY_IS_APPROX( a1+a1, half(2)*a1 );
+  VERIFY( (a1.abs() >= half(0)).all() );
+  VERIFY_IS_APPROX( (a1*a1).sqrt(), a1.abs() );
+  VERIFY( ((a1.min)(a2) <= (a1.max)(a2)).all() );
+  a1(i) = half(-10.); // plant a known extreme value at a random position
+  VERIFY_IS_EQUAL( a1.minCoeff(), half(-10.) );
+  a1(i) = half(10.);
+  VERIFY_IS_EQUAL( a1.maxCoeff(), half(10.) );
+
+  std::stringstream ss;
+  ss << a1; // the streamed text is not inspected
+}
+
+void test_half_float()
 {
   CALL_SUBTEST(test_conversion());
+  CALL_SUBTEST(test_numtraits());
   CALL_SUBTEST(test_arithmetic());
   CALL_SUBTEST(test_comparison());
   CALL_SUBTEST(test_basic_functions());
   CALL_SUBTEST(test_trigonometric_functions());
+  CALL_SUBTEST(test_array());
 }
diff --git a/test/inplace_decomposition.cpp b/test/inplace_decomposition.cpp
new file mode 100644
index 000000000..92d0d91b6
--- /dev/null
+++ b/test/inplace_decomposition.cpp
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/LU>
+#include <Eigen/Cholesky>
+#include <Eigen/QR>
+
+// This file tests in-place decomposition through Ref<>, as supported by the Cholesky, LU, and QR decompositions.
+
+// Builds a random system (A,b), constructs DecType from A, and uses dec.solve(b) to check when the factorization is (in)valid.
+template<typename DecType,typename MatrixType> void inplace(bool square = false, bool SPD = false)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RhsType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ResType;
+
+  Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random<Index>(2,EIGEN_TEST_MAX_SIZE/2) : Index(MatrixType::RowsAtCompileTime);
+  Index cols = MatrixType::ColsAtCompileTime==Dynamic ? (square?rows:internal::random<Index>(2,rows))    : Index(MatrixType::ColsAtCompileTime);
+
+  MatrixType A = MatrixType::Random(rows,cols);
+  RhsType b = RhsType::Random(rows);
+  ResType x(cols);
+
+  if(SPD) // overwrite the top cols rows of A with adjoint(block)*block, then add 1e-3 to the diagonal
+  {
+    assert(square);
+    A.topRows(cols) = A.topRows(cols).adjoint() * A.topRows(cols);
+    A.diagonal().array() += 1e-3;
+  }
+
+  MatrixType A0 = A; // untouched copy of the input, used as the reference in every check below
+  MatrixType A1 = A; // second copy, handed to compute() at the end
+
+  DecType dec(A);
+
+  // Check that the content of A has been modified
+  VERIFY_IS_NOT_APPROX( A, A0 );
+
+  // Check that the decomposition is correct (through the normal equations when A is not square):
+  if(rows==cols)
+  {
+    VERIFY_IS_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+
+  // Check that modifying A breaks the current dec:
+  A.setRandom();
+  if(rows==cols)
+  {
+    VERIFY_IS_NOT_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_NOT_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+
+  // Check that calling compute(A1) does not modify A1:
+  A = A0;
+  dec.compute(A1);
+  VERIFY_IS_EQUAL(A0,A1);
+  VERIFY_IS_NOT_APPROX( A, A0 ); // A was reset to A0 just above, so it must have been overwritten again
+  if(rows==cols)
+  {
+    VERIFY_IS_APPROX( A0 * (x = dec.solve(b)), b );
+  }
+  else
+  {
+    VERIFY_IS_APPROX( A0.transpose() * A0 * (x = dec.solve(b)), A0.transpose() * b );
+  }
+}
+
+void test_inplace_decomposition()
+{
+  EIGEN_UNUSED typedef Matrix<double,4,3> Matrix43d;
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( inplace<LLT<Ref<MatrixXd> >, MatrixXd>(true,true) ));
+    CALL_SUBTEST_1(( inplace<LLT<Ref<Matrix4d> >, Matrix4d>(true,true) ));
+
+    CALL_SUBTEST_2(( inplace<LDLT<Ref<MatrixXd> >, MatrixXd>(true,true) ));
+    CALL_SUBTEST_2(( inplace<LDLT<Ref<Matrix4d> >, Matrix4d>(true,true) ));
+
+    CALL_SUBTEST_3(( inplace<PartialPivLU<Ref<MatrixXd> >, MatrixXd>(true,false) ));
+    CALL_SUBTEST_3(( inplace<PartialPivLU<Ref<Matrix4d> >, Matrix4d>(true,false) ));
+
+    CALL_SUBTEST_4(( inplace<FullPivLU<Ref<MatrixXd> >, MatrixXd>(true,false) ));
+    CALL_SUBTEST_4(( inplace<FullPivLU<Ref<Matrix4d> >, Matrix4d>(true,false) ));
+
+    CALL_SUBTEST_5(( inplace<HouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_5(( inplace<HouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_6(( inplace<ColPivHouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_6(( inplace<ColPivHouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_7(( inplace<FullPivHouseholderQR<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_7(( inplace<FullPivHouseholderQR<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+
+    CALL_SUBTEST_8(( inplace<CompleteOrthogonalDecomposition<Ref<MatrixXd> >, MatrixXd>(false,false) ));
+    CALL_SUBTEST_8(( inplace<CompleteOrthogonalDecomposition<Ref<Matrix43d> >, Matrix43d>(false,false) ));
+  }
+}
diff --git a/test/integer_types.cpp b/test/integer_types.cpp
index 950f8e9be..a21f73a81 100644
--- a/test/integer_types.cpp
+++ b/test/integer_types.cpp
@@ -158,4 +158,12 @@ void test_integer_types()
 
     CALL_SUBTEST_8( integer_type_tests(Matrix<unsigned long long, Dynamic, 5>(1, 5)) );
   }
+#ifdef EIGEN_TEST_PART_9
+  VERIFY_IS_EQUAL(internal::scalar_div_cost<int>::value, 8);
+  VERIFY_IS_EQUAL(internal::scalar_div_cost<unsigned int>::value, 8);
+  if(sizeof(long)>sizeof(int)) {
+    VERIFY(internal::scalar_div_cost<long>::value > internal::scalar_div_cost<int>::value);
+    VERIFY(internal::scalar_div_cost<unsigned long>::value > internal::scalar_div_cost<int>::value);
+  }
+#endif
 }
diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp
index 6d7904bac..2c7838ce9 100644
--- a/test/is_same_dense.cpp
+++ b/test/is_same_dense.cpp
@@ -9,6 +9,8 @@
 
 #include "main.h"
 
+using internal::is_same_dense;
+
 void test_is_same_dense()
 {
   typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp
index 292f33969..17474af10 100644
--- a/test/linearstructure.cpp
+++ b/test/linearstructure.cpp
@@ -9,7 +9,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 static bool g_called;
-#define EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN { g_called = true; }
+#define EIGEN_SCALAR_BINARY_OP_PLUGIN { g_called |= (!internal::is_same<LhsScalar,RhsScalar>::value); }
 
 #include "main.h"
 
@@ -21,6 +21,7 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
   */
   typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
 
   Index rows = m.rows();
   Index cols = m.cols();
@@ -32,7 +33,7 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
              m3(rows, cols);
 
   Scalar s1 = internal::random<Scalar>();
-  while (abs(s1)<1e-3) s1 = internal::random<Scalar>();
+  while (abs(s1)<RealScalar(1e-3)) s1 = internal::random<Scalar>();
 
   Index r = internal::random<Index>(0, rows-1),
         c = internal::random<Index>(0, cols-1);
@@ -92,6 +93,22 @@ template<typename MatrixType> void real_complex(DenseIndex rows = MatrixType::Ro
   g_called = false;
   VERIFY_IS_APPROX(m1/s, m1/Scalar(s));
   VERIFY(g_called && "matrix<complex> / real not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(s+m1.array(), Scalar(s)+m1.array());
+  VERIFY(g_called && "real + matrix<complex> not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(m1.array()+s, m1.array()+Scalar(s));
+  VERIFY(g_called && "matrix<complex> + real not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(s-m1.array(), Scalar(s)-m1.array());
+  VERIFY(g_called && "real - matrix<complex> not properly optimized");
+
+  g_called = false;
+  VERIFY_IS_APPROX(m1.array()-s, m1.array()-Scalar(s));
+  VERIFY(g_called && "matrix<complex> - real not properly optimized");
 }
 
 void test_linearstructure()
diff --git a/test/main.h b/test/main.h
index b0e3b7818..74ff96a23 100644
--- a/test/main.h
+++ b/test/main.h
@@ -279,8 +279,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
 #define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b))
 
 
-#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b))
-#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b))
+#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b, true))
+#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(test_is_equal(a, b, false))
 #define VERIFY_IS_APPROX(a, b) VERIFY(verifyIsApprox(a, b))
 #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b))
 #define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b))
@@ -302,7 +302,7 @@ namespace Eigen {
 template<typename T> inline typename NumTraits<T>::Real test_precision() { return NumTraits<T>::dummy_precision(); }
 template<> inline float test_precision<float>() { return 1e-3f; }
 template<> inline double test_precision<double>() { return 1e-6; }
-template<> inline long double test_precision<long double>() { return 1e-6; }
+template<> inline long double test_precision<long double>() { return 1e-6l; }
 template<> inline float test_precision<std::complex<float> >() { return test_precision<float>(); }
 template<> inline double test_precision<std::complex<double> >() { return test_precision<double>(); }
 template<> inline long double test_precision<std::complex<long double> >() { return test_precision<long double>(); }
@@ -452,20 +452,20 @@ T test_relative_error(const AngleAxis<T> &a, const AngleAxis<T> &b)
 }
 
 template<typename Type1, typename Type2>
-inline bool test_isApprox(const Type1& a, const Type2& b)
+inline bool test_isApprox(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
 {
   return a.isApprox(b, test_precision<typename Type1::Scalar>());
 }
 
 // get_test_precision is a small wrapper to test_precision allowing to return the scalar precision for either scalars or expressions
 template<typename T>
-typename NumTraits<typename T::Scalar>::Real get_test_precision(const typename T::Scalar* = 0)
+typename NumTraits<typename T::Scalar>::Real get_test_precision(const T&, const typename T::Scalar* = 0)
 {
   return test_precision<typename NumTraits<typename T::Scalar>::Real>();
 }
 
 template<typename T>
-typename NumTraits<T>::Real get_test_precision(typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T>::Real>::value, T>::type* = 0)
+typename NumTraits<T>::Real get_test_precision(const T&,typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T>::Real>::value, T>::type* = 0)
 {
   return test_precision<typename NumTraits<T>::Real>();
 }
@@ -477,7 +477,7 @@ inline bool verifyIsApprox(const Type1& a, const Type2& b)
   bool ret = test_isApprox(a,b);
   if(!ret)
   {
-    std::cerr << "Difference too large wrt tolerance " << get_test_precision<Type1>()  << ", relative error is: " << test_relative_error(a,b) << std::endl;
+    std::cerr << "Difference too large wrt tolerance " << get_test_precision(a)  << ", relative error is: " << test_relative_error(a,b) << std::endl;
   }
   return ret;
 }
@@ -517,17 +517,17 @@ inline bool test_isUnitary(const MatrixBase<Derived>& m)
 
 // Forward declaration to avoid ICC warning
 template<typename T, typename U>
-bool test_is_equal(const T& actual, const U& expected);
+bool test_is_equal(const T& actual, const U& expected, bool expect_equal=true);
 
 template<typename T, typename U>
-bool test_is_equal(const T& actual, const U& expected)
+bool test_is_equal(const T& actual, const U& expected, bool expect_equal)
 {
-    if (actual==expected)
+    if ((actual==expected) == expect_equal)
         return true;
     // false:
     std::cerr
-        << std::endl << "    actual   = " << actual
-        << std::endl << "    expected = " << expected << std::endl << std::endl;
+        << "\n    actual   = " << actual
+        << "\n    expected " << (expect_equal ? "= " : "!=") << expected << "\n\n";
     return false;
 }
 
@@ -736,3 +736,8 @@ int main(int argc, char *argv[])
   // remark #1572: floating-point equality and inequality comparisons are unreliable
   #pragma warning disable 279 383 1418 1572
 #endif
+
+#ifdef _MSC_VER
+  // 4503 - decorated name length exceeded, name was truncated
+  #pragma warning( disable : 4503)
+#endif
diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
index 88653e887..6a84c5897 100644
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@@ -25,7 +25,7 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
   Scalar* array1 = internal::aligned_new<Scalar>(size);
   Scalar* array2 = internal::aligned_new<Scalar>(size);
   Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = (std::size_t(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3;
   Scalar  array4[EIGEN_TESTMAP_MAX_SIZE];
 
   Map<VectorType, AlignedMax>(array1, size) = VectorType::Random(size);
@@ -65,7 +65,7 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
   // array3unaligned -> unaligned pointer to heap
   Scalar* array3 = new Scalar[size+1];
   for(int i = 0; i < size+1; i++) array3[i] = Scalar(1);
-  Scalar* array3unaligned = size_t(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
   Scalar array4[256];
   if(size<=256)
     for(int i = 0; i < size; i++) array4[i] = Scalar(1);
@@ -129,7 +129,7 @@ template<typename VectorType> void map_static_methods(const VectorType& m)
   Scalar* array1 = internal::aligned_new<Scalar>(size);
   Scalar* array2 = internal::aligned_new<Scalar>(size);
   Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = size_t(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
 
   VectorType::MapAligned(array1, size) = VectorType::Random(size);
   VectorType::Map(array2, size) = VectorType::Map(array1, size);
diff --git a/test/mapstride.cpp b/test/mapstride.cpp
index ee2414248..4858f8fea 100644
--- a/test/mapstride.cpp
+++ b/test/mapstride.cpp
@@ -23,7 +23,7 @@ template<int Alignment,typename VectorType> void map_class_vector(const VectorTy
   Scalar* a_array = internal::aligned_new<Scalar>(arraysize+1);
   Scalar* array = a_array;
   if(Alignment!=Aligned)
-    array = (Scalar*)(ptrdiff_t(a_array) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+    array = (Scalar*)(internal::IntPtr(a_array) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
 
   {
     Map<VectorType, Alignment, InnerStride<3> > map(array, size);
@@ -63,14 +63,14 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
   Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
   Scalar* array1 = a_array1;
   if(Alignment!=Aligned)
-    array1 = (Scalar*)(std::ptrdiff_t(a_array1) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+    array1 = (Scalar*)(internal::IntPtr(a_array1) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
 
   Scalar a_array2[256];
   Scalar* array2 = a_array2;
   if(Alignment!=Aligned)
-    array2 = (Scalar*)(std::ptrdiff_t(a_array2) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+    array2 = (Scalar*)(internal::IntPtr(a_array2) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
   else
-    array2 = (Scalar*)(((std::size_t(a_array2)+EIGEN_MAX_ALIGN_BYTES-1)/EIGEN_MAX_ALIGN_BYTES)*EIGEN_MAX_ALIGN_BYTES);
+    array2 = (Scalar*)(((internal::UIntPtr(a_array2)+EIGEN_MAX_ALIGN_BYTES-1)/EIGEN_MAX_ALIGN_BYTES)*EIGEN_MAX_ALIGN_BYTES);
   Index maxsize2 = a_array2 - array2 + 256;
   
   // test no inner stride and some dynamic outer stride
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index 0b381ec6c..ad9c2c652 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -23,10 +23,18 @@
 
 #endif
 
+static bool g_called;
+#define EIGEN_SCALAR_BINARY_OP_PLUGIN { g_called |= (!internal::is_same<LhsScalar,RhsScalar>::value); }
+
 #include "main.h"
 
 using namespace std;
 
+#define VERIFY_MIX_SCALAR(XPR,REF) \
+  g_called = false; \
+  VERIFY_IS_APPROX(XPR,REF); \
+  VERIFY( g_called && #XPR" not properly optimized");
+
 template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
 {
   typedef std::complex<float>   CF;
@@ -42,6 +50,7 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
 
   Mat_f mf    = Mat_f::Random(size,size);
   Mat_d md    = mf.template cast<double>();
+  //Mat_d rd    = md;
   Mat_cf mcf  = Mat_cf::Random(size,size);
   Mat_cd mcd  = mcf.template cast<complex<double> >();
   Mat_cd rcd = mcd;
@@ -54,25 +63,59 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   complex<float>  scf = internal::random<complex<float> >();
   complex<double> scd = internal::random<complex<double> >();
 
-
   mf+mf;
-  VERIFY_RAISES_ASSERT(mf+md);
-#ifndef EIGEN_HAS_STD_RESULT_OF
-  // this one does not even compile with C++11
-  VERIFY_RAISES_ASSERT(mf+mcf);
-#endif
+
+  float  epsf = std::sqrt(std::numeric_limits<float> ::min EIGEN_EMPTY ());
+  double epsd = std::sqrt(std::numeric_limits<double>::min EIGEN_EMPTY ());
+  // Re-draw any random scalar whose magnitude is below sqrt(min) (presumably so the quotient/pow checks below stay well conditioned).
+  while(std::abs(sf )<epsf) sf  = internal::random<float>();
+  while(std::abs(sd )<epsd) sd  = internal::random<double>();
+  while(std::abs(scf)<epsf) scf = internal::random<CF>();
+  while(std::abs(scd)<epsd) scd = internal::random<CD>();
+
+//   VERIFY_RAISES_ASSERT(mf+md); // does not even compile
 
 #ifdef EIGEN_DONT_VECTORIZE
   VERIFY_RAISES_ASSERT(vf=vd);
   VERIFY_RAISES_ASSERT(vf+=vd);
-  VERIFY_RAISES_ASSERT(mcd=md);
 #endif
   
   // check scalar products
-  VERIFY_IS_APPROX(vcf * sf , vcf * complex<float>(sf));
-  VERIFY_IS_APPROX(sd * vcd, complex<double>(sd) * vcd);
-  VERIFY_IS_APPROX(vf * scf , vf.template cast<complex<float> >() * scf);
-  VERIFY_IS_APPROX(scd * vd, scd * vd.template cast<complex<double> >());
+  VERIFY_MIX_SCALAR(vcf * sf , vcf * complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd * vcd , complex<double>(sd) * vcd);
+  VERIFY_MIX_SCALAR(vf * scf , vf.template cast<complex<float> >() * scf);
+  VERIFY_MIX_SCALAR(scd * vd , scd * vd.template cast<complex<double> >());
+
+  VERIFY_MIX_SCALAR(vcf * 2 , vcf * complex<float>(2));
+  VERIFY_MIX_SCALAR(vcf * 2.1 , vcf * complex<float>(2.1));
+  VERIFY_MIX_SCALAR(2 * vcf, vcf * complex<float>(2));
+  VERIFY_MIX_SCALAR(2.1 * vcf , vcf * complex<float>(2.1));
+
+  // check scalar quotients
+  VERIFY_MIX_SCALAR(vcf / sf , vcf / complex<float>(sf));
+  VERIFY_MIX_SCALAR(vf / scf , vf.template cast<complex<float> >() / scf);
+  VERIFY_MIX_SCALAR(vf.array()  / scf, vf.template cast<complex<float> >().array() / scf);
+  VERIFY_MIX_SCALAR(scd / vd.array() , scd / vd.template cast<complex<double> >().array());
+
+  // check scalar increment
+  VERIFY_MIX_SCALAR(vcf.array() + sf , vcf.array() + complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd  + vcd.array(), complex<double>(sd) + vcd.array());
+  VERIFY_MIX_SCALAR(vf.array()  + scf, vf.template cast<complex<float> >().array() + scf);
+  VERIFY_MIX_SCALAR(scd + vd.array() , scd + vd.template cast<complex<double> >().array());
+
+  // check scalar subtractions
+  VERIFY_MIX_SCALAR(vcf.array() - sf , vcf.array() - complex<float>(sf));
+  VERIFY_MIX_SCALAR(sd  - vcd.array(), complex<double>(sd) - vcd.array());
+  VERIFY_MIX_SCALAR(vf.array()  - scf, vf.template cast<complex<float> >().array() - scf);
+  VERIFY_MIX_SCALAR(scd - vd.array() , scd - vd.template cast<complex<double> >().array());
+
+  // check scalar powers
+  VERIFY_MIX_SCALAR( pow(vcf.array(), sf),        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  VERIFY_MIX_SCALAR( vcf.array().pow(sf) ,        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  VERIFY_MIX_SCALAR( pow(sd, vcd.array()),        Eigen::pow(complex<double>(sd), vcd.array()) );
+  VERIFY_MIX_SCALAR( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast<complex<float> >().array(), scf) );
+  VERIFY_MIX_SCALAR( vf.array().pow(scf) ,        Eigen::pow(vf.template cast<complex<float> >().array(), scf) );
+  VERIFY_MIX_SCALAR( Eigen::pow(scd, vd.array()), Eigen::pow(scd, vd.template cast<complex<double> >().array()) );
 
   // check dot product
   vf.dot(vf);
@@ -184,6 +227,63 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
                    Mat_cd((scd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
   VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * md * mcd),
                    Mat_cd((scd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
+
+
+  VERIFY_IS_APPROX( md.array()  * mcd.array(), md.template cast<CD>().eval().array() * mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() * md.array(),  mcd.array() * md.template cast<CD>().eval().array() );
+
+  VERIFY_IS_APPROX( md.array()  + mcd.array(), md.template cast<CD>().eval().array() + mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() + md.array(),  mcd.array() + md.template cast<CD>().eval().array() );
+
+  VERIFY_IS_APPROX( md.array()  - mcd.array(), md.template cast<CD>().eval().array() - mcd.array() );
+  VERIFY_IS_APPROX( mcd.array() - md.array(),  mcd.array() - md.template cast<CD>().eval().array() );
+
+  if(mcd.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( md.array() / mcd.array(), md.template cast<CD>().eval().array() / mcd.array() );
+  }
+  if(md.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( mcd.array() / md.array(), mcd.array() / md.template cast<CD>().eval().array() );
+  }
+
+  if(md.array().abs().minCoeff()>epsd || mcd.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( md.array().pow(mcd.array()), md.template cast<CD>().eval().array().pow(mcd.array()) );
+    VERIFY_IS_APPROX( mcd.array().pow(md.array()),  mcd.array().pow(md.template cast<CD>().eval().array()) );
+
+    VERIFY_IS_APPROX( pow(md.array(),mcd.array()), md.template cast<CD>().eval().array().pow(mcd.array()) );
+    VERIFY_IS_APPROX( pow(mcd.array(),md.array()),  mcd.array().pow(md.template cast<CD>().eval().array()) );
+  }
+
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd = md, md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd += md, mcd + md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd -= md, mcd - md.template cast<CD>().eval() );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.array() *= md.array(), mcd.array() * md.template cast<CD>().eval().array() );
+  rcd = mcd;
+  if(md.array().abs().minCoeff()>epsd)
+  {
+    VERIFY_IS_APPROX( rcd.array() /= md.array(), mcd.array() / md.template cast<CD>().eval().array() );
+  }
+
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += md + mcd*md, mcd + (md.template cast<CD>().eval()) + mcd*(md.template cast<CD>().eval()));
+
+  VERIFY_IS_APPROX( rcd.noalias()  = md*md,       ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += md*md, mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() -= md*md, mcd - ((md*md).eval().template cast<CD>()) );
+
+  VERIFY_IS_APPROX( rcd.noalias()  = mcd + md*md,       mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() += mcd + md*md, mcd + mcd + ((md*md).eval().template cast<CD>()) );
+  rcd = mcd;
+  VERIFY_IS_APPROX( rcd.noalias() -= mcd + md*md,           - ((md*md).eval().template cast<CD>()) );
 }
 
 void test_mixingtypes()
diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp
index 2f5025305..a419b0e44 100644
--- a/test/nesting_ops.cpp
+++ b/test/nesting_ops.cpp
@@ -75,8 +75,8 @@ template <typename MatrixType> void run_nesting_ops_2(const MatrixType& _m)
     }
     else
     {
-      VERIFY( verify_eval_type<1>(2*m1, 2*m1) );
-      VERIFY( verify_eval_type<2>(2*m1, m1) );
+      VERIFY( verify_eval_type<2>(2*m1, 2*m1) );
+      VERIFY( verify_eval_type<3>(2*m1, m1) );
     }
     VERIFY( verify_eval_type<2>(m1+m1, m1+m1) );
     VERIFY( verify_eval_type<3>(m1+m1, m1) );
diff --git a/test/nullary.cpp b/test/nullary.cpp
index cb87695ee..9063c6de8 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -104,13 +104,29 @@ void testVectorType(const VectorType& base)
 template<typename MatrixType>
 void testMatrixType(const MatrixType& m)
 {
+  using std::abs;
   const Index rows = m.rows();
   const Index cols = m.cols();
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+  Scalar s1;
+  do {
+    s1 = internal::random<Scalar>();
+  } while(abs(s1)<RealScalar(1e-5) && (!NumTraits<Scalar>::IsInteger));
 
   MatrixType A;
   A.setIdentity(rows, cols);
   VERIFY(equalsIdentity(A));
   VERIFY(equalsIdentity(MatrixType::Identity(rows, cols)));
+
+
+  A = MatrixType::Constant(rows,cols,s1);
+  Index i = internal::random<Index>(0,rows-1);
+  Index j = internal::random<Index>(0,cols-1);
+  VERIFY_IS_APPROX( MatrixType::Constant(rows,cols,s1)(i,j), s1 );
+  VERIFY_IS_APPROX( MatrixType::Constant(rows,cols,s1).coeff(i,j), s1 );
+  VERIFY_IS_APPROX( A(i,j), s1 );
 }
 
 void test_nullary()
@@ -137,4 +153,47 @@ void test_nullary()
   // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
   VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
 #endif
+
+#ifdef EIGEN_TEST_PART_10
+  // check some internal logic
+  VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY(( !internal::has_binary_operator<internal::scalar_constant_op<double> >::value ));
+  VERIFY((  internal::functor_has_linear_access<internal::scalar_constant_op<double> >::ret ));
+
+  VERIFY(( !internal::has_nullary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY(( !internal::has_unary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY((  internal::has_binary_operator<internal::scalar_identity_op<double> >::value ));
+  VERIFY(( !internal::functor_has_linear_access<internal::scalar_identity_op<double> >::ret ));
+
+  VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<float,float,false> >::value ));
+  VERIFY((  internal::has_unary_operator<internal::linspaced_op<float,float,false> >::value ));
+  VERIFY(( !internal::has_binary_operator<internal::linspaced_op<float,float,false> >::value ));
+  VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<float,float,false> >::ret ));
+
+  // Regression unit test for a weird MSVC bug.
+  // Search "nullary_wrapper_workaround_msvc" in CoreEvaluators.h for the details.
+  // See also traits<Ref>::match.
+  {
+    MatrixXf A = MatrixXf::Random(3,3);
+    Ref<const MatrixXf> R = 2.0*A;
+    VERIFY_IS_APPROX(R, A+A);
+
+    Ref<const MatrixXf> R1 = MatrixXf::Random(3,3)+A;
+
+    VectorXi V = VectorXi::Random(3);
+    Ref<const VectorXi> R2 = VectorXi::LinSpaced(3,1,3)+V;
+    VERIFY_IS_APPROX(R2, V+Vector3i(1,2,3));
+
+    VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY(( !internal::has_binary_operator<internal::scalar_constant_op<float> >::value ));
+    VERIFY((  internal::functor_has_linear_access<internal::scalar_constant_op<float> >::ret ));
+
+    VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<int,int,false> >::value ));
+    VERIFY((  internal::has_unary_operator<internal::linspaced_op<int,int,false> >::value ));
+    VERIFY(( !internal::has_binary_operator<internal::linspaced_op<int,int,false> >::value ));
+    VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<int,int,false> >::ret ));
+  }
+#endif
 }
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index c2346e1cd..20addf1ad 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -9,7 +9,11 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
+#include "unsupported/Eigen/SpecialFunctions"
 
+#if defined __GNUC__ && __GNUC__>=6
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
 // using namespace Eigen;
 
 namespace Eigen {
@@ -368,7 +372,15 @@ template<typename Scalar> void packetmath_real()
     VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
   }
 
-#ifdef EIGEN_HAS_C99_MATH
+  if (PacketTraits::HasTanh) {
+    // NOTE this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
+    h.store(data2, internal::ptanh(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+
+#if EIGEN_HAS_C99_MATH
   {
     data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
     packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h;
@@ -395,11 +407,12 @@ template<typename Scalar> void packetmath_real()
     data2[i] = internal::random<Scalar>(0,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
   }
 
-  if(internal::random<float>(0,1)<0.1)
+  if(internal::random<float>(0,1)<0.1f)
     data1[internal::random<int>(0, PacketSize)] = 0;
   CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
   CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
-#if defined(EIGEN_HAS_C99_MATH) && (__cplusplus > 199711L)
+#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
+  CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p);
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
@@ -432,7 +445,7 @@ template<typename Scalar> void packetmath_real()
     // VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
     VERIFY((numext::isnan)(data2[1]));
 
-    data1[0] = -1.0f;
+    data1[0] = Scalar(-1.0f);
     h.store(data2, internal::plog(h.load(data1)));
     VERIFY((numext::isnan)(data2[0]));
 #if !EIGEN_FAST_MATH
diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index c4ef2d4bd..eb6ad18c9 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -53,14 +53,29 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
    // FIXME that 1.25 used to be 1.2 until we tested gcc 4.1 on 30 June 2010 and got 1.21.
   VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.25));
   VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
+
+  {
+    int s = 5;//internal::random<int>(4,10);
+    int i = 0;//internal::random<int>(0,s-4);
+    int j = 0;//internal::random<int>(0,s-4);
+    Matrix<Scalar,5,5> mat(s,s);
+    mat.setRandom();
+    MatrixType submat = mat.template block<4,4>(i,j);
+    MatrixType mat_inv = mat.template block<4,4>(i,j).inverse();
+    VERIFY_IS_APPROX(mat_inv, submat.inverse());
+    mat.template block<4,4>(i,j) = submat.inverse();
+    VERIFY_IS_APPROX(mat_inv, (mat.template block<4,4>(i,j)));
+  }
 }
 
 void test_prec_inverse_4x4()
 {
   CALL_SUBTEST_1((inverse_permutation_4x4<Matrix4f>()));
   CALL_SUBTEST_1(( inverse_general_4x4<Matrix4f>(200000 * g_repeat) ));
+  CALL_SUBTEST_1(( inverse_general_4x4<Matrix<float,4,4,RowMajor> >(200000 * g_repeat) ));
 
   CALL_SUBTEST_2((inverse_permutation_4x4<Matrix<double,4,4,RowMajor> >()));
+  CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,ColMajor> >(200000 * g_repeat) ));
   CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,RowMajor> >(200000 * g_repeat) ));
 
   CALL_SUBTEST_3((inverse_permutation_4x4<Matrix4cf>()));
diff --git a/test/product.h b/test/product.h
index 27976a4ae..3b6511270 100644
--- a/test/product.h
+++ b/test/product.h
@@ -119,6 +119,14 @@ template<typename MatrixType> void product(const MatrixType& m)
   res.noalias() -= square + m1 * m2.transpose();
   VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
 
+  // test d ?= a-b*c rules
+  res.noalias() = square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
+  res.noalias() += square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, 2*(square - m1 * m2.transpose()));
+  res.noalias() -= square - m1 * m2.transpose();
+  VERIFY_IS_APPROX(res, square - m1 * m2.transpose());
+
 
   tm1 = m1;
   VERIFY_IS_APPROX(tm1.transpose() * v1, m1.transpose() * v1);
@@ -160,6 +168,29 @@ template<typename MatrixType> void product(const MatrixType& m)
     VERIFY_IS_APPROX(res2.block(0,0,1,cols).noalias() = m1.block(0,0,1,cols) * square2,                       (ref2.row(0) = m1.row(0) * square2));
   }
 
+  // vector.block() (see bug 1283)
+  {
+    RowVectorType w1(rows);
+    VERIFY_IS_APPROX(square * v1.block(0,0,rows,1), square * v1);
+    VERIFY_IS_APPROX(w1.noalias() = square * v1.block(0,0,rows,1), square * v1);
+    VERIFY_IS_APPROX(w1.block(0,0,rows,1).noalias() = square * v1.block(0,0,rows,1), square * v1);
+
+    Matrix<Scalar,1,MatrixType::ColsAtCompileTime> w2(cols);
+    VERIFY_IS_APPROX(vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = vc2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+
+    vc2 = square2.block(0,0,1,cols).transpose();
+    VERIFY_IS_APPROX(square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = square2.block(0,0,1,cols) * square2, vc2.transpose() * square2);
+
+    vc2 = square2.block(0,0,cols,1);
+    VERIFY_IS_APPROX(square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.noalias() = square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+    VERIFY_IS_APPROX(w2.block(0,0,1,cols).noalias() = square2.block(0,0,cols,1).transpose() * square2, vc2.transpose() * square2);
+  }
+
   // inner product
   {
     Scalar x = square2.row(c) * square2.col(c2);
@@ -196,4 +227,5 @@ template<typename MatrixType> void product(const MatrixType& m)
     VERIFY_IS_APPROX(square * (s1*(square*square)), s1 * square * square * square);
     VERIFY_IS_APPROX(square * (square*square).conjugate(), square * square.conjugate() * square.conjugate());
   }
+
 }
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index d253fd7ed..e4990ac8c 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -256,6 +256,51 @@ Index compute_block_size()
   return ret;
 }
 
+
+// Regression test for bug 1308: products in which an operand is the constant nullary expression Ones() (outer products and 4x4 products, col- and row-major destinations).
+template<int>
+void bug_1308()
+{
+  int n = 10;
+  MatrixXd r(n,n);
+  VectorXd v = VectorXd::Random(n);
+  r = v * RowVectorXd::Ones(n);
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
+  r = VectorXd::Ones(n) * v.transpose();
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
+
+  Matrix4d ones44 = Matrix4d::Ones();
+  Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+  typedef Matrix<double,4,4,RowMajor> RMatrix4d;
+  RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+//   RowVector4d r4;
+  m44.setOnes();
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
+}
+
 void test_product_extra()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -268,8 +313,10 @@ void test_product_extra()
   }
   CALL_SUBTEST_5( bug_127<0>() );
   CALL_SUBTEST_5( bug_817<0>() );
+  CALL_SUBTEST_5( bug_1308<0>() );
   CALL_SUBTEST_6( unaligned_objects<0>() );
   CALL_SUBTEST_7( compute_block_size<float>() );
   CALL_SUBTEST_7( compute_block_size<double>() );
   CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
+
 }
diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp
index 5a3f3a01a..2bb19a681 100644
--- a/test/product_notemporary.cpp
+++ b/test/product_notemporary.cpp
@@ -56,6 +56,9 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0);
   VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0);
   VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() =  m3 - m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() += m3 - m1 * m2.transpose(), 0);
+  VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 - m1 * m2.transpose(), 0);
 
   VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * m2.adjoint(), 0);
   VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * (m1*s3+m2*s2).adjoint(), 1);
diff --git a/test/product_small.cpp b/test/product_small.cpp
index c35db6f65..fdfdd9f6c 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -12,6 +12,7 @@
 #include <Eigen/LU>
 
 // regression test for bug 447
+template<int>
 void product1x1()
 {
   Matrix<float,1,3> matAstatic;
@@ -177,15 +178,66 @@ void test_lazy_l3()
   CALL_SUBTEST(( test_lazy_all_layout<T,4,-1,-1>(4,cols,depth) ));
 }
 
+template<typename T,int N,int M,int K>
+void test_linear_but_not_vectorizable()
+{
+  // Check tricky cases for which the result of the product is a vector and thus must exhibit the LinearBit flag,
+  // but is not vectorizable along the linear dimension.
+  Index n = N==Dynamic ? internal::random<Index>(1,32) : N;
+  Index m = M==Dynamic ? internal::random<Index>(1,32) : M;
+  Index k = K==Dynamic ? internal::random<Index>(1,32) : K;
+
+  {
+    Matrix<T,N,M+1> A; A.setRandom(n,m+1);
+    Matrix<T,M*2,K> B; B.setRandom(m*2,k);
+    Matrix<T,1,K> C;
+    Matrix<T,1,K> R;
+
+    C.noalias() = A.template topLeftCorner<1,M>() * (B.template topRows<M>()+B.template bottomRows<M>());
+    R.noalias() = A.template topLeftCorner<1,M>() * (B.template topRows<M>()+B.template bottomRows<M>()).eval();
+    VERIFY_IS_APPROX(C,R);
+  }
+
+  {
+    Matrix<T,M+1,N,RowMajor> A; A.setRandom(m+1,n);
+    Matrix<T,K,M*2,RowMajor> B; B.setRandom(k,m*2);
+    Matrix<T,K,1> C;
+    Matrix<T,K,1> R;
+
+    C.noalias() = (B.template leftCols<M>()+B.template rightCols<M>())        * A.template topLeftCorner<M,1>();
+    R.noalias() = (B.template leftCols<M>()+B.template rightCols<M>()).eval() * A.template topLeftCorner<M,1>();
+    VERIFY_IS_APPROX(C,R);
+  }
+}
+// Regression test for bug 1311: a scalar factor (here 1.) applied to a small matrix-vector product, or to its operands in regular and lazy products, must give the same result as A*b.
+template<int Rows>
+void bug_1311()
+{
+  Matrix< double, Rows, 2 > A;  A.setRandom();
+  Vector2d b = Vector2d::Random() ;
+  Matrix<double,Rows,1> res;
+  res.noalias() = 1. * (A * b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = 1.*A * b;
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+}
+
 void test_product_small()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
-    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
+    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
+    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );
-    CALL_SUBTEST_6( product1x1() );
+    CALL_SUBTEST_6( product1x1<0>() );
 
     CALL_SUBTEST_11( test_lazy_l1<float>() );
     CALL_SUBTEST_12( test_lazy_l2<float>() );
@@ -202,6 +254,13 @@ void test_product_small()
     CALL_SUBTEST_41( test_lazy_l1<std::complex<double> >() );
     CALL_SUBTEST_42( test_lazy_l2<std::complex<double> >() );
     CALL_SUBTEST_43( test_lazy_l3<std::complex<double> >() );
+
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
+    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));
+
+    CALL_SUBTEST_6( bug_1311<3>() );
+    CALL_SUBTEST_6( bug_1311<5>() );
   }
 
 #ifdef EIGEN_TEST_PART_6
diff --git a/test/qr.cpp b/test/qr.cpp
index 98738777f..dfcc1e8f9 100644
--- a/test/qr.cpp
+++ b/test/qr.cpp
@@ -86,7 +86,7 @@ template<typename MatrixType> void qr_invertible()
   VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant());
   // This test is tricky if the determinant becomes too small.
   // Since we generate random numbers with magnitude rrange [0,1], the average determinant is 0.5^size
-  VERIFY_IS_MUCH_SMALLER_THAN( abs(absdet-qr.absDeterminant()), (max)(RealScalar(pow(0.5,size)),(max)(abs(absdet),abs(qr.absDeterminant()))) );
+  VERIFY_IS_MUCH_SMALLER_THAN( abs(absdet-qr.absDeterminant()), numext::maxi(RealScalar(pow(0.5,size)),numext::maxi<RealScalar>(abs(absdet),abs(qr.absDeterminant()))) );
   
 }
 
diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
index 46c54b74f..057bb014c 100644
--- a/test/qr_colpivoting.cpp
+++ b/test/qr_colpivoting.cpp
@@ -93,6 +93,7 @@ void cod_fixedsize() {
 
 template<typename MatrixType> void qr()
 {
+  using std::sqrt;
   typedef typename MatrixType::Index Index;
 
   Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
@@ -120,14 +121,14 @@ template<typename MatrixType> void qr()
   // Verify that the absolute value of the diagonal elements in R are
   // non-increasing until they reach the singularity threshold.
   RealScalar threshold =
-      std::sqrt(RealScalar(rows)) * (std::abs)(r(0, 0)) * NumTraits<Scalar>::epsilon();
+      sqrt(RealScalar(rows)) * numext::abs(r(0, 0)) * NumTraits<Scalar>::epsilon();
   for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) {
-    RealScalar x = (std::abs)(r(i, i));
-    RealScalar y = (std::abs)(r(i + 1, i + 1));
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
     if (x < threshold && y < threshold) continue;
     if (!test_isApproxOrLessThan(y, x)) {
       for (Index j = 0; j < (std::min)(rows, cols); ++j) {
-        std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl;
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
       }
       std::cout << "Failure at i=" << i << ", rank=" << rank
                 << ", threshold=" << threshold << std::endl;
@@ -144,6 +145,8 @@ template<typename MatrixType> void qr()
 
 template<typename MatrixType, int Cols2> void qr_fixedsize()
 {
+  using std::sqrt;
+  using std::abs;
   enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
@@ -169,14 +172,14 @@ template<typename MatrixType, int Cols2> void qr_fixedsize()
   // Verify that the absolute value of the diagonal elements in R are
   // non-increasing until they reache the singularity threshold.
   RealScalar threshold =
-      std::sqrt(RealScalar(Rows)) * (std::abs)(r(0, 0)) * NumTraits<Scalar>::epsilon();
+      sqrt(RealScalar(Rows)) * numext::abs(r(0, 0)) * NumTraits<Scalar>::epsilon();
   for (Index i = 0; i < (std::min)(int(Rows), int(Cols)) - 1; ++i) {
-    RealScalar x = (std::abs)(r(i, i));
-    RealScalar y = (std::abs)(r(i + 1, i + 1));
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
     if (x < threshold && y < threshold) continue;
     if (!test_isApproxOrLessThan(y, x)) {
       for (Index j = 0; j < (std::min)(int(Rows), int(Cols)); ++j) {
-        std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl;
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
       }
       std::cout << "Failure at i=" << i << ", rank=" << rank
                 << ", threshold=" << threshold << std::endl;
@@ -194,6 +197,8 @@ template<typename MatrixType, int Cols2> void qr_fixedsize()
 // page 3 for more detail.
 template<typename MatrixType> void qr_kahan_matrix()
 {
+  using std::sqrt;
+  using std::abs;
   typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
@@ -204,23 +209,25 @@ template<typename MatrixType> void qr_kahan_matrix()
   m1.setZero(rows,cols);
   RealScalar s = std::pow(NumTraits<RealScalar>::epsilon(), 1.0 / rows);
   RealScalar c = std::sqrt(1 - s*s);
+  RealScalar pow_s_i(1.0); // pow(s,i)
   for (Index i = 0; i < rows; ++i) {
-    m1(i, i) = pow(s, i);
-    m1.row(i).tail(rows - i - 1) = -pow(s, i) * c * MatrixType::Ones(1, rows - i - 1);
+    m1(i, i) = pow_s_i;
+    m1.row(i).tail(rows - i - 1) = -pow_s_i * c * MatrixType::Ones(1, rows - i - 1);
+    pow_s_i *= s;
   }
   m1 = (m1 + m1.transpose()).eval();
   ColPivHouseholderQR<MatrixType> qr(m1);
   MatrixType r = qr.matrixQR().template triangularView<Upper>();
 
   RealScalar threshold =
-      std::sqrt(RealScalar(rows)) * (std::abs)(r(0, 0)) * NumTraits<Scalar>::epsilon();
+      std::sqrt(RealScalar(rows)) * numext::abs(r(0, 0)) * NumTraits<Scalar>::epsilon();
   for (Index i = 0; i < (std::min)(rows, cols) - 1; ++i) {
-    RealScalar x = (std::abs)(r(i, i));
-    RealScalar y = (std::abs)(r(i + 1, i + 1));
+    RealScalar x = numext::abs(r(i, i));
+    RealScalar y = numext::abs(r(i + 1, i + 1));
     if (x < threshold && y < threshold) continue;
     if (!test_isApproxOrLessThan(y, x)) {
       for (Index j = 0; j < (std::min)(rows, cols); ++j) {
-        std::cout << "i = " << j << ", |r_ii| = " << (std::abs)(r(j, j)) << std::endl;
+        std::cout << "i = " << j << ", |r_ii| = " << numext::abs(r(j, j)) << std::endl;
       }
       std::cout << "Failure at i=" << i << ", rank=" << qr.rank()
                 << ", threshold=" << threshold << std::endl;
diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp
index d82e123d0..05a705887 100644
--- a/test/qr_fullpivoting.cpp
+++ b/test/qr_fullpivoting.cpp
@@ -15,8 +15,12 @@ template<typename MatrixType> void qr()
 {
   typedef typename MatrixType::Index Index;
 
-  Index rows = internal::random<Index>(20,200), cols = internal::random<int>(20,200), cols2 = internal::random<int>(20,200);
-  Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
+  Index max_size = EIGEN_TEST_MAX_SIZE;
+  Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
+  Index rows  = internal::random<Index>(min_size,max_size),
+        cols  = internal::random<Index>(min_size,max_size),
+        cols2 = internal::random<Index>(min_size,max_size),
+        rank  = internal::random<Index>(1, (std::min)(rows, cols)-1);
 
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> MatrixQType;
@@ -59,7 +63,9 @@ template<typename MatrixType> void qr_invertible()
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
   typedef typename MatrixType::Scalar Scalar;
 
-  int size = internal::random<int>(10,50);
+  Index max_size = numext::mini(50,EIGEN_TEST_MAX_SIZE);
+  Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
+  Index size = internal::random<Index>(min_size,max_size);
 
   MatrixType m1(size, size), m2(size, size), m3(size, size);
   m1 = MatrixType::Random(size,size);
diff --git a/test/rand.cpp b/test/rand.cpp
index eeec34191..51cf01773 100644
--- a/test/rand.cpp
+++ b/test/rand.cpp
@@ -9,6 +9,8 @@
 
 #include "main.h"
 
+typedef long long int64;
+
 template<typename Scalar> Scalar check_in_range(Scalar x, Scalar y)
 {
   Scalar r = internal::random<Scalar>(x,y);
@@ -35,31 +37,49 @@ template<typename Scalar> void check_all_in_range(Scalar x, Scalar y)
   VERIFY( (mask>0).all() );
 }
 
+template<typename Scalar> void check_histogram(Scalar x, Scalar y, int bins) // checks that draws from [x,y] fill `bins` equal-width bins evenly
+{
+  Array<int,1,Dynamic> hist(bins); // one counter per bin
+  hist.fill(0);
+  int f = 100000; // expected number of samples per bin
+  int n = bins*f; // total number of draws
+  int64 range = int64(y)-int64(x); // computed in 64 bits; presumably so that wide int ranges do not overflow
+  int divisor = int((range+1)/bins); // number of distinct values mapped to each bin
+  assert(((range+1)%bins)==0); // the range+1 values of [x,y] must split evenly into the bins
+  for(int k=0; k<n; ++k)
+  {
+    Scalar r = check_in_range(x,y); // draw one sample through check_in_range
+    hist( int((int64(r)-int64(x))/divisor) )++; // bump the bin that r falls into
+  }
+  VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.02).all() ); // every bin count must be within 2% of f
+}
+
 void test_rand()
 {
   long long_ref = NumTraits<long>::highest()/10;
   signed char char_offset = (std::min)(g_repeat,64);
   signed char short_offset = (std::min)(g_repeat,16000);
-  
-  for(int i = 0; i < g_repeat*10; i++) {
+
+  for(int i = 0; i < g_repeat*10000; i++) {
     CALL_SUBTEST(check_in_range<float>(10,11));
     CALL_SUBTEST(check_in_range<float>(1.24234523,1.24234523));
     CALL_SUBTEST(check_in_range<float>(-1,1));
     CALL_SUBTEST(check_in_range<float>(-1432.2352,-1432.2352));
-    
+
     CALL_SUBTEST(check_in_range<double>(10,11));
     CALL_SUBTEST(check_in_range<double>(1.24234523,1.24234523));
     CALL_SUBTEST(check_in_range<double>(-1,1));
     CALL_SUBTEST(check_in_range<double>(-1432.2352,-1432.2352));
-    
+
     CALL_SUBTEST(check_in_range<int>(0,-1));
     CALL_SUBTEST(check_in_range<short>(0,-1));
     CALL_SUBTEST(check_in_range<long>(0,-1));
     CALL_SUBTEST(check_in_range<int>(-673456,673456));
+    CALL_SUBTEST(check_in_range<int>(-RAND_MAX+10,RAND_MAX-10));
     CALL_SUBTEST(check_in_range<short>(-24345,24345));
     CALL_SUBTEST(check_in_range<long>(-long_ref,long_ref));
   }
-  
+
   CALL_SUBTEST(check_all_in_range<signed char>(11,11));
   CALL_SUBTEST(check_all_in_range<signed char>(11,11+char_offset));
   CALL_SUBTEST(check_all_in_range<signed char>(-5,5));
@@ -67,25 +87,32 @@ void test_rand()
   CALL_SUBTEST(check_all_in_range<signed char>(-126,-126+char_offset));
   CALL_SUBTEST(check_all_in_range<signed char>(126-char_offset,126));
   CALL_SUBTEST(check_all_in_range<signed char>(-126,126));
-  
+
   CALL_SUBTEST(check_all_in_range<short>(11,11));
   CALL_SUBTEST(check_all_in_range<short>(11,11+short_offset));
   CALL_SUBTEST(check_all_in_range<short>(-5,5));
   CALL_SUBTEST(check_all_in_range<short>(-11-short_offset,-11));
   CALL_SUBTEST(check_all_in_range<short>(-24345,-24345+short_offset));
   CALL_SUBTEST(check_all_in_range<short>(24345,24345+short_offset));
-  
+
   CALL_SUBTEST(check_all_in_range<int>(11,11));
   CALL_SUBTEST(check_all_in_range<int>(11,11+g_repeat));
   CALL_SUBTEST(check_all_in_range<int>(-5,5));
   CALL_SUBTEST(check_all_in_range<int>(-11-g_repeat,-11));
   CALL_SUBTEST(check_all_in_range<int>(-673456,-673456+g_repeat));
   CALL_SUBTEST(check_all_in_range<int>(673456,673456+g_repeat));
-  
+
   CALL_SUBTEST(check_all_in_range<long>(11,11));
   CALL_SUBTEST(check_all_in_range<long>(11,11+g_repeat));
   CALL_SUBTEST(check_all_in_range<long>(-5,5));
   CALL_SUBTEST(check_all_in_range<long>(-11-g_repeat,-11));
   CALL_SUBTEST(check_all_in_range<long>(-long_ref,-long_ref+g_repeat));
   CALL_SUBTEST(check_all_in_range<long>( long_ref, long_ref+g_repeat));
+
+  CALL_SUBTEST(check_histogram<int>(-5,5,11));
+  int bins = 100;
+  CALL_SUBTEST(check_histogram<int>(-3333,-3333+bins*(3333/bins)-1,bins));
+  bins = 1000;
+  CALL_SUBTEST(check_histogram<int>(-RAND_MAX+10,-RAND_MAX+10+bins*(RAND_MAX/bins)-1,bins));
+  CALL_SUBTEST(check_histogram<int>(-RAND_MAX+10,-int64(RAND_MAX)+10+bins*(2*int64(RAND_MAX)/bins)-1,bins));
 }
diff --git a/test/real_qz.cpp b/test/real_qz.cpp
index a1766c6d9..99ac31235 100644
--- a/test/real_qz.cpp
+++ b/test/real_qz.cpp
@@ -7,6 +7,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_RUNTIME_NO_MALLOC
 #include "main.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
@@ -41,7 +42,11 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
     break;
   }
 
-  RealQZ<MatrixType> qz(A,B);
+  RealQZ<MatrixType> qz(dim);
+  // TODO enable full preallocation of required memory; this probably requires an in-place mode for HessenbergDecomposition
+  //Eigen::internal::set_is_malloc_allowed(false);
+  qz.compute(A,B);
+  //Eigen::internal::set_is_malloc_allowed(true);
   
   VERIFY_IS_EQUAL(qz.info(), Success);
   // check for zeros
@@ -49,11 +54,20 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
   for (Index i=0; i<A.cols(); i++)
     for (Index j=0; j<i; j++) {
       if (abs(qz.matrixT()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: T(" << i << "," << j << ") = " << qz.matrixT()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j<i-1 && abs(qz.matrixS()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j==i-1 && j>0 && abs(qz.matrixS()(i,j))!=Scalar(0.0) && abs(qz.matrixS()(i-1,j-1))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j)  << " && S(" << i-1 << "," << j-1 << ") = " << qz.matrixS()(i-1,j-1) << std::endl;
         all_zeros = false;
+      }
     }
   VERIFY_IS_EQUAL(all_zeros, true);
   VERIFY_IS_APPROX(qz.matrixQ()*qz.matrixS()*qz.matrixZ(), A);
diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp
index 3eebfc61b..8887f1b1b 100644
--- a/test/rvalue_types.cpp
+++ b/test/rvalue_types.cpp
@@ -11,7 +11,9 @@
 
 #include <Eigen/Core>
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+using internal::UIntPtr;
+
+#if EIGEN_HAS_RVALUE_REFERENCES
 template <typename MatrixType>
 void rvalue_copyassign(const MatrixType& m)
 {
@@ -20,11 +22,11 @@ void rvalue_copyassign(const MatrixType& m)
   
   // create a temporary which we are about to destroy by moving
   MatrixType tmp = m;
-  long src_address = reinterpret_cast<long>(tmp.data());
+  UIntPtr src_address = reinterpret_cast<UIntPtr>(tmp.data());
   
   // move the temporary to n
   MatrixType n = std::move(tmp);
-  long dst_address = reinterpret_cast<long>(n.data());
+  UIntPtr dst_address = reinterpret_cast<UIntPtr>(n.data());
 
   if (MatrixType::RowsAtCompileTime==Dynamic|| MatrixType::ColsAtCompileTime==Dynamic)
   {
diff --git a/test/schur_real.cpp b/test/schur_real.cpp
index cfe4570d4..4aede87df 100644
--- a/test/schur_real.cpp
+++ b/test/schur_real.cpp
@@ -82,7 +82,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
   Atriangular.template triangularView<StrictlyLower>().setZero(); 
   rs3.setMaxIterations(1).compute(Atriangular); // triangular matrices do not need any iterations
   VERIFY_IS_EQUAL(rs3.info(), Success);
-  VERIFY_IS_EQUAL(rs3.matrixT(), Atriangular);
+  VERIFY_IS_APPROX(rs3.matrixT(), Atriangular); // approx because of scaling...
   VERIFY_IS_EQUAL(rs3.matrixU(), MatrixType::Identity(size, size));
 
   // Test computation of only T, not U
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index cb8ebaedf..7b5f3eb38 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -157,18 +157,15 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     initSparse<Scalar>(density, refM3, m3);
     initSparse<Scalar>(density, refM4, m4);
 
+    if(internal::random<bool>())
+      m1.makeCompressed();
+
     VERIFY_IS_APPROX(m1*s1, refM1*s1);
     VERIFY_IS_APPROX(m1+m2, refM1+refM2);
     VERIFY_IS_APPROX(m1+m2+m3, refM1+refM2+refM3);
     VERIFY_IS_APPROX(m3.cwiseProduct(m1+m2), refM3.cwiseProduct(refM1+refM2));
     VERIFY_IS_APPROX(m1*s1-m2, refM1*s1-refM2);
 
-    VERIFY_IS_APPROX(m1*=s1, refM1*=s1);
-    VERIFY_IS_APPROX(m1/=s1, refM1/=s1);
-
-    VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
-    VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
-
     if(SparseMatrixType::IsRowMajor)
       VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.row(0)), refM1.row(0).dot(refM2.row(0)));
     else
@@ -197,11 +194,29 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3);
     VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4);
 
+    VERIFY_IS_APPROX(m1.sum(), refM1.sum());
+
+    VERIFY_IS_APPROX(m1*=s1, refM1*=s1);
+    VERIFY_IS_APPROX(m1/=s1, refM1/=s1);
+
+    VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
+    VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
+
     // test aliasing
     VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));
     VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval()));
     VERIFY_IS_APPROX((m1 = -m1.transpose()), (refM1 = -refM1.transpose().eval()));
     VERIFY_IS_APPROX((m1 += -m1), (refM1 += -refM1));
+
+    if(m1.isCompressed())
+    {
+      VERIFY_IS_APPROX(m1.coeffs().sum(), m1.sum());
+      m1.coeffs() += s1;
+      for(Index j = 0; j<m1.outerSize(); ++j)
+        for(typename SparseMatrixType::InnerIterator it(m1,j); it; ++it)
+          refM1(it.row(), it.col()) += s1;
+      VERIFY_IS_APPROX(m1, refM1);
+    }
   }
 
   // test transpose
@@ -232,11 +247,11 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       for (Index i=0; i<m2.rows(); ++i)
       {
         float x = internal::random<float>(0,1);
-        if (x<0.1)
+        if (x<0.1f)
         {
           // do nothing
         }
-        else if (x<0.5)
+        else if (x<0.5f)
         {
           countFalseNonZero++;
           m2.insert(i,j) = Scalar(0);
@@ -312,6 +327,17 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
       VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
     }
+
+    Index i = internal::random<Index>(0,rows-1);
+    Index j = internal::random<Index>(0,cols-1);
+    m2.coeffRef(i,j) = 123;
+    if(internal::random<bool>())
+      m2.makeCompressed();
+    Map<SparseMatrixType> mapMat2(rows, cols, m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(),  m2.innerNonZeroPtr());
+    VERIFY_IS_EQUAL(m2.coeff(i,j),Scalar(123));
+    VERIFY_IS_EQUAL(mapMat2.coeff(i,j),Scalar(123));
+    mapMat2.coeffRef(i,j) = -123;
+    VERIFY_IS_EQUAL(m2.coeff(i,j),Scalar(-123));
   }
 
   // test triangularView
@@ -372,6 +398,12 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     SparseMatrixType m2(rows, rows);
     initSparse<Scalar>(density, refMat2, m2);
     VERIFY_IS_APPROX(m2.eval(), refMat2.sparseView().eval());
+
+    // sparse view on expressions:
+    VERIFY_IS_APPROX((s1*m2).eval(), (s1*refMat2).sparseView().eval());
+    VERIFY_IS_APPROX((m2+m2).eval(), (refMat2+refMat2).sparseView().eval());
+    VERIFY_IS_APPROX((m2*m2).eval(), (refMat2.lazyProduct(refMat2)).sparseView().eval());
+    VERIFY_IS_APPROX((m2*m2).eval(), (refMat2*refMat2).sparseView().eval());
   }
 
   // test diagonal
@@ -546,7 +578,7 @@ void test_sparse_basic()
   CALL_SUBTEST_4((big_sparse_triplet<SparseMatrix<double, ColMajor, long int> >(10000, 10000, 0.125)));
 
   // Regression test for bug 1105
-#ifdef EIGEN_TEST_PART_6
+#ifdef EIGEN_TEST_PART_7
   {
     int n = Eigen::internal::random<int>(200,600);
     SparseMatrix<std::complex<double>,0, long> mat(n, n);
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
index 8a6e0687c..49a5f135e 100644
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -17,6 +17,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
   const Index outer = ref.outerSize();
 
   typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
 
   double density = (std::max)(8./(rows*cols), 0.01);
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
@@ -123,7 +124,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
     m3.reserve(VectorXi::Constant(outer,int(inner/2)));
     for(Index j=0; j<outer; ++j)
       for(Index k=0; k<(std::min)(j,inner); ++k)
-        m3.insertByOuterInner(j,k) = k+1;
+        m3.insertByOuterInner(j,k) = internal::convert_index<StorageIndex>(k+1);
     for(Index j=0; j<(std::min)(outer, inner); ++j)
     {
       VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
@@ -150,7 +151,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
     DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
     SparseMatrixType m2(rows, cols);
     initSparse<Scalar>(density, refMat2, m2);
-    if(internal::random<float>(0,1)>0.5) m2.makeCompressed();
+    if(internal::random<float>(0,1)>0.5f) m2.makeCompressed();
     Index j0 = internal::random<Index>(0,outer-2);
     Index j1 = internal::random<Index>(0,outer-2);
     Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index 7ec5270e8..c7c93373d 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -7,8 +7,26 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+static long int nb_temporaries; // incremented by on_temporary_creation(); reset and checked by VERIFY_EVALUATION_COUNT
+
+inline void on_temporary_creation() { // invoked through EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN (defined below)
+  // here's a great place to set a breakpoint when debugging failures in this test!
+  nb_temporaries++;
+}
+
+#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); }
+
 #include "sparse.h"
 
+#define VERIFY_EVALUATION_COUNT(XPR,N) {\
+    nb_temporaries = 0; \
+    CALL_SUBTEST( XPR ); \
+    if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
+    VERIFY( (#XPR) && nb_temporaries==N ); \
+  }
+
+
+
 template<typename SparseMatrixType> void sparse_product()
 {
   typedef typename SparseMatrixType::StorageIndex StorageIndex;
@@ -76,6 +94,24 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(m4=(m2t.transpose()*m3t.transpose()).pruned(0), refMat4=refMat2t.transpose()*refMat3t.transpose());
     VERIFY_IS_APPROX(m4=(m2*m3t.transpose()).pruned(0), refMat4=refMat2*refMat3t.transpose());
 
+    // make sure the right product implementation is called:
+    if((!SparseMatrixType::IsRowMajor) && m2.rows()<=m3.cols())
+    {
+      VERIFY_EVALUATION_COUNT(m4 = m2*m3, 3); // 1 temp for the result + 2 for transposing and get a sorted result.
+      VERIFY_EVALUATION_COUNT(m4 = (m2*m3).pruned(0), 1);
+      VERIFY_EVALUATION_COUNT(m4 = (m2*m3).eval().pruned(0), 4);
+    }
+
+    // and that pruning is effective:
+    {
+      DenseMatrix Ad(2,2);
+      Ad << -1, 1, 1, 1;
+      SparseMatrixType As(Ad.sparseView()), B(2,2);
+      VERIFY_IS_EQUAL( (As*As.transpose()).eval().nonZeros(), 4);
+      VERIFY_IS_EQUAL( (Ad*Ad.transpose()).eval().sparseView().eval().nonZeros(), 2);
+      VERIFY_IS_EQUAL( (As*As.transpose()).pruned(1e-6).eval().nonZeros(), 2);
+    }
+
     // dense ?= sparse * sparse
     VERIFY_IS_APPROX(dm4 =m2*m3, refMat4 =refMat2*refMat3);
     VERIFY_IS_APPROX(dm4+=m2*m3, refMat4+=refMat2*refMat3);
@@ -245,7 +281,7 @@ template<typename SparseMatrixType> void sparse_product()
     for (int k=0; k<mS.outerSize(); ++k)
       for (typename SparseMatrixType::InnerIterator it(mS,k); it; ++it)
         if (it.index() == k)
-          it.valueRef() *= 0.5;
+          it.valueRef() *= Scalar(0.5);
 
     VERIFY_IS_APPROX(refS.adjoint(), refS);
     VERIFY_IS_APPROX(mS.adjoint(), mS);
@@ -256,6 +292,10 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(x=mUp.template selfadjointView<Upper>()*b, refX=refS*b);
     VERIFY_IS_APPROX(x=mLo.template selfadjointView<Lower>()*b, refX=refS*b);
     VERIFY_IS_APPROX(x=mS.template selfadjointView<Upper|Lower>()*b, refX=refS*b);
+
+    VERIFY_IS_APPROX(x.noalias()+=mUp.template selfadjointView<Upper>()*b, refX+=refS*b);
+    VERIFY_IS_APPROX(x.noalias()-=mLo.template selfadjointView<Lower>()*b, refX-=refS*b);
+    VERIFY_IS_APPROX(x.noalias()+=mS.template selfadjointView<Upper|Lower>()*b, refX+=refS*b);
     
     // sparse selfadjointView with sparse matrices
     SparseMatrixType mSres(rows,rows);
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
index f4aefbb48..5e9607234 100644
--- a/test/sparse_ref.cpp
+++ b/test/sparse_ref.cpp
@@ -87,8 +87,8 @@ void call_ref()
   VERIFY_EVALUATION_COUNT( call_ref_3(B, B),  1);
   VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()),  0);
   VERIFY_EVALUATION_COUNT( call_ref_3(B.transpose(), B.transpose()),  0);
-  VERIFY_EVALUATION_COUNT( call_ref_2(A*A, AA),  1);
-  VERIFY_EVALUATION_COUNT( call_ref_3(A*A, AA),  1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A*A, AA),  3);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A*A, AA),  3);
   
   VERIFY(!C.isCompressed());
   VERIFY_EVALUATION_COUNT( call_ref_3(C, C),  1);
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index b67653496..fd6199f3e 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -11,6 +11,33 @@
 #include <Eigen/SparseCore>
 #include <sstream>
 
+template<typename Solver, typename Rhs, typename Guess,typename Result> // overload for iterative solvers: solves for b starting from guess g and stores the result in x
+void solve_with_guess(IterativeSolverBase<Solver>& solver, const MatrixBase<Rhs>& b, const Guess& g, Result &x) {
+  if(internal::random<bool>()) // randomly pick one of the two evaluation paths
+  {
+    // With a temporary through evaluator<SolveWithGuess>
+    x = solver.derived().solveWithGuess(b,g) + Result::Zero(x.rows(), x.cols());
+  }
+  else
+  {
+    // direct evaluation within x through Assignment<Result,SolveWithGuess>
+    x = solver.derived().solveWithGuess(b.derived(),g);
+  }
+}
+
+template<typename Solver, typename Rhs, typename Guess,typename Result> // overload for SparseSolverBase solvers with a dense rhs
+void solve_with_guess(SparseSolverBase<Solver>& solver, const MatrixBase<Rhs>& b, const Guess& , Result& x) { // the guess argument is unnamed and unused
+  if(internal::random<bool>()) // randomly pick between the two forms below
+    x = solver.derived().solve(b) + Result::Zero(x.rows(), x.cols()); // solve() result used inside a larger expression
+  else
+    x = solver.derived().solve(b); // solve() result assigned directly to x
+}
+
+template<typename Solver, typename Rhs, typename Guess,typename Result> // overload for SparseSolverBase solvers with a sparse rhs
+void solve_with_guess(SparseSolverBase<Solver>& solver, const SparseMatrixBase<Rhs>& b, const Guess& , Result& x) { // the guess argument is unnamed and unused
+  x = solver.derived().solve(b);
+}
+
 template<typename Solver, typename Rhs, typename DenseMat, typename DenseRhs>
 void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
 {
@@ -37,6 +64,12 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
     }
     VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
     VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+
+    x.setZero();
+    solve_with_guess(solver, b, x, x);
+    VERIFY(solver.info() == Success && "solving failed when using solve_with_guess"); // message names the step actually being checked
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
     
     x.setZero();
     // test the analyze/factorize API
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index d95f301d5..b3e1dda25 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -12,7 +12,7 @@
 template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int cols)
 {
   double densityMat = (std::max)(8./(rows*cols), 0.01);
-  double densityVec = (std::max)(8./float(rows), 0.1);
+  double densityVec = (std::max)(8./(rows), 0.1);
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
   typedef SparseVector<Scalar,0,StorageIndex> SparseVectorType;
diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp
index 50d1fcdf2..e8605fd21 100644
--- a/test/sparseqr.cpp
+++ b/test/sparseqr.cpp
@@ -54,7 +54,7 @@ template<typename Scalar> void test_sparseqr_scalar()
   
   b = dA * DenseVector::Random(A.cols());
   solver.compute(A);
-  if(internal::random<float>(0,1)>0.5)
+  if(internal::random<float>(0,1)>0.5f)
     solver.factorize(A);  // this checks that calling analyzePattern is not needed if the pattern do not change.
   if (solver.info() != Success)
   {
diff --git a/test/stdvector.cpp b/test/stdvector.cpp
index 6e173c678..50cb3341d 100644
--- a/test/stdvector.cpp
+++ b/test/stdvector.cpp
@@ -34,7 +34,7 @@ void check_stdvector_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(MatrixType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -69,7 +69,7 @@ void check_stdvector_transform(const TransformType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(TransformType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(TransformType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -104,7 +104,7 @@ void check_stdvector_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(QuaternionType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(QuaternionType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
diff --git a/test/stdvector_overload.cpp b/test/stdvector_overload.cpp
index 736ff0ee7..959665954 100644
--- a/test/stdvector_overload.cpp
+++ b/test/stdvector_overload.cpp
@@ -48,7 +48,7 @@ void check_stdvector_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(MatrixType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(MatrixType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -83,7 +83,7 @@ void check_stdvector_transform(const TransformType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(TransformType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(TransformType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
@@ -118,7 +118,7 @@ void check_stdvector_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v[21], y);
   v.push_back(x);
   VERIFY_IS_APPROX(v[22], x);
-  VERIFY((size_t)&(v[22]) == (size_t)&(v[21]) + sizeof(QuaternionType));
+  VERIFY((internal::UIntPtr)&(v[22]) == (internal::UIntPtr)&(v[21]) + sizeof(QuaternionType));
 
   // do a lot of push_back such that the vector gets internally resized
   // (with memory reallocation)
diff --git a/test/svd_common.h b/test/svd_common.h
index d8611b541..605d5dfef 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -42,9 +42,14 @@ void svd_check_full(const MatrixType& m, const SvdType& svd)
   MatrixUType u = svd.matrixU();
   MatrixVType v = svd.matrixV();
   RealScalar scaling = m.cwiseAbs().maxCoeff();
-  if(scaling<=(std::numeric_limits<RealScalar>::min)())
-    scaling = RealScalar(1);
-  VERIFY_IS_APPROX(m/scaling, u * (sigma/scaling) * v.adjoint());
+  if(scaling<(std::numeric_limits<RealScalar>::min)())
+  {
+    VERIFY(sigma.cwiseAbs().maxCoeff() <= (std::numeric_limits<RealScalar>::min)());
+  }
+  else
+  {
+    VERIFY_IS_APPROX(m/scaling, u * (sigma/scaling) * v.adjoint());
+  }
   VERIFY_IS_UNITARY(u);
   VERIFY_IS_UNITARY(v);
 }
@@ -141,14 +146,14 @@ void svd_least_square(const MatrixType& m, unsigned int computationOptions)
       using std::abs;
       
       SolutionType y(x);
-      y.row(k) = (1.+2*NumTraits<RealScalar>::epsilon())*x.row(k);
+      y.row(k) = (RealScalar(1)+2*NumTraits<RealScalar>::epsilon())*x.row(k);
       RealScalar residual_y = (m*y-rhs).norm();
       VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
       if(internal::is_same<RealScalar,float>::value) ++g_test_level;
       VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
       if(internal::is_same<RealScalar,float>::value) --g_test_level;
       
-      y.row(k) = (1.-2*NumTraits<RealScalar>::epsilon())*x.row(k);
+      y.row(k) = (RealScalar(1)-2*NumTraits<RealScalar>::epsilon())*x.row(k);
       residual_y = (m*y-rhs).norm();
       VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
       if(internal::is_same<RealScalar,float>::value) ++g_test_level;
@@ -336,7 +341,7 @@ void svd_underoverflow()
     M << value_set(id(0)), value_set(id(1)), value_set(id(2)), value_set(id(3));
     svd.compute(M,ComputeFullU|ComputeFullV);
     CALL_SUBTEST( svd_check_full(M,svd) );
-    
+
     id(k)++;
     if(id(k)>=value_set.size())
     {
@@ -344,7 +349,7 @@ void svd_underoverflow()
       id.head(k).setZero();
       k=0;
     }
-    
+
   } while((id<int(value_set.size())).all());
   
 #if defined __INTEL_COMPILER
diff --git a/test/svd_fill.h b/test/svd_fill.h
index 1bbe645ee..3877c0c7e 100644
--- a/test/svd_fill.h
+++ b/test/svd_fill.h
@@ -7,9 +7,20 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+template<typename T>
+Array<T,4,1> four_denorms(); // returns four denormal values (two magnitudes, each with both signs) as scalar type T
+
+template<> // full specializations are not implicitly inline, so mark them inline to keep this header ODR-safe
+inline Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
+template<>
+inline Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+template<typename T>
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); } // any other type: cast the double values to T
+
 template<typename MatrixType>
 void svd_fill_random(MatrixType &m, int Option = 0)
 {
+  using std::pow;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef typename MatrixType::Index Index;
@@ -18,7 +29,7 @@ void svd_fill_random(MatrixType &m, int Option = 0)
   s = internal::random<RealScalar>(1,s);
   Matrix<RealScalar,Dynamic,1> d =  Matrix<RealScalar,Dynamic,1>::Random(diagSize);
   for(Index k=0; k<diagSize; ++k)
-    d(k) = d(k)*std::pow(RealScalar(10),internal::random<RealScalar>(-s,s));
+    d(k) = d(k)*pow(RealScalar(10),internal::random<RealScalar>(-s,s));
 
   bool dup     = internal::random<int>(0,10) < 3;
   bool unit_uv = internal::random<int>(0,10) < (dup?7:3); // if we duplicate some diagonal entries, then increase the chance to preserve them using unitary U and V factors
@@ -53,8 +64,9 @@ void svd_fill_random(MatrixType &m, int Option = 0)
     VT.setRandom();
   }
   
-  Matrix<Scalar,Dynamic,1> samples(7);
-  samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -1./NumTraits<RealScalar>::highest(), 1./NumTraits<RealScalar>::highest();
+  Matrix<Scalar,Dynamic,1> samples(9);
+  samples << 0, four_denorms<RealScalar>(),
+            -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
   
   if(Option==Symmetric)
   {
diff --git a/test/triangular.cpp b/test/triangular.cpp
index 936c2aef3..b96856486 100644
--- a/test/triangular.cpp
+++ b/test/triangular.cpp
@@ -65,7 +65,7 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
 
   m1 = MatrixType::Random(rows, cols);
   for (int i=0; i<rows; ++i)
-    while (numext::abs2(m1(i,i))<1e-1) m1(i,i) = internal::random<Scalar>();
+    while (numext::abs2(m1(i,i))<RealScalar(1e-1)) m1(i,i) = internal::random<Scalar>();
 
   Transpose<MatrixType> trm4(m4);
   // test back and forward subsitution with a vector as the rhs
@@ -78,7 +78,7 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
   m3 = m1.template triangularView<Lower>();
   VERIFY(v2.isApprox(m3.conjugate() * (m1.conjugate().template triangularView<Lower>().solve(v2)), largerEps));
 
-  // test back and forward subsitution with a matrix as the rhs
+  // test back and forward substitution with a matrix as the rhs
   m3 = m1.template triangularView<Upper>();
   VERIFY(m2.isApprox(m3.adjoint() * (m1.adjoint().template triangularView<Lower>().solve(m2)), largerEps));
   m3 = m1.template triangularView<Lower>();
@@ -121,6 +121,14 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
   VERIFY_IS_APPROX(m1.template triangularView<Upper>() * m5, m3*m5);
   VERIFY_IS_APPROX(m6*m1.template triangularView<Upper>(), m6*m3);
 
+  m1up = m1.template triangularView<Upper>();
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().template triangularView<Upper>().toDenseMatrix(), m1up);
+  VERIFY_IS_APPROX(m1up.template selfadjointView<Upper>().template triangularView<Upper>().toDenseMatrix(), m1up);
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().template triangularView<Lower>().toDenseMatrix(), m1up.adjoint());
+  VERIFY_IS_APPROX(m1up.template selfadjointView<Upper>().template triangularView<Lower>().toDenseMatrix(), m1up.adjoint());
+
+  VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().diagonal(), m1.diagonal());
+
 }
 
 
diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp
index e2f03ffca..731a08977 100644
--- a/test/unalignedassert.cpp
+++ b/test/unalignedassert.cpp
@@ -94,7 +94,7 @@ template<typename T>
 void construct_at_boundary(int boundary)
 {
   char buf[sizeof(T)+256];
-  size_t _buf = reinterpret_cast<size_t>(buf);
+  size_t _buf = reinterpret_cast<internal::UIntPtr>(buf);
   _buf += (EIGEN_MAX_ALIGN_BYTES - (_buf % EIGEN_MAX_ALIGN_BYTES)); // make 16/32/...-byte aligned
   _buf += boundary; // make exact boundary-aligned
   T *x = ::new(reinterpret_cast<void*>(_buf)) T;
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index ee446c3c1..83c1439ad 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -7,6 +7,14 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifdef EIGEN_TEST_PART_1
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+#define EIGEN_UNALIGNED_VECTORIZE 0
+#endif
+
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
 #undef EIGEN_DEFAULT_TO_ROW_MAJOR
 #endif
@@ -21,7 +29,7 @@ using internal::demangle_unrolling;
 template<typename Dst, typename Src>
 bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 {
-  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar> > traits;
+  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
   bool res = traits::Traversal==traversal;
   if(unrolling==InnerUnrolling+CompleteUnrolling)
     res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling);
@@ -45,7 +53,7 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 template<typename Dst, typename Src>
 bool test_assign(int traversal, int unrolling)
 {
-  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar> > traits;
+  typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
   bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
   if(!res)
   {
@@ -65,7 +73,8 @@ bool test_assign(int traversal, int unrolling)
 template<typename Xpr>
 bool test_redux(const Xpr&, int traversal, int unrolling)
 {
-  typedef internal::redux_traits<internal::scalar_sum_op<typename Xpr::Scalar>,internal::redux_evaluator<Xpr> > traits;
+  typedef typename Xpr::Scalar Scalar;
+  typedef internal::redux_traits<internal::scalar_sum_op<Scalar,Scalar>,internal::redux_evaluator<Xpr> > traits;
   
   bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
   if(!res)
@@ -144,10 +153,16 @@ struct vectorization_logic
       InnerVectorizedTraversal,InnerUnrolling));
 
     VERIFY(test_assign(Matrix44u(),Matrix44()+Matrix44(),
-      LinearTraversal,NoUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal,
+      EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
+
+    VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(),
+      (Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal,
+      CompleteUnrolling));
 
     VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
-      LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+                                : LinearTraversal, CompleteUnrolling));
 
     VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3),
       InnerVectorizedTraversal,CompleteUnrolling));
@@ -158,19 +173,29 @@ struct vectorization_logic
     if(PacketSize>1)
     {
       typedef Matrix<Scalar,3,3,ColMajor> Matrix33c;
+      typedef Matrix<Scalar,3,1,ColMajor> Vector3;
       VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
         LinearTraversal,CompleteUnrolling));
+      VERIFY(test_assign(Vector3(),Vector3()+Vector3(),
+        EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal), CompleteUnrolling));
       VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
-        LinearTraversal,CompleteUnrolling));
+        EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? SliceVectorizedTraversal : LinearTraversal),
+        ((!EIGEN_UNALIGNED_VECTORIZE) && HalfPacketSize==1) ? NoUnrolling : CompleteUnrolling));
 
       VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()),
         LinearVectorizedTraversal,CompleteUnrolling));
 
       VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
-        HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal,NoUnrolling));
+        HalfPacketSize==1             ? InnerVectorizedTraversal  :
+        EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal :
+                                        LinearTraversal,
+        NoUnrolling));
+
+      VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling));
+
 
       VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
-        DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
+        (EIGEN_UNALIGNED_VECTORIZE) ? InnerVectorizedTraversal : DefaultTraversal, CompleteUnrolling|InnerUnrolling));
 
       VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
                          InnerVectorizedTraversal,CompleteUnrolling));
@@ -208,7 +233,7 @@ struct vectorization_logic
     VERIFY((test_assign<
             Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
             Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
-            >(DefaultTraversal,CompleteUnrolling)));
+            >(DefaultTraversal,PacketSize>=8?InnerUnrolling:CompleteUnrolling)));
 
     VERIFY((test_assign(Matrix11(), Matrix<Scalar,PacketSize,EIGEN_PLAIN_ENUM_MIN(2,PacketSize)>()*Matrix<Scalar,EIGEN_PLAIN_ENUM_MIN(2,PacketSize),PacketSize>(),
                         InnerVectorizedTraversal, CompleteUnrolling)));
@@ -270,6 +295,12 @@ struct vectorization_logic_half
       InnerVectorizedTraversal,CompleteUnrolling));
     VERIFY(test_assign(Vector1(),Vector1()+Vector1(),
       InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Vector1().template segment<PacketSize>(0).derived(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),Scalar(2.1)*Vector1()-Vector1(),
+      InnerVectorizedTraversal,CompleteUnrolling));
+    VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment<PacketSize>(0)-Vector1().template segment<PacketSize>(0)).derived(),
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling));
     VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()),
       InnerVectorizedTraversal,CompleteUnrolling));
     VERIFY(test_assign(Vector1(),Vector1().template cast<Scalar>(),
@@ -287,10 +318,11 @@ struct vectorization_logic_half
       InnerVectorizedTraversal,InnerUnrolling));
 
     VERIFY(test_assign(Matrix57u(),Matrix57()+Matrix57(),
-      LinearTraversal,NoUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal,
+      EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
 
     VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
-      LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
         
     if(PacketSize>1)
     {
@@ -298,16 +330,17 @@ struct vectorization_logic_half
       VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
         LinearTraversal,CompleteUnrolling));
       VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
-        LinearTraversal,CompleteUnrolling));
+        EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
               
       VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
         PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
         
       VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
-        LinearTraversal,NoUnrolling));
+        EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,
+        NoUnrolling));
         
       VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
-        DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
+        EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
 
       VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
                          InnerVectorizedTraversal,CompleteUnrolling));
@@ -337,7 +370,7 @@ struct vectorization_logic_half
             >(DefaultTraversal,CompleteUnrolling)));
 
     VERIFY((test_assign(Matrix57(), Matrix<Scalar,5*PacketSize,3>()*Matrix<Scalar,3,7>(),
-                        InnerVectorizedTraversal, CompleteUnrolling)));
+                        InnerVectorizedTraversal, InnerUnrolling|CompleteUnrolling)));
     #endif
   }
 };
@@ -367,19 +400,19 @@ void test_vectorization_logic()
   if(internal::packet_traits<float>::Vectorizable)
   {
     VERIFY(test_assign(Matrix<float,3,3>(),Matrix<float,3,3>()+Matrix<float,3,3>(),
-      LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
       
     VERIFY(test_redux(Matrix<float,5,2>(),
-      DefaultTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling));
   }
   
   if(internal::packet_traits<double>::Vectorizable)
   {
     VERIFY(test_assign(Matrix<double,3,3>(),Matrix<double,3,3>()+Matrix<double,3,3>(),
-      LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
     
     VERIFY(test_redux(Matrix<double,7,3>(),
-      DefaultTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling));
   }
 #endif // EIGEN_VECTORIZE
 
diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp
index 3cc198772..739eacaf3 100644
--- a/test/vectorwiseop.cpp
+++ b/test/vectorwiseop.cpp
@@ -233,10 +233,10 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   Matrix<Scalar,1,MatrixType::RowsAtCompileTime> tmp(rows);
   VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), (MatrixType::RowsAtCompileTime==Dynamic ? 1 : 0));
 
-  m2 = m1.rowwise() - (m1.colwise().sum()/m1.rows()).eval();
-  m1 = m1.rowwise() - (m1.colwise().sum()/m1.rows());
+  m2 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())).eval();
+  m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows()));
   VERIFY_IS_APPROX( m1, m2 );
-  VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/m1.rows()), (MatrixType::RowsAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime!=1 ? 1 : 0) );
+  VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime!=1 ? 1 : 0) );
 }
 
 void test_vectorwiseop()
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index 6d0cf4f9d..631a06014 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -4,6 +4,7 @@ set(Eigen_HEADERS
   ArpackSupport
   AutoDiff
   BVH
+  EulerAngles
   FFT
   IterativeSolvers 
   KroneckerProduct
@@ -17,6 +18,7 @@ set(Eigen_HEADERS
   Polynomials
   Skyline 
   SparseExtra
+  SpecialFunctions
   Splines
   )
 
@@ -25,5 +27,6 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
   )
 
-add_subdirectory(src)
-add_subdirectory(CXX11)
\ No newline at end of file
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index a40bc4715..385ed240c 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -5,4 +5,4 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
   )
 
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 1e97ad3c0..4976a1254 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -15,6 +15,7 @@
 
 #include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
+#include "../SpecialFunctions"
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
 
@@ -60,15 +61,17 @@ typedef unsigned __int64 uint64_t;
 #ifdef EIGEN_USE_GPU
 #include <iostream>
 #include <cuda_runtime.h>
-#if defined(__CUDACC__)
-#include <curand_kernel.h>
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
 #endif
 #endif
 
-
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorDeviceDefault.h"
 #include "src/Tensor/TensorDeviceThreadPool.h"
 #include "src/Tensor/TensorDeviceCuda.h"
@@ -77,13 +80,13 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorDimensions.h"
 #include "src/Tensor/TensorInitializer.h"
 #include "src/Tensor/TensorTraits.h"
-#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorRandom.h"
 #include "src/Tensor/TensorUInt128.h"
 #include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
 
 #include "src/Tensor/TensorBase.h"
 
-#include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorEvaluator.h"
 #include "src/Tensor/TensorExpr.h"
 #include "src/Tensor/TensorReduction.h"
@@ -115,6 +118,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorForcedEval.h"
 #include "src/Tensor/TensorGenerator.h"
 #include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
 
 #include "src/Tensor/TensorExecutor.h"
 #include "src/Tensor/TensorDevice.h"
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_Tensor_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index eeca2f69e..02146527b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions.
 
 As a special case, if you pass no parameter to a reduction operation the
 original tensor is reduced along *all* its dimensions.  The result is a
-one-dimension tensor with a single value.
+scalar, represented as a zero-dimension tensor.
 
     Eigen::Tensor<float, 3> a(2, 3, 4);
     a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
@@ -1112,7 +1112,7 @@ one-dimension tensor with a single value.
                   {19.0f, 18.0f, 17.0f, 16.0f},
                   {20.0f, 21.0f, 22.0f, 23.0f}}});
     // Reduce along all dimensions using the sum() operator.
-    Eigen::Tensor<float, 1> b = a.sum();
+    Eigen::Tensor<float, 0> b = a.sum();
     cout << "b" << endl << b << endl << endl;
     =>
     b
@@ -1168,6 +1168,44 @@ Reduce a tensor using a user-defined reduction operator.  See ```SumReducer```
 in TensorFunctors.h for information on how to implement a reduction operator.
 
 
+## Scan Operations
+
+A *Scan* operation returns a tensor with the same dimensions as the original
+tensor. The operation performs an inclusive scan along the specified
+axis, which means it computes a running total along the axis for a given
+reduction operation.
+If the reduction operation corresponds to summation, then this computes the
+prefix sum of the tensor along the given axis.
+
+Example:
+Inclusive prefix sum (cumsum) of a 2-dimensional tensor along its second dimension.
+
+    // Create a tensor of 2 dimensions
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {4, 5, 6}});
+    // Scan it along the second dimension (1) using summation
+    Eigen::Tensor<int, 2> b = a.cumsum(1);
+    // The result is a tensor with the same size as the input
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    4 5 6
+
+    b
+    1  3  6
+    4  9 15
+
+### <Operation> cumsum(const Index& axis)
+
+Perform a scan by summing consecutive entries.
+
+### <Operation> cumprod(const Index& axis)
+
+Perform a scan by multiplying consecutive entries.
+
+
 ## Convolutions
 
 ### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 759dede3f..1940a9692 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -110,7 +110,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     inline Self& base()             { return *this; }
     inline const Self& base() const { return *this; }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
@@ -150,7 +150,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       return m_storage.data()[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
@@ -190,7 +190,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       return m_storage.data()[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
@@ -257,7 +257,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       return coeff(index);
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
@@ -336,7 +336,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     {
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
         : m_storage(firstDimension, otherDimensions...)
@@ -350,22 +350,22 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     {
       EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
       : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
     {
       EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
       : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
     {
       EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
       : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
     {
       EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
       : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
     {
       EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -418,7 +418,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       return *this;
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     void resize(Index firstDimension, IndexTypes... otherDimensions)
     {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index babafe108..d06f40cd8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -254,6 +254,14 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
 
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double compute_cost = 1.0 +
+        (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+    return m_orig_impl.costPerCoeff(vectorized) +
+           m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+  }
+
  private:
   EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
     if (m_return_dim < 0) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 1a34f3ccc..7a45a5cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -191,6 +191,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_log_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
@@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
     pow(Scalar exponent) const {
-      return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+    real() const {
+      return unaryExpr(internal::scalar_real_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+    imag() const {
+      return unaryExpr(internal::scalar_imag_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
     operator+ (Scalar rhs) const {
-      return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+    operator+ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
     operator- (Scalar rhs) const {
       EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+    operator- (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
     operator* (Scalar rhs) const {
-      return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+    operator* (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
     operator/ (Scalar rhs) const {
-      return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+    operator/ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
@@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_floor_op<Scalar>());
     }
 
-
     // Generic binary operation support.
     template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
@@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors>
 
     // Comparisons and tests.
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
     operator<(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
     operator<=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
     operator>(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
     operator>=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
     operator==(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
     operator!=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
     }
 
     // comparisons and tests for Scalars
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<(Scalar threshold) const {
       return operator<(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<=(Scalar threshold) const {
       return operator<=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>(Scalar threshold) const {
       return operator>(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>=(Scalar threshold) const {
       return operator>=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator==(Scalar threshold) const {
       return operator==(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator!=(Scalar threshold) const {
       return operator!=(constant(threshold));
     }
@@ -453,6 +498,28 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
     }
 
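+    // Scan: cumsum/cumprod/scan each build a TensorScanOp from derived() and `axis`, using SumReducer, ProdReducer or the caller-supplied reducer; `exclusive` is forwarded unchanged.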
+    typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanSumOp
+    cumsum(const Index& axis, bool exclusive = false) const {
+      return TensorScanSumOp(derived(), axis, exclusive);
+    }
+
+    typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanProdOp
+    cumprod(const Index& axis, bool exclusive = false) const {
+      return TensorScanProdOp(derived(), axis, exclusive);
+    }
+
+    template <typename Reducer>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanOp<Reducer, const Derived>
+    scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+      return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
+    }
+
     // Reductions.
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
@@ -676,6 +743,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     slice(const StartIndices& startIndices, const Sizes& sizes) const {
       return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
     }
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                const Derived>(derived(), startIndices, stopIndices, strides);
+    }
     template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorChippingOp<DimId, const Derived>
     chip(const Index offset) const {
@@ -750,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
 
-template<typename Derived>
-class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
  public:
     typedef internal::traits<Derived> DerivedTraits;
     typedef typename DerivedTraits::Scalar Scalar;
@@ -761,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
 
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setZero() {
@@ -780,7 +853,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
       return derived() = this->template random<RandomGenerator>();
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setValues(
         const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
@@ -851,6 +924,19 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
       return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
     }
 
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                const Derived>(derived(), startIndices, stopIndices, strides);
+    }
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                Derived>(derived(), startIndices, stopIndices, strides);
+    }
+
     template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorChippingOp<DimId, const Derived>
     chip(const Index offset) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index c771496e2..5d67f69f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = false,
+    IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
@@ -118,7 +118,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
     // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
     // tensor with N >= 1 of 1 element first and then broadcast.
-    EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const InputDimensions& input_dims = m_impl.dimensions();
     const Broadcast& broadcast = op.broadcast();
     for (int i = 0; i < NumDims; ++i) {
@@ -247,7 +247,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index originalIndex = index;
@@ -299,7 +299,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index originalIndex = index;
@@ -354,11 +354,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     if (NumDims > 0) {
       for (int i = NumDims - 1; i > 0; --i) {
         compute_cost += TensorOpCost::DivCost<Index>();
-        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+        if (internal::index_statically_eq<Broadcast>(i, 1)) {
           compute_cost +=
               TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
         } else {
-          if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
+          if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
             compute_cost += TensorOpCost::MulCost<Index>() +
                             TensorOpCost::ModCost<Index>() +
                             TensorOpCost::AddCost<Index>();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 2742dbb95..1ba7ef170 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -152,8 +152,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
   {
-    // We could also support the case where NumInputDims==1 if needed.
-    EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
     eigen_assert(NumInputDims > m_dim.actualDim());
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -203,7 +202,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
@@ -342,7 +341,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
 
     if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
 	(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 839c6e3e5..59bf90d93 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -128,8 +128,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
     : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
     eigen_assert(0 <= m_axis && m_axis < NumDims);
     const Dimensions& lhs_dims = m_leftImpl.dimensions();
@@ -248,8 +248,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
@@ -344,8 +344,8 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 6f113b903..20b29e5fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
-                                                  typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+                               typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -37,7 +38,7 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
 
   // From NumDims below.
-  static const int NumDimensions = max_n_1<traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size;
+  static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;  // lhs rank + rhs rank - 2 * array_size<Dimensions>
   static const int Layout = traits<LhsXprType>::Layout;
 
   enum {
@@ -65,7 +66,7 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
   typedef Device_ Device;
 
   // From NumDims below.
-  static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size;
+  static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
 };
 
 }  // end namespace internal
@@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
-  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
-                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                                   typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
@@ -140,11 +141,11 @@ struct TensorContractionEvaluatorBase
   static const int RDims =
       internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
   static const int ContractDims = internal::array_size<Indices>::value;
-  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
 
   typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
-  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
 
   typedef DSizes<Index, NumDims> Dimensions;
 
@@ -218,11 +219,9 @@ struct TensorContractionEvaluatorBase
       rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
     }
 
-    m_i_strides[0] = 1;
-    m_j_strides[0] = 1;
-    if(ContractDims) {
-        m_k_strides[0] = 1;
-    }
+    if (m_i_strides.size() > 0) m_i_strides[0] = 1;
+    if (m_j_strides.size() > 0) m_j_strides[0] = 1;
+    if (m_k_strides.size() > 0) m_k_strides[0] = 1;
 
     m_i_size = 1;
     m_j_size = 1;
@@ -318,11 +317,6 @@ struct TensorContractionEvaluatorBase
       }
     }
 
-    // Scalar case. We represent the result as a 1d tensor of size 1.
-    if (LDims + RDims == 2 * ContractDims) {
-      m_dimensions[0] = 1;
-    }
-
     // If the layout is RowMajor, we need to reverse the m_dimensions
     if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
       for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -510,7 +504,7 @@ struct TensorContractionEvaluatorBase
 
           // call gebp (matrix kernel)
           // The parameters here are copied from Eigen's GEMM implementation
-          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
+          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
         }
       }
     }
@@ -607,15 +601,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   static const int ContractDims = internal::array_size<Indices>::value;
 
   typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
-  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
 
-  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
 
   // Could we use NumDimensions here?
   typedef DSizes<Index, NumDims> Dimensions;
 
-
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) { }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 6a3ef14ef..d65dbb40f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
 #undef writeResultShmem
 #undef writeRow
 
-  const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
-  const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
 
   if (threadIdx.x < max_i_write) {
     if (max_j_write == 8) {
@@ -1240,10 +1240,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   typedef array<Index, RDims> right_dim_mapper_t;
 
   typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
-  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
 
-  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
 
   typedef DSizes<Index, NumDims> Dimensions;
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index b27e1a1b4..9b2cb3ff6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -130,19 +130,19 @@ class SimpleTensorContractionMapper {
     }
 
     Index contract_val = left ? col : row;
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx = contract_val / m_k_strides[i];
-      linidx += idx * m_contract_strides[i];
-      contract_val -= idx * m_k_strides[i];
-    }
-
     if(array_size<contract_t>::value > 0) {
-        if (side == Rhs && inner_dim_contiguous) {
-            eigen_assert(m_contract_strides[0] == 1);
-            linidx += contract_val;
-        } else {
-            linidx += contract_val * m_contract_strides[0];
-        }
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx = contract_val / m_k_strides[i];
+        linidx += idx * m_contract_strides[i];
+        contract_val -= idx * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx += contract_val;
+      } else {
+        linidx += contract_val * m_contract_strides[0];
+      }
     }
 
     return linidx;
@@ -153,15 +153,15 @@ class SimpleTensorContractionMapper {
     const bool left = (side == Lhs);
     Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
     Index linidx[2] = {0, 0};
-    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = nocontract_val[0] / m_ij_strides[i];
-      const Index idx1 = nocontract_val[1] / m_ij_strides[i];
-      linidx[0] += idx0 * m_nocontract_strides[i];
-      linidx[1] += idx1 * m_nocontract_strides[i];
-      nocontract_val[0] -= idx0 * m_ij_strides[i];
-      nocontract_val[1] -= idx1 * m_ij_strides[i];
-    }
     if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+        const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+        linidx[0] += idx0 * m_nocontract_strides[i];
+        linidx[1] += idx1 * m_nocontract_strides[i];
+        nocontract_val[0] -= idx0 * m_ij_strides[i];
+        nocontract_val[1] -= idx1 * m_ij_strides[i];
+      }
       if (side == Lhs && inner_dim_contiguous) {
         eigen_assert(m_nocontract_strides[0] == 1);
         linidx[0] += nocontract_val[0];
@@ -173,22 +173,24 @@ class SimpleTensorContractionMapper {
     }
 
     Index contract_val[2] = {left ? col : row, left ? col : row + distance};
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = contract_val[0] / m_k_strides[i];
-      const Index idx1 = contract_val[1] / m_k_strides[i];
-      linidx[0] += idx0 * m_contract_strides[i];
-      linidx[1] += idx1 * m_contract_strides[i];
-      contract_val[0] -= idx0 * m_k_strides[i];
-      contract_val[1] -= idx1 * m_k_strides[i];
-    }
+    if (array_size<contract_t>::value> 0) {
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = contract_val[0] / m_k_strides[i];
+        const Index idx1 = contract_val[1] / m_k_strides[i];
+        linidx[0] += idx0 * m_contract_strides[i];
+        linidx[1] += idx1 * m_contract_strides[i];
+        contract_val[0] -= idx0 * m_k_strides[i];
+        contract_val[1] -= idx1 * m_k_strides[i];
+      }
 
-    if (side == Rhs && inner_dim_contiguous) {
-      eigen_assert(m_contract_strides[0] == 1);
-      linidx[0] += contract_val[0];
-      linidx[1] += contract_val[1];
-    } else {
-      linidx[0] += contract_val[0] * m_contract_strides[0];
-      linidx[1] += contract_val[1] * m_contract_strides[0];
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx[0] += contract_val[0];
+        linidx[1] += contract_val[1];
+      } else {
+        linidx[0] += contract_val[0] * m_contract_strides[0];
+        linidx[1] += contract_val[1] * m_contract_strides[0];
+      }
     }
     return IndexPair<Index>(linidx[0], linidx[1]);
   }
@@ -200,7 +202,7 @@ class SimpleTensorContractionMapper {
     return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
   }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
-    return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+    return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
   }
 
  protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 9044454fd..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -14,6 +14,8 @@
 #ifdef EIGEN_USE_THREADS
 
 namespace Eigen {
+
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
 namespace internal {
 
 template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -52,7 +54,7 @@ struct packRhsAndKernelArg {
 };
 
 }  // end namespace internal
-
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
 
 template<typename Indices, typename LeftArgType, typename RightArgType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -92,10 +94,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   typedef array<Index, RDims> right_dim_mapper_t;
 
   typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
-  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
 
-  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
 
   typedef DSizes<Index, NumDims> Dimensions;
 
@@ -110,6 +112,623 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}
 
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    typedef
+        typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
+            LhsScalar;
+    typedef
+        typename internal::remove_const<typename EvalRightArgType::Scalar>::type
+            RhsScalar;
+    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+        contract_t, internal::packet_traits<LhsScalar>::size,
+        lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+    typedef internal::TensorContractionInputMapper<
+        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+        contract_t, internal::packet_traits<RhsScalar>::size,
+        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    typedef internal::gemm_pack_lhs<LhsScalar, Index,
+                                    typename LhsMapper::SubMapper, Traits::mr,
+                                    Traits::LhsProgress, ColMajor>
+        LhsPacker;
+    typedef internal::gemm_pack_rhs<
+        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+        RhsPacker;
+    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+                                  Traits::mr, Traits::nr, false, false>
+        GebpKernel;
+
+    const Index m = this->m_i_size;
+    const Index n = this->m_j_size;
+    const Index k = this->m_k_size;
+    if (m == 0 || n == 0 || k == 0) return;
+
+    // Compute a set of algorithm parameters:
+    // - kernel block sizes (bm, bn, bk)
+    // - task grain sizes (number of kernels executed per task: gm, gn)
+    // - number of threads
+    // - sharding by row/column
+    // - parallel packing or first lhs then rhs
+    // and some derived parameters:
+    // - number of tasks (nm, nn, nk)
+    // - number of kernels (nm0, nn0)
+    // Unfortunately, all these parameters are tightly interdependent.
+    // So in some cases we first compute approximate values, then compute other
+    // values based on these approximations and then refine the approximations.
+
+    // There are lots of heuristics here. There is some reasoning behind them,
+    // but ultimately they are just tuned on contraction benchmarks for
+    // different input configurations, thread counts and instruction sets.
+    // So feel free to question any of them.
+
+    // Compute whether we want to shard by row or by column.
+    // This is a first approximation, it will be refined later. Since we don't
+    // know number of threads yet we use 2, because what we are most
+    // interested in at this point is whether it makes sense to use
+    // parallelization at all or not.
+    bool shard_by_col = shardByCol(m, n, 2);
+
+    // First approximation of kernel blocking sizes.
+    // Again, we don't know number of threads yet, so we use 2.
+    Index bm, bn, bk;
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByCol>
+          blocking(k, m, n, 2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    } else {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByRow>
+          blocking(k, m, n, 2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
+
+    // Compute optimal number of threads.
+    // Note: we use bk instead of k here because we are interested in amount of
+    // _parallelizable_ computations, and computations are not parallelizable
+    // across k dimension.
+    const TensorOpCost cost =
+        contractionCost(m, n, bm, bn, bk, shard_by_col, false);
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+        static_cast<double>(n) * m, cost, this->m_device.numThreads());
+
+    // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
+    // model is not tuned. Remove this when the cost model is tuned.
+    if (n == 1) num_threads = 1;
+
+    if (num_threads == 1) {
+      // The single-threaded algorithm should be faster in this case.
+      if (n == 1)
+        this->template evalGemv<lhs_inner_dim_contiguous,
+                                rhs_inner_dim_contiguous,
+                                rhs_inner_dim_reordered, Alignment>(buffer);
+      else
+        this->template evalGemm<lhs_inner_dim_contiguous,
+                                rhs_inner_dim_contiguous,
+                                rhs_inner_dim_reordered, Alignment>(buffer);
+      return;
+    }
+
+    // Now that we know number of threads, recalculate sharding and blocking.
+    shard_by_col = shardByCol(m, n, num_threads);
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByCol>
+          blocking(k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    } else {
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+                                          internal::ShardByRow>
+          blocking(k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
+
+    // Number of kernels for each dimension.
+    Index nm0 = divup(m, bm);
+    Index nn0 = divup(n, bn);
+    Index nk = divup(k, bk);
+
+    // Calculate task grain size (number of kernels executed per task).
+    // This task size coarsening serves two purposes:
+    // 1. It reduces per-task overheads including synchronization overheads.
+    // 2. It allows better use of caches (reuse the same packed rhs in several
+    // consecutive kernels).
+    Index gm = 1;
+    Index gn = 1;
+    // If we are sharding by column, then we prefer to reduce rows first.
+    if (shard_by_col) {
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+    } else {
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+    }
+    // Number of tasks in each dimension.
+    Index nm = divup(nm0, gm);
+    Index nn = divup(nn0, gn);
+
+    // Last but not least, decide whether we want to issue both lhs and rhs
+    // packing in parallel; or issue lhs packing first, and then issue rhs
+    // packing when lhs packing completes (for !shard_by_col lhs and rhs are
+    // swapped). Parallel packing allows more parallelism (for both packing and
+    // kernels), while sequential packing provides better locality (once
+    // a thread finishes rhs packing it proceeds to kernels with that rhs).
+    // First, we are interested in parallel packing if there are few tasks.
+    bool parallel_pack = num_threads >= nm * nn;
+    // Also do parallel packing if all data fits into L2$.
+    if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
+        l2CacheSize() * num_threads)
+      parallel_pack = true;
+    // But don't do it if we will use each rhs only once. Locality seems to be
+    // more important in this case.
+    if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
+                  this->m_i_strides, this->m_left_contracting_strides,
+                  this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
+                  this->m_j_strides, this->m_right_contracting_strides,
+                  this->m_k_strides);
+
+    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
+            OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
+                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
+                          shard_by_col, parallel_pack)
+        .run();
+  }
+
+  // Context coordinates a single parallel gemm operation.
+  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
+            typename LhsMapper, typename RhsMapper, typename OutputMapper>
+  class Context {
+   public:
+    Context(const Device& device, int num_threads, LhsMapper& lhs,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
+            Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
+            Index gn, Index nm0, Index nn0, bool shard_by_col,
+            bool parallel_pack)
+        : device_(device),
+          lhs_(lhs),
+          rhs_(rhs),
+          buffer_(buffer),
+          output_(buffer, tm),
+          num_threads_(num_threads),
+          shard_by_col_(shard_by_col),
+          parallel_pack_(parallel_pack),
+          m_(tm),
+          n_(tn),
+          k_(tk),
+          bm_(bm),
+          bn_(bn),
+          bk_(bk),
+          nm_(nm),
+          nn_(nn),
+          nk_(nk),
+          gm_(gm),
+          gn_(gn),
+          nm0_(nm0),
+          nn0_(nn0)
+  {
+      for (Index x = 0; x < P; x++) {
+        // Normal number of notifications for k slice switch is
+        // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
+        // nm_ + nn_ notifications, because they will not receive notifications
+        // from preceding kernels.
+        state_switch_[x] =
+            x == 0
+                ? 1
+                : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
+                      (x == P - 1 ? nm_ * nn_ : 0);
+        state_packing_ready_[x] =
+            parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
+        state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
+        for (Index m = 0; m < nm_; m++) {
+          state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
+          // Kernels generally receive 3 notifications (previous kernel + 2
+          // packing), but the first slice won't get notifications from previous
+          // kernels.
+          for (Index n = 0; n < nn_; n++)
+            state_kernel_[x][m][n].store(
+                (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
+                std::memory_order_relaxed);
+        }
+      }
+
+      // Allocate memory for packed rhs/lhs matrices.
+      size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+      size_t lhs_size =
+          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
+      size_t rhs_size =
+          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
+      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
+          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
+      char* mem = static_cast<char*>(packed_mem_);
+      for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
+        packed_lhs_[x].resize(nm0_);
+        for (Index m = 0; m < nm0_; m++) {
+          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
+          mem += lhs_size;
+        }
+        packed_rhs_[x].resize(nn0_);
+        for (Index n = 0; n < nn0_; n++) {
+          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
+          mem += rhs_size;
+        }
+      }
+    }
+
+    ~Context() {
+      for (Index x = 0; x < P; x++) {
+        for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
+        delete[] state_kernel_[x];
+      }
+      internal::aligned_free(packed_mem_);
+    }
+
+    void run() {
+      // Kick off packing of the first slice.
+      signal_switch(0, 1);
+      // Wait for overall completion.
+      // TODO(dvyukov): this wait can lead to deadlock.
+      // If nthreads contractions are concurrently submitted from worker
+      // threads, this wait will block all worker threads and the system will
+      // deadlock.
+      done_.Wait();
+    }
+
+   private:
+    Notification done_;
+    const Device& device_;
+    LhsMapper& lhs_;
+    RhsMapper& rhs_;
+    Scalar* const buffer_;
+    OutputMapper output_;
+    const int num_threads_;
+    const bool shard_by_col_;
+    const bool parallel_pack_;
+    // Matrix sizes.
+    const Index m_;
+    const Index n_;
+    const Index k_;
+    // Block sizes.
+    const Index bm_;
+    const Index bn_;
+    const Index bk_;
+    // Number of tasks.
+    const Index nm_;
+    const Index nn_;
+    const Index nk_;
+    // Task grain sizes (number of kernels executed per task).
+    const Index gm_;
+    const Index gn_;
+    // Number of blocks (this is different from nm_/nn_ because of task size
+    // coarsening).
+    const Index nm0_;
+    const Index nn0_;
+
+    // Parallelization strategy.
+    //
+    // Blocks related to the same k block can run in parallel because they write
+    // to different output blocks. So we parallelize within k slices, this
+    // gives us parallelism level of m x n. Before we can start any kernels
+    // related to k-th slice, we need to issue m lhs packing tasks and n rhs
+    // packing tasks.
+    //
+    // However, there is a bottleneck when we are finishing kernels for k-th
+    // slice (at the very end there is only 1 runnable kernel). To mitigate this
+    // bottleneck we allow kernels from k-th and k+1-th slices to run in
+    // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+    // output block, so they must not run in parallel.
+    //
+    // This gives us the following dependency graph.
+    // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
+    // packing tasks.
+    // Kernel (m, n, k) can start when:
+    //  - kernel (m, n, k-1) has finished
+    //  - lhs packing (m, k) has finished
+    //  - rhs packing (n, k) has finished
+    // Lhs/rhs packing can start when:
+    //  - all k-1 packing has finished (artificially imposed to limit amount of
+    //  parallel packing)
+    //
+    // On top of that we limit runnable tasks to two consecutive k slices.
+    // This is done to limit amount of memory we need for packed lhs/rhs
+    // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+    //
+    // state_switch_ tracks when we are ready to switch to the next k slice.
+    // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+    // These variables are rolling over 3 consecutive k slices: first two we are
+    // actively executing + one to track completion of kernels in the second
+    // slice.
+    static const Index P = 3;
+    void* packed_mem_;
+    std::vector<LhsScalar*> packed_lhs_[P - 1];
+    std::vector<RhsScalar*> packed_rhs_[P - 1];
+    std::atomic<uint8_t>** state_kernel_[P];
+    // state_switch_ is frequently modified by worker threads, while other
+    // fields are read-only after constructor. Let's move it to a separate cache
+    // line to reduce cache-coherency traffic.
+    char pad_[128];
+    std::atomic<Index> state_packing_ready_[P];
+    std::atomic<Index> state_switch_[P];
+
+    void pack_lhs(Index m, Index k) {
+      const Index mend = m * gm_ + gm(m);
+      for (Index m1 = m * gm_; m1 < mend; m1++)
+        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
+                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+
+      if (!parallel_pack_ && shard_by_col_) {
+        signal_packing(k);
+      } else {
+        signal_switch(k + 1);
+        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+      }
+    }
+
+    void pack_rhs(Index n, Index k) {
+      const Index nend = n * gn_ + gn(n);
+      for (Index n1 = n * gn_; n1 < nend; n1++) {
+        if (k == 0) {
+          // Zero the output memory in parallel.
+          // On 10000x2x10000 mm zeroing can easily take half of time.
+          // Zero (bn x m) row. Safe to do here because all kernels that will
+          // write to this memory depend on completion of this task.
+          // Note: don't call device_.memset() here. device_.memset() blocks on
+          // thread pool worker thread, which can lead to underutilization and
+          // deadlocks.
+          memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+        }
+        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
+                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+      }
+
+      if (parallel_pack_ || shard_by_col_) {
+        signal_switch(k + 1);
+        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+      } else {
+        signal_packing(k);
+      }
+    }
+
+    void kernel(Index m, Index n, Index k) {
+      // Note: order of iteration matters here. Iteration over m is innermost
+      // because we want to reuse the same packed rhs in consecutive tasks
+      // (rhs fits into L2$ while lhs only into L3$).
+      const Index nend = n * gn_ + gn(n);
+      const Index mend = m * gm_ + gm(m);
+      if (shard_by_col_) {
+        for (Index n1 = n * gn_; n1 < nend; n1++) {
+          for (Index m1 = m * gm_; m1 < mend; m1++)
+            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+                         packed_lhs_[k % (P - 1)][m1],
+                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+                         Scalar(1), -1, -1, 0, 0);
+        }
+      } else {
+        for (Index m1 = m * gm_; m1 < mend; m1++)
+          for (Index n1 = n * gn_; n1 < nend; n1++) {
+            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+                         packed_lhs_[k % (P - 1)][m1],
+                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+                         Scalar(1), -1, -1, 0, 0);
+          }
+      }
+      signal_kernel(m, n, k + 1, false);
+      signal_switch(k + 2);
+    }
+
+    void signal_packing(Index k) {
+      eigen_assert(!parallel_pack_);
+      Index s = state_packing_ready_[k % P].fetch_sub(1);
+      eigen_assert(s > 0);
+      if (s != 1) return;
+      state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
+      enqueue_packing(k, shard_by_col_);
+    }
+
+    void signal_kernel(Index m, Index n, Index k, bool sync) {
+      std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
+      Index s = state->load();
+      eigen_assert(s > 0);
+      if (s != 1 && state->fetch_sub(1) != 1) return;
+      state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
+      if (sync)
+        kernel(m, n, k);
+      else
+        device_.enqueueNoNotification([=]() { kernel(m, n, k); });
+    }
+
+    void signal_switch(Index k, Index v = 1) {
+      Index s = state_switch_[k % P].fetch_sub(v);
+      eigen_assert(s >= v);
+      if (s != v) return;
+
+      // Ready to switch to the next k slice.
+      // Reset counter for the next iteration.
+      state_switch_[k % P] =
+          (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
+          nm_ * nn_;
+      if (k < nk_) {
+        // Issue lhs/rhs packing. Their completion will in turn kick off
+        // kernels.
+        if (parallel_pack_) {
+          enqueue_packing(k, !shard_by_col_);
+          enqueue_packing(k, shard_by_col_);
+        } else if (shard_by_col_) {
+          enqueue_packing(k, false);
+        } else {
+          enqueue_packing(k, true);
+        }
+
+        // Termination handling.
+        // Because kernel completion signals k + 2 switch, we need to finish nk
+        // + 2 slices without issuing any tasks on nk + 1 slice. So here we
+        // pretend that all nk + 1 packing tasks just finish instantly; so that
+        // nk + 2 switch only waits for completion of nk kernels.
+      } else if (k == nk_) {
+        signal_switch(k + 1,
+                      parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
+      } else {
+        done_.Notify();
+      }
+    }
+
+    // Enqueue all rhs/lhs packing for k-th slice.
+    void enqueue_packing(Index k, bool rhs) {
+      enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
+    }
+
+    void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
+      if (end - start == 1) {
+        if (rhs)
+          pack_rhs(start, k);
+        else
+          pack_lhs(start, k);
+      } else {
+        Index mid = (start + end) / 2;
+        device_.enqueueNoNotification(
+            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+        device_.enqueueNoNotification(
+            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
+      }
+    }
+
+    // Block sizes with accounting for potentially incomplete last block.
+    Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
+    Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
+    Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
+    // Task grain sizes accounting for potentially incomplete last task.
+    Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
+    Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
+
+    Context(const Context&) = delete;
+    void operator=(const Context&) = delete;
+  };
+
+  // Decide whether we want to shard m x n contraction by columns or by rows.
+  static bool shardByCol(Index m, Index n, Index num_threads) {
+    // Note: we are comparing both n and m against Traits::nr, it is not
+    // a mistake. We are trying to figure out how both n and m will fit into
+    // the main sharding dimension.
+
+    // Sharding by column is the default
+    // ... unless there is enough data for vectorization over rows
+    if (m / num_threads >= Traits::nr &&
+        // and not enough data for vectorization over columns
+        (n / num_threads < Traits::nr ||
+         // ... or barely enough data for vectorization over columns,
+         // but it is not evenly divisible across threads
+         (n / num_threads < 4 * Traits::nr &&
+          (n % (num_threads * Traits::nr)) != 0 &&
+          // ... and it is evenly divisible across threads for rows
+          ((m % (num_threads * Traits::nr)) == 0 ||
+           // .. or it is not evenly divisible for both dimensions but
+           // there is much more data over rows so that corner effects are
+           // mitigated.
+           (m / n >= 6)))))
+      return false;
+    // Also shard by rows if the matrix is substantially elongated along the
+    // row dimension (m much larger than n).
+    if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+    return true;
+  }
+
+  Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
+                 int num_threads, bool shard_by_col) const {
+    Index gm = 1;
+    Index gm1 = 1;
+    Index nm0 = divup(m, bm);
+    Index nm1 = nm0;
+    for (;;) {
+      // Find the next candidate for m grain size. It needs to result in
+      // different number of blocks. E.g. if we have 10 kernels, we want to try
+      // 5 and 10, but not 6, 7, 8 and 9.
+      while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
+      if (gm1 > nm0) break;
+      // Check the candidate.
+      int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
+                           shard_by_col);
+      if (res < 0) break;
+      nm1 = divup(nm0, gm1);
+      if (res == 0) continue;
+      // Commit new grain size.
+      gm = gm1;
+    }
+    return gm;
+  }
+
+  Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+                 int num_threads, bool shard_by_col) const {
+    Index gn = 1;
+    Index gn1 = 1;
+    Index nn0 = divup(n, bn);
+    Index nn1 = nn0;
+    for (;;) {
+      while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
+      if (gn1 > nn0) break;
+      int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
+                           shard_by_col);
+      if (res < 0) break;
+      nn1 = divup(nn0, gn1);
+      if (res == 0) continue;
+      gn = gn1;
+    }
+    return gn;
+  }
+
+  // checkGrain checks whether grain (gm, gn) is suitable and is better than
+  // (oldgm, oldgn).
+  int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+                 Index gn, Index oldgm, Index oldgn, int num_threads,
+                 bool shard_by_col) const {
+    const TensorOpCost cost =
+        contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
+    double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+        static_cast<double>(bm) * gm * bn * gn, cost);
+    // If the task is too small, then we agree on it regardless of anything
+    // else. Otherwise synchronization overheads will dominate.
+    if (taskSize < 1) return 1;
+    // If it is too large, then we reject it and all larger tasks.
+    if (taskSize > 2) return -1;
+    // Now we are in presumably good task size range.
+    // The main deciding factor here is parallelism. Consider that we have 12
+    // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
+    // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
+    // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
+    // us parallelism of 1 (we can load all cores).
+    Index nm0 = divup(m, bm);
+    Index nn0 = divup(n, bn);
+    Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
+    double new_parallelism = static_cast<double>(new_tasks) /
+                             (divup<int>(new_tasks, num_threads) * num_threads);
+    Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+    double old_parallelism = static_cast<double>(old_tasks) /
+                             (divup<int>(old_tasks, num_threads) * num_threads);
+    if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
+    return 0;
+  }
+
+#else  // EIGEN_USE_SIMPLE_THREAD_POOL
+
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
     if (this->m_j_size == 1) {
@@ -376,7 +995,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
         gebp(arg.output.getSubMapper(m_base_start, arg.n),
              (*arg.blockAs)[blockAId], arg.blockB,
-             actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
+             actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
 
         // Notify that the kernel is done.
         const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
@@ -384,6 +1003,47 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       }
     }
   }
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
+
+  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
+                               bool shard_by_col, bool prepacked) const {
+    const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
+                                          PacketType<RhsScalar, Device>::size);
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    const double kd = static_cast<double>(bk);
+    // Peak VFMA bandwidth is 0.5. However, if there is not enough data for
+    // vectorization, bandwidth drops. The 4.0 and 2.0 bandwidth values were
+    // determined experimentally.
+    double computeBandwidth = bk == 1 ? 4.0 :
+          (shard_by_col ? bn : bm) < Traits::nr ||
+          (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+    // However for MULPS/ADDPS we have dependent sequence of 2 such instructions,
+    // so overall bandwidth is 1.0.
+    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+    // Computations.
+    TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
+    // Output stores.
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+    if (prepacked) {
+      // Packing and kernels are executed in different tasks. When we calculate
+      // task grain size we look only at kernel cost assuming that kernel
+      // is more expensive than packing.
+      return cost;
+    }
+    // Lhs/rhs loads + computations.
+    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
+    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
+    // Lhs packing memory cost does not contribute considerably to overall
+    // execution time because lhs is prefetched early and accessed sequentially.
+    if (shard_by_col)
+      lhsCost.dropMemoryCost();
+    else
+      rhsCost.dropMemoryCost();
+    return cost + lhsCost + rhsCost;
+  }
 };
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a2f1f71f5..860a6949a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -164,14 +164,14 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
 };
 
 template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
     impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 };
 
 template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
     return impl.evalSubExprsIfNeeded(data);
   }
 };
@@ -193,7 +193,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
 
   enum {
     IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+    PacketAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
@@ -224,11 +224,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
-    const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
-    PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
-                    SrcCoeffRatio, TgtCoeffRatio> converter(m_impl);
-    return converter.template packet<LoadMode>(index);
+    const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
+        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
+    return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -249,7 +247,31 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
   protected:
-    TensorEvaluator<ArgType, Device> m_impl;
+  template <int LoadMode, bool ActuallyVectorize>
+  struct PacketConv {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      internal::scalar_cast_op<SrcType, TargetType> converter;
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = converter(impl.coeff(index+i));
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template <int LoadMode>
+  struct PacketConv<LoadMode, true> {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+      const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+      PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
+                      SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+      return converter.template packet<LoadMode>(index);
+    }
+  };
+
+  TensorEvaluator<ArgType, Device> m_impl;
 };
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 091007ab7..abdf742c6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -254,7 +254,7 @@ struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, t
 
 
 template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 0f6dcedaa..83c449cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,10 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 
-//#if !defined(EIGEN_USE_GPU)
-//#define EIGEN_USE_COST_MODEL
-//#endif
-
 namespace Eigen {
 
 /** \class TensorEvaluator
@@ -32,45 +28,47 @@ class TensorOpCost {
   // model based on minimal reciprocal throughput numbers from Intel or
   // Agner Fog's tables would be better than what is there now.
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
     return internal::functor_traits<
         internal::scalar_product_op<ArgType, ArgType> >::Cost;
   }
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
     return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
   }
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
     return internal::functor_traits<
         internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
   }
   template <typename ArgType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
     return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
   }
   template <typename SrcType, typename TargetType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
     return internal::functor_traits<
         internal::scalar_cast_op<SrcType, TargetType> >::Cost;
   }
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
       : bytes_loaded_(bytes_loaded),
         bytes_stored_(bytes_stored),
         compute_cycles_(compute_cycles) {}
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
                bool vectorized, double packet_size)
       : bytes_loaded_(bytes_loaded),
         bytes_stored_(bytes_stored),
         compute_cycles_(vectorized ? compute_cycles / packet_size
                                    : compute_cycles) {
-    using std::isfinite;
-    eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
-    eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
-    eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
+    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
@@ -96,21 +94,21 @@ class TensorOpCost {
   }
 
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1d2d162dc..4f5767bc7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
 
 namespace Eigen {
 
+static const int kCudaScratchSize = 1024;
+
 // This defines an interface that GPUDevice can take to use
 // CUDA streams underneath.
 class StreamInterface {
@@ -24,6 +26,15 @@ class StreamInterface {
   // Allocate memory on the actual device where the computation will run
   virtual void* allocate(size_t num_bytes) const = 0;
   virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
@@ -31,7 +42,21 @@ static bool m_devicePropInitialized = false;
 
 static void initializeDeviceProp() {
   if (!m_devicePropInitialized) {
-    if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutexes don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support c++11 we
+    // can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+    static std::atomic<bool> first(true);
+    if (first.exchange(false)) {
+#else
+    static bool first = true;
+    if (first) {
+      first = false;
+#endif
+      // We're the first thread to reach this point.
       int num_devices;
       cudaError_t status = cudaGetDeviceCount(&num_devices);
       if (status != cudaSuccess) {
@@ -52,7 +77,19 @@ static void initializeDeviceProp() {
           assert(status == cudaSuccess);
         }
       }
+
+#if __cplusplus >= 201103L
+      std::atomic_thread_fence(std::memory_order_release);
+#endif
       m_devicePropInitialized = true;
+    } else {
+      // Wait for the other thread to initialize the properties.
+      while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        sleep(1);
+      }
     }
   }
 }
@@ -62,12 +99,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -75,7 +112,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device) {
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -89,6 +126,12 @@ class CudaStreamDevice : public StreamInterface {
     initializeDeviceProp();
   }
 
+  virtual ~CudaStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
   const cudaStream_t& stream() const { return *stream_; }
   const cudaDeviceProp& deviceProperties() const {
     return m_deviceProperties[device_];
@@ -112,9 +155,29 @@ class CudaStreamDevice : public StreamInterface {
     assert(err == cudaSuccess);
   }
 
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
 };
 
 struct GpuDevice {
@@ -131,22 +194,20 @@ struct GpuDevice {
     return stream_->stream();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return stream_->allocate(num_bytes);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     stream_->deallocate(buffer);
+  }
 
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
+  EIGEN_STRONG_INLINE void* scratchpad() const {
+    return stream_->scratchpad();
+  }
+
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+    return stream_->semaphore();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -156,30 +217,22 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -188,21 +241,21 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE size_t numThreads() const {
     // FIXME
     return 32;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     // FIXME
     return 48*1024;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
     // We won't try to take advantage of the l2 cache for the time being, and
     // there is no l3 cache on cuda devices.
     return firstLevelCacheSize();
@@ -222,56 +275,26 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
     return stream_->deviceProperties().multiProcessorCount;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
     return stream_->deviceProperties().maxThreadsPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
     return stream_->deviceProperties().sharedMemPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     return stream_->deviceProperties().major;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
     return stream_->deviceProperties().minor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+  EIGEN_STRONG_INLINE int maxBlocks() const {
     return max_blocks_;
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index c02891465..069680a11 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -14,7 +14,7 @@ namespace Eigen {
 
 // Use the SimpleThreadPool by default. We'll switch to the new non blocking
 // thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
 template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
 typedef NonBlockingThreadPool ThreadPool;
 #else
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
 
@@ -151,9 +151,7 @@ struct ThreadPoolDevice {
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
-    std::function<void()> func =
-      std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
     return n;
   }
 
@@ -161,20 +159,118 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
                                                 Function&& f,
                                                 Args&&... args) const {
-    std::function<void()> func = std::bind(
-        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(
+        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
   }
 
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
-    std::function<void()> func = std::bind(f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(f, args...));
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const {
+    return pool_->CurrentThreadId();
+  }
+
+  // parallelFor executes f with [0, n) arguments in parallel and waits for
+  // completion. F accepts a half-open interval [first, last).
+  // Block size is chosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<Index(Index)> block_align,
+                   std::function<void(Index, Index)> f) const {
+    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    if (n <= 1 || numThreads() == 1 ||
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Calculate block size based on (1) the iteration cost and (2) parallel
+    // efficiency. We want blocks to be not too small to mitigate
+    // parallelization overheads; not too large to mitigate tail
+    // effect and potential load imbalance and we also want number
+    // of blocks to be evenly divisible across threads.
+
+    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
+    const Index max_block_size =
+        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+    Index block_count = divup(n, block_size);
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) /
+        (divup<int>(block_count, numThreads()) * numThreads());
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = divup(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
+      const Index coarser_block_count = divup(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency =
+          static_cast<double>(coarser_block_count) /
+          (divup<int>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
+      }
+    }
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block_count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
+      if (last - first <= block_size) {
+        // Single block or less, execute directly.
+        f(first, last);
+        barrier.Notify();
+        return;
+      }
+      // Split into halves and submit to the pool.
+      Index mid = first + divup((last - first) / 2, block_size) * block_size;
+      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
+      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+    };
+    handleRange(0, n);
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
   }
 
  private:
   ThreadPoolInterface* pool_;
-  size_t num_threads_;
+  int num_threads_;
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
index ca9ac79df..1a30e45fb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
@@ -44,7 +44,7 @@ template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(c
 }
 
 
-#if defined(EIGEN_HAS_CONSTEXPR)
+#if EIGEN_HAS_CONSTEXPR
 template <typename Index, std::size_t Rank>
 struct index_known_statically_impl<DimensionList<Index, Rank> > {
   EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index f0b8ac958..b24cdebf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -29,14 +29,6 @@ namespace Eigen {
   * \sa Tensor
   */
 
-// Can't use std::pair on cuda devices
-template <typename Index> struct IndexPair {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }
-  Index first;
-  Index second;
-};
-
 // Boilerplate code
 namespace internal {
 
@@ -115,7 +107,7 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
     // todo: add assertion
   }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
   explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
     // todo: add assertion
@@ -182,7 +174,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
     return *this;
   }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
   explicit Sizes(std::initializer_list<std::size_t>) {
     // todo: add assertion
@@ -190,13 +182,13 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
 #else
   EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
   }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) {
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
   }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
   }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
   }
-  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+  EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
   }
 #endif
 
@@ -290,31 +282,31 @@ struct DSizes : array<DenseIndex, NumDims> {
     (*this)[0] = i0;
   }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   template<typename... IndexTypes> EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
     EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
   }
 #else
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
     eigen_assert(NumDims == 2);
     (*this)[0] = i0;
     (*this)[1] = i1;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
     eigen_assert(NumDims == 3);
     (*this)[0] = i0;
     (*this)[1] = i1;
     (*this)[2] = i2;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
     eigen_assert(NumDims == 4);
     (*this)[0] = i0;
     (*this)[1] = i1;
     (*this)[2] = i2;
     (*this)[3] = i3;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
     eigen_assert(NumDims == 5);
     (*this)[0] = i0;
     (*this)[1] = i1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index c556fec0f..a08dfa7c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -56,7 +56,7 @@ struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType>
 
 
 template<typename XprType>
-class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> >
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
@@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = true,
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index ae4ce3c90..61c111cec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -129,6 +129,10 @@ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double loadConstant(const double* address) {
   return __ldg(address);
 }
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+Eigen::half loadConstant(const Eigen::half* address) {
+  return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));  // __ldg reads the raw 16-bit member x; the bits are then rewrapped as Eigen::half
+}
 #endif
 }
 
@@ -222,7 +226,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
-      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
 
   typedef typename XprType::Index Index;
@@ -239,13 +243,13 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
-    return m_functor(index);
+    return m_wrapper(m_functor, index);
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    return m_functor.template packetOp<Index, PacketReturnType>(index);
+    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -259,6 +263,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
  private:
   const NullaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
 };
 
 
@@ -399,6 +404,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   TensorEvaluator<RightArgType, Device> m_rightImpl;
 };
 
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{  // Coefficient-wise evaluator: result[i] = functor(arg1[i], arg2[i], arg3[i]).
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+                   internal::functor_traits<TernaryOp>::PacketAccess,
+    Layout = TensorEvaluator<Arg1Type, Device>::Layout,  // layout is taken from the first operand
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+    : m_functor(op.functor()),
+      m_arg1Impl(op.arg1Expression(), device),
+      m_arg2Impl(op.arg2Expression(), device),
+      m_arg3Impl(op.arg3Expression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg2Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg3Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+
+    eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+    return m_arg1Impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_arg1Impl.evalSubExprsIfNeeded(NULL);
+    m_arg2Impl.evalSubExprsIfNeeded(NULL);
+    m_arg3Impl.evalSubExprsIfNeeded(NULL);
+    return true;  // nothing is materialized here; the executor must still run the assignment
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_arg1Impl.cleanup();
+    m_arg2Impl.cleanup();
+    m_arg3Impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+                              m_arg2Impl.template packet<LoadMode>(index),
+                              m_arg3Impl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+    return m_arg1Impl.costPerCoeff(vectorized) +
+           m_arg2Impl.costPerCoeff(vectorized) +
+           m_arg3Impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);  // operand costs plus the functor's own compute cost
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+  const TernaryOp m_functor;
+  TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+  TensorEvaluator<Arg2Type, Device> m_arg2Impl;  // typed on Arg2Type, which may differ from Arg1Type
+  TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
 
 // -------------------- SelectOp --------------------
 
@@ -475,7 +575,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
 
  private:
   TensorEvaluator<IfArgType, Device> m_condImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 5c3d4d630..0cac7b179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true>
     {
       const Index size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      // Manually unroll this loop since compilers don't do it.
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
       const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
       for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        evaluator.evalPacket(i);
-        evaluator.evalPacket(i+PacketSize);
-        evaluator.evalPacket(i+2*PacketSize);
-        evaluator.evalPacket(i+3*PacketSize);
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
       }
       const Index VectorizedSize = (size / PacketSize) * PacketSize;
       for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
@@ -92,24 +93,30 @@ struct EvalRange {
       evaluator.evalScalar(i);
     }
   }
+
+  static Index alignBlockSize(Index size) {
+    return size;
+  }
 };
 
 template <typename Evaluator, typename Index>
 struct EvalRange<Evaluator, Index, true> {
+  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
   static void run(Evaluator* evaluator_in, const Index first, const Index last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
     Index i = first;
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
       Index last_chunk_offset = last - 4 * PacketSize;
-      // Manually unroll this loop since compilers don't do it.
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
+      // unroll the loop at the expense of inlining.
       for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        evaluator.evalPacket(i);
-        evaluator.evalPacket(i+PacketSize);
-        evaluator.evalPacket(i+2*PacketSize);
-        evaluator.evalPacket(i+3*PacketSize);
+        for (Index j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
       }
       last_chunk_offset = last - PacketSize;
       for (; i <= last_chunk_offset; i += PacketSize) {
@@ -120,6 +127,15 @@ struct EvalRange<Evaluator, Index, true> {
       evaluator.evalScalar(i);
     }
   }
+
+  static Index alignBlockSize(Index size) {
+    // Align block size to packet size and account for unrolling in run above.
+    if (size >= 16 * PacketSize) {
+      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);  // round up to a multiple of 4 * PacketSize; the mask trick is only correct when that is a power of two
+    }
+    // Aligning to 4 * PacketSize would increase block size by more than 25%.
+    return (size + PacketSize - 1) & ~(PacketSize - 1);  // round up to a multiple of PacketSize (same power-of-two assumption)
+  }
 };
 
 template <typename Expression, bool Vectorizable>
@@ -133,18 +149,23 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
-      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+                         [&evaluator](Index first, Index last) {
+                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+                         });
+#else
       size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
       if (num_threads > 1) {
         num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
             size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
-#endif
       if (num_threads == 1) {
         EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
       } else {
+        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
         Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
         const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
         const Index numblocks = size / blocksize;
@@ -161,11 +182,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
         }
         barrier.Wait();
       }
+#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
     }
     evaluator.cleanup();
   }
 };
-#endif
+#endif  // EIGEN_USE_THREADS
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
@@ -212,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
   const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const Index step_size = blockDim.x * gridDim.x;
 
-  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
-  // complex types such as evaluators we should really conform to the C++
-  // standard and call a proper copy constructor.
-  Evaluator eval(memcopied_eval);
-
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
   EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
 }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 8491c4ca2..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -218,6 +218,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsX
 };
 
 
+namespace internal {
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+  // Type promotion to handle the case where the types of the args are different.
+  typedef typename result_of<
+      TernaryOp(typename Arg1XprType::Scalar,
+                typename Arg2XprType::Scalar,
+                typename Arg3XprType::Scalar)>::type Scalar;
+  typedef traits<Arg1XprType> XprTraits;
+  typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+  typedef typename traits<Arg1XprType>::Index Index;
+  typedef typename Arg1XprType::Nested Arg1Nested;
+  typedef typename Arg2XprType::Nested Arg2Nested;
+  typedef typename Arg3XprType::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+  typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{  // Expression node holding three operand expressions and the functor to apply to them.
+  public:
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef Scalar CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+        : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const TernaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expressions (one accessor per operand, each typed on its own expression type) */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+    arg1Expression() const { return m_arg1_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+    arg2Expression() const { return m_arg2_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+    arg3Expression() const { return m_arg3_xpr; }
+
+  protected:
+    typename Arg1XprType::Nested m_arg1_xpr;
+    typename Arg2XprType::Nested m_arg2_xpr;  // stored with the second operand's own nested type
+    typename Arg3XprType::Nested m_arg3_xpr;
+    const TernaryOp m_functor;
+};
+
+
 namespace internal {
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
 struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
@@ -252,7 +332,7 @@ struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename e
 
 
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
 {
   public:
     typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index ece2ed91b..08eb5595a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
     for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
-        a[i] = data[i] * std::conj(pos_j_base_powered[i]);
+        a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
       }
       else {
         a[i] = data[i] * pos_j_base_powered[i];
@@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         b[i] = pos_j_base_powered[i];
       }
       else {
-        b[i] = std::conj(pos_j_base_powered[i]);
+        b[i] = numext::conj(pos_j_base_powered[i]);
       }
     }
     for (Index i = n; i < m - n; ++i) {
@@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         b[i] = pos_j_base_powered[m-i];
       }
       else {
-        b[i] = std::conj(pos_j_base_powered[m-i]);
+        b[i] = numext::conj(pos_j_base_powered[m-i]);
       }
     }
 
@@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
     for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
-        data[i] = a[i] * std::conj(pos_j_base_powered[i]);
+        data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
       }
       else {
         data[i] = a[i] * pos_j_base_powered[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index b27ee0084..fcee5f60d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -65,7 +65,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
     inline Self& base()             { return *this; }
     inline const Self& base() const { return *this; }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
     {
@@ -97,7 +97,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
     }
 
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
     {
@@ -128,7 +128,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
       return m_storage.data()[0];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
     {
@@ -213,7 +213,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
       return coeff(index);
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
     {
@@ -309,7 +309,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
     {
     }
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
       : m_storage(other.m_storage)
     {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 7ec757519..c23ecdbc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -55,7 +55,7 @@ struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<X
 
 
 template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> >
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
-    const Index numValues = m_impl.dimensions().TotalSize();
+    const Index numValues =  internal::array_prod(m_impl.dimensions());
     m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
     // Should initialize the memory in case we're dealing with non POD types.
     if (NumTraits<CoeffReturnType>::RequireInitialization) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index a8bd8b888..490ddd8bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType
 template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
 template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
 template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+template<typename Derived, int AccessLevel> class TensorBase;
 
 template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
 template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
 template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
 template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
 template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
 template<typename XprType> class TensorIndexTupleOp;
@@ -42,9 +43,11 @@ template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
 template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
 template<typename Shuffle, typename XprType> class TensorShufflingOp;
 template<typename Strides, typename XprType> class TensorStridingOp;
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
 template<typename Strides, typename XprType> class TensorInflationOp;
 template<typename Generator, typename XprType> class TensorGeneratorOp;
 template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
+template<typename Op, typename XprType> class TensorScanOp;
 
 template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 33cd00391..7164e8d60 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,7 @@ struct scalar_mod_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 
 /** \internal
@@ -38,7 +38,7 @@ struct scalar_mod2_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 template <typename Scalar>
 struct scalar_fmod_op {
@@ -69,7 +69,7 @@ struct scalar_sigmoid_op {
 
   template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Packet packetOp(const Packet& x) const {
-    const Packet one = pset1<Packet>(1);
+    const Packet one = pset1<Packet>(T(1));
     return pdiv(one, padd(one, pexp(pnegate(x))));
   }
 };
@@ -84,14 +84,23 @@ struct functor_traits<scalar_sigmoid_op<T> > {
 };
 
 
+template<typename Reducer, typename Device>
+struct reducer_traits {  // primary template: the values used when no specialization matches Reducer
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
-  static const bool PacketAccess = true;
+  static const bool PacketAccess = packet_traits<T>::HasAdd;
   static const bool IsStateful = false;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -119,16 +128,26 @@ template <typename T> struct SumReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
+
 template <typename T> struct MeanReducer
 {
-  static const bool PacketAccess = !NumTraits<T>::IsInteger;
+  static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
   static const bool IsStateful = true;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
     scalarCount_++;
   }
   template <typename Packet>
@@ -162,9 +181,44 @@ template <typename T> struct MeanReducer
     DenseIndex packetCount_;
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd && !NumTraits<T>::IsInteger  // mirrors MeanReducer<T>::PacketAccess: no packet path for integer T
+  };
+};
+
+
+template <typename T, bool IsMax = true, bool IsInteger = true>  // primary template; also the <T, true, true> case (max, integer T)
+struct MinMaxBottomValue {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return Eigen::NumTraits<T>::lowest();  // initial accumulator for MaxReducer when T is an integer type
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {  // max, non-integer T
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return -Eigen::NumTraits<T>::infinity();  // negative infinity instead of lowest()
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {  // min, integer T
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return Eigen::NumTraits<T>::highest();  // initial accumulator for MinReducer when T is an integer type
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {  // min, non-integer T
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+    return Eigen::NumTraits<T>::infinity();  // positive infinity instead of highest()
+  }
+};
+
+
 template <typename T> struct MaxReducer
 {
-  static const bool PacketAccess = true;
+  static const bool PacketAccess = packet_traits<T>::HasMax;
   static const bool IsStateful = false;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -174,9 +228,8 @@ template <typename T> struct MaxReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmax<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::lowest();
+    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -195,9 +248,18 @@ template <typename T> struct MaxReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMax
+  };
+};
+
+
 template <typename T> struct MinReducer
 {
-  static const bool PacketAccess = true;
+  static const bool PacketAccess = packet_traits<T>::HasMin;
   static const bool IsStateful = false;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -207,9 +269,8 @@ template <typename T> struct MinReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmin<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::highest();
+    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -228,10 +289,18 @@ template <typename T> struct MinReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMin
+  };
+};
+
 
 template <typename T> struct ProdReducer
 {
-  static const bool PacketAccess = true;
+  static const bool PacketAccess = packet_traits<T>::HasMul;
   static const bool IsStateful = false;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -263,6 +332,14 @@ template <typename T> struct ProdReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {  // compile-time properties of ProdReducer<T> for a given Device
+  enum {
+    Cost = NumTraits<T>::MulCost,  // cost estimate reuses the scalar type's MulCost
+    PacketAccess = PacketType<T, Device>::HasMul  // follows the HasMul flag of the Device's packet type for T
+  };
+};
+
 
 struct AndReducer
 {
@@ -280,6 +357,15 @@ struct AndReducer
   }
 };
 
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {  // compile-time properties of AndReducer, identical for every Device
+  enum {
+    Cost = 1,  // fixed cost estimate, independent of any scalar type
+    PacketAccess = false  // packet access is never enabled for this reducer
+  };
+};
+
+
 struct OrReducer {
   static const bool PacketAccess = false;
   static const bool IsStateful = false;
@@ -295,6 +381,15 @@ struct OrReducer {
   }
 };
 
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {  // compile-time properties of OrReducer, identical for every Device
+  enum {
+    Cost = 1,  // fixed cost estimate, independent of any scalar type
+    PacketAccess = false  // same value as OrReducer::PacketAccess
+  };
+};
+
+
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
@@ -312,6 +407,15 @@ template <typename T> struct ArgMaxTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {  // compile-time properties of ArgMaxTupleReducer<T>, identical for every Device
+  enum {
+    Cost = NumTraits<T>::AddCost,  // cost estimate reuses AddCost of T
+    PacketAccess = false  // packet access is never enabled for this reducer
+  };
+};
+
+
 template <typename T> struct ArgMinTupleReducer
 {
   static const bool PacketAccess = false;
@@ -328,457 +432,11 @@ template <typename T> struct ArgMinTupleReducer
   }
 };
 
-
-// Random number generation
-namespace {
-#ifdef __CUDA_ARCH__
-__device__ int get_random_seed() {
-    return clock();
-}
-#else
-int get_random_seed() {
-#ifdef _WIN32
-    SYSTEMTIME st;
-    GetSystemTime(&st);
-    return st.wSecond + 1000 * st.wMilliseconds;
-#elif defined __APPLE__
-    return static_cast<int>(mach_absolute_time());
-#else
-    timespec ts;
-    clock_gettime(CLOCK_REALTIME, &ts);
-    return static_cast<int>(ts.tv_nsec);
-#endif
-}
-#endif
-}
-
-#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
-// We're not compiling a cuda kernel
-template <typename T> class UniformRandomGenerator {
-
- public:
-  static const bool PacketAccess = true;
-
-  UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    if (!deterministic) {
-      srand(get_random_seed());
-    }
-  }
-  UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-  }
-
-  template<typename Index>
-  T operator()(Index) const {
-    return random<T>();
-  }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX T values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = random<T>();
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  bool m_deterministic;
-};
-
-#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
-template <> class UniformRandomGenerator<float> {
- public:
-  static const bool PacketAccess = true;
-
-  UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
-    if (!deterministic) {
-      m_generator->seed(get_random_seed());
-    }
-  }
-  UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
-    m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
-    m_deterministic = other.m_deterministic;
-  }
-  ~UniformRandomGenerator() {
-    delete m_generator;
-  }
-
-  template<typename Index>
-  float operator()(Index) const {
-    return m_distribution(*m_generator);
-  }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX float values[packetSize];
-    for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  UniformRandomGenerator& operator = (const UniformRandomGenerator&);
-  // Make sure m_deterministic comes first to match the layout of the cpu
-  // version of the code.
-  bool m_deterministic;
-  std::mt19937* m_generator;
-  mutable std::uniform_real_distribution<float> m_distribution;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
-  static const bool PacketAccess = true;
-
-  UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
-    if (!deterministic) {
-      m_generator->seed(get_random_seed());
-    }
-  }
-  UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
-    m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
-    m_deterministic = other.m_deterministic;
-  }
-  ~UniformRandomGenerator() {
-    delete m_generator;
-  }
-
-  template<typename Index>
-  double operator()(Index) const {
-    return m_distribution(*m_generator);
-  }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX double values[packetSize];
-    for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  UniformRandomGenerator& operator = (const UniformRandomGenerator&);
-  // Make sure m_deterministic comes first to match the layout of the cpu
-  // version of the code.
-  bool m_deterministic;
-  std::mt19937* m_generator;
-  mutable std::uniform_real_distribution<double> m_distribution;
-};
-#endif
-
-#else
-
-// We're compiling a cuda kernel
-template <typename T> class UniformRandomGenerator;
-
-template <> class UniformRandomGenerator<float> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-     curand_init(seed, tid, 0, &m_state);
-  }
-
-  template<typename Index>
-  __device__ float operator()(Index) const {
-    return curand_uniform(&m_state);
-  }
-  template<typename Index, typename PacketType>
-  __device__ float4 packetOp(Index) const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_uniform4(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  template<typename Index>
-  __device__ double operator()(Index) const {
-    return curand_uniform_double(&m_state);
-  }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_uniform2_double(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<float> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
-    float4 vals = curand_uniform4(&m_state);
-    return std::complex<float>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<double> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
-    double2 vals = curand_uniform2_double(&m_state);
-    return std::complex<double>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<UniformRandomGenerator<Scalar> > {
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
   enum {
-    // Rough estimate.
-    Cost = 100 * NumTraits<Scalar>::MulCost,
-    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
-  };
-};
-
-
-
-#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
-// We're not compiling a cuda kernel
-template <typename T> class NormalRandomGenerator {
- public:
-  static const bool PacketAccess = true;
-
-  NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
-    if (!deterministic) {
-      m_generator->seed(get_random_seed());
-    }
-  }
-  NormalRandomGenerator(const NormalRandomGenerator& other)
-      : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
-    m_generator->seed(other(0) * UINT_MAX);
-  }
-  ~NormalRandomGenerator() {
-    delete m_generator;
-  }
-  template<typename Index>
-  T operator()(Index) const {
-    return m_distribution(*m_generator);
-  }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX T values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = m_distribution(*m_generator);
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  // No assignment
-  NormalRandomGenerator& operator = (const NormalRandomGenerator&);
-
-  bool m_deterministic;
-  mutable std::normal_distribution<T> m_distribution;
-  std::mt19937* m_generator;
-};
-
-#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
-
-// We're compiling a cuda kernel
-template <typename T> class NormalRandomGenerator;
-
-template <> class NormalRandomGenerator<float> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  template<typename Index>
-  __device__ float operator()(Index) const {
-    return curand_normal(&m_state);
-  }
-  template<typename Index, typename PacketType>
-   __device__ float4 packetOp(Index) const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_normal4(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<double> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  template<typename Index>
-  __device__ double operator()(Index) const {
-    return curand_normal_double(&m_state);
-  }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_normal2_double(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<float> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
-    float4 vals = curand_normal4(&m_state);
-    return std::complex<float>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<double> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
-    double2 vals = curand_normal2_double(&m_state);
-    return std::complex<double>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#else
-
-template <typename T> class NormalRandomGenerator {
- public:
-  static const bool PacketAccess = false;
-  NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
-
- private:
-  bool m_deterministic;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<NormalRandomGenerator<Scalar> > {
-  enum {
-    // Rough estimate.
-    Cost = 100 * NumTraits<Scalar>::MulCost,
-    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
   };
 };
 
@@ -797,7 +455,7 @@ class GaussianGenerator {
     }
   }
 
-  T operator()(const array<Index, NumDims>& coordinates) const {
+  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
     T tmp = T(0);
     for (size_t i = 0; i < NumDims; ++i) {
       T offset = coordinates[i] - m_means[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 8ff7d5815..eb1d4934e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -134,7 +134,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a a, \a b, \a x) of the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ * The functor is instantiated on \c XDerived::Scalar, the scalar type of \a x.
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+                         const ADerived, const BDerived, const XDerived>
+    betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+  return TensorCwiseTernaryOp<
+      internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+      const BDerived, const XDerived>(
+      a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 38a833f82..a901c5dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -13,38 +13,61 @@
 namespace Eigen {
 
 namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
-    : significant_decimals_default_impl<std::string, true>
-{};
-}
 
+// Generic case (any Rank without a dedicated specialization): print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {  // nothing is written for an empty tensor
+      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());  // rows = first dimension; all other dimensions are folded into the columns
+      static const int layout = Tensor::Layout;  // the matrix view uses the tensor's own Layout
+      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+      os << matrix;
+    }
+  }
+};
+
+
+// Rank-1 specialization: print the tensor as a vector of total_size coefficients
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {  // nothing is written for an empty tensor
+      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+      os << array;
+    }
+  }
+};
+
+
+// Rank-0 specialization: print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    os << tensor.coeff(0);  // streams the coefficient at index 0; dimensions() and data() are not consulted
+  }
+};
+}
 
 template <typename T>
 std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+  typedef typename Evaluator::Dimensions Dimensions;
+
   // Evaluate the expression if needed
   TensorForcedEvalOp<const T> eval = expr.eval();
-  TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+  Evaluator tensor(eval, DefaultDevice());
   tensor.evalSubExprsIfNeeded(NULL);
 
-  typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
-  typedef typename T::Index Index;
-  typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
-  const Index total_size = internal::array_prod(tensor.dimensions());
-
-  // Print the tensor as a 1d vector or a 2d matrix.
+  // Print the result
   static const int rank = internal::array_size<Dimensions>::value;
-  if (rank == 0) {
-    os << tensor.coeff(0);
-  } else if (rank == 1) {
-    Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
-    os << array;
-  } else {
-    const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
-    static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
-    Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
-    os << matrix;
-  }
+  internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
 
   // Cleanup.
   tensor.cleanup();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index bafcc67bd..566856ed2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -174,7 +174,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
-    EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
     m_paddingValue = op.padding_value();
 
@@ -362,7 +362,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 985594bc8..3209fecd3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -10,7 +10,8 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
 #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
 
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
+
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
 
 #define EIGEN_HAS_INDEX_LIST
 
@@ -45,6 +46,24 @@ struct type2index {
   }
 };
 
+// This can be used with IndexPairList to get compile-time constant pairs,
+// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
+template <DenseIndex f, DenseIndex s>
+struct type2indexpair {
+  static const DenseIndex first = f;
+  static const DenseIndex second = s;
+
+  constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {  // implicit conversion to the pair (f, s)
+    return IndexPair<DenseIndex>(f, s);
+  }
+
+  EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {  // stores nothing: only asserts that val equals (f, s)
+    eigen_assert(val.first == f);
+    eigen_assert(val.second == s);
+  }
+};
+
+
 template<DenseIndex n> struct NumTraits<type2index<n> >
 {
   typedef DenseIndex Real;
@@ -72,6 +91,16 @@ EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
   val.set(new_val);
 }
 
+template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {  // generic element: plain assignment
+  val = new_val;
+}
+template <DenseIndex f, DenseIndex s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {  // compile-time pair: nothing to store
+  val.set(new_val);  // type2indexpair::set() only asserts that new_val matches (f, s)
+}
+
+
 template <typename T>
 struct is_compile_time_constant {
   static constexpr bool value = false;
@@ -94,7 +123,22 @@ struct is_compile_time_constant<const type2index<idx>& > {
   static constexpr bool value = true;
 };
 
-
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s> > {  // type2indexpair is reported as a compile-time constant
+  static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s> > {  // same for the const-qualified type
+  static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s>& > {  // same for an lvalue reference
+  static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s>& > {  // same for a const lvalue reference
+  static constexpr bool value = true;
+};
 
 
 template<typename... T>
@@ -184,31 +228,32 @@ template <typename T, typename... O>
 
 
 
-template <DenseIndex Idx>
+template <DenseIndex Idx, typename ValueT>
 struct tuple_coeff {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) {
-    return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
+    // Select with ?: instead of multiplying by (i == Idx), so that ValueT does not have to be an arithmetic type.
+    return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
     if (i == Idx) {
       update_value(array_get<Idx>(t), value);
     } else {
-      tuple_coeff<Idx-1>::set(i, t, value);
+      tuple_coeff<Idx-1, ValueT>::set(i, t, value);
     }
   }
 
   template <typename... T>
   EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
     return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
-        tuple_coeff<Idx-1>::value_known_statically(i, t);
+        tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
   }
 
   template <typename... T>
   EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
     return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
-        tuple_coeff<Idx-1>::values_up_to_known_statically(t);
+        tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
   }
 
   template <typename... T>
@@ -216,19 +261,19 @@ struct tuple_coeff {
     return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
            is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
            array_get<Idx>(t) > array_get<Idx-1>(t) &&
-           tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t);
+           tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
   }
 };
 
-template <>
-struct tuple_coeff<0> {
+template <typename ValueT>
+struct tuple_coeff<0, ValueT> {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
     //  eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
-    return array_get<0>(t) * (i == 0);
+    return array_get<0>(t)/* * (i == 0)*/;
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
     eigen_assert (i == 0);
     update_value(array_get<0>(t), value);
   }
@@ -254,13 +299,13 @@ struct tuple_coeff<0> {
 template<typename FirstType, typename... OtherTypes>
 struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
   }
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
   }
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
   }
 
   EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
@@ -268,14 +313,14 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
   EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
   EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
   }
   EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
   }
 
   EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
   }
 };
 
@@ -286,6 +331,23 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot
 }
 
 
+template<typename FirstType, typename... OtherTypes>
+struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {  // index tuple whose entries are read and written as IndexPair<DenseIndex>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
+  }
+  // Construct from an existing IndexTuple of the same entry types, or default-construct every entry.
+  EIGEN_DEVICE_FUNC  constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+  EIGEN_DEVICE_FUNC  constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
+  // Delegates to tuple_coeff with DenseIndex (not IndexPair<DenseIndex> as in get/set). NOTE(review): confirm the value type is irrelevant for this query.
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  }
+};
+
 namespace internal {
 
 template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
@@ -303,6 +365,13 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind
   static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
 };
 
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
+  static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;  // number of entries: 1 + sizeof...(OtherTypes)
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
+  static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;  // same count for the const-qualified list type
+};
+
 template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
   return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
@@ -472,6 +541,57 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
   }
 };
 
+// index_pair_first_statically_eq_impl<T>::run(i, value): is the `first` member of entry i known to equal value?
+// The primary template always answers false; the IndexPairList specializations query a default-constructed list.
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &  // bitwise &: both operands are always evaluated
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {  // same test for the const-qualified list type
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+  }
+};
+
+// index_pair_second_statically_eq_impl<T>::run(i, value): is the `second` member of entry i known to equal value?
+// The primary template always answers false; the IndexPairList specializations query a default-constructed list.
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+    return false;
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &  // bitwise &: both operands are always evaluated
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {  // same test for the const-qualified list type
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+        (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+  }
+};
+
+
 }  // end namespace internal
 }  // end namespace Eigen
 
@@ -482,53 +602,69 @@ namespace internal {
 
 template <typename T>
 struct index_known_statically_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
     return false;
   }
 };
 
 template <typename T>
 struct all_indices_known_statically_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
     return false;
   }
 };
 
 template <typename T>
 struct indices_statically_known_to_increase_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_eq_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_ne_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_gt_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_lt_impl {
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {  // fallback definition; NOTE(review): presumably compiled only when the IndexPairList-aware version is not - confirm the enclosing #if
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;  // nothing about `first` is reported as statically known here
+  }
+};
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {  // fallback counterpart for the `second` member
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+    return false;  // nothing about `second` is reported as statically known here
+  }
+};
+
+
+
 }  // end namespace internal
 }  // end namespace Eigen
 
@@ -572,6 +708,16 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i,
   return index_statically_lt_impl<T>::run(i, value);
 }
 
+template <typename T>  // T: index container type used to select the _impl specialization
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
+  return index_pair_first_statically_eq_impl<T>::run(i, value);  // thin forwarder; the answer comes from the impl struct
+}
+
+template <typename T>  // T: index container type used to select the _impl specialization
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
+  return index_pair_second_statically_eq_impl<T>::run(i, value);  // thin forwarder; the answer comes from the impl struct
+}
+
 }  // end namespace internal
 }  // end namespace Eigen
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index de2f67d74..f391fb9ee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -189,7 +189,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
index 2d223140e..33edc49e3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
 #define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
 
 #include <initializer_list>
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 33c6c1b0f..ede3939c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -29,25 +29,47 @@ namespace Eigen {
 namespace internal {
 
 namespace {
+
   // Note: result is undefined if val == 0
   template <typename T>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val)
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
   {
 #ifdef __CUDA_ARCH__
-    return (sizeof(T) == 8) ? __clzll(val) : __clz(val);
+    return __clz(val);
 #elif EIGEN_COMP_MSVC
-	unsigned long index;
-	if (sizeof(T) == 8) {
-      _BitScanReverse64(&index, val);
-    } else {
-      _BitScanReverse(&index, val);
-    }
-    return (sizeof(T) == 8) ? 63 - index : 31 - index;
+    unsigned long index;
+    _BitScanReverse(&index, val);
+    return 31 - index;
 #else
     EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return (sizeof(T) == 8) ?
-      __builtin_clzll(static_cast<uint64_t>(val)) :
-      __builtin_clz(static_cast<uint32_t>(val));
+    return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+  }
+  // 64-bit overload, selected when sizeof(T) == 8. As above, the result is undefined if val == 0.
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
+  {
+#ifdef __CUDA_ARCH__
+    return __clzll(val);
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+    unsigned long index;
+    _BitScanReverse64(&index, val);
+    return 63 - index;  // index is the bit position of the highest set bit
+#elif EIGEN_COMP_MSVC
+    // MSVC's _BitScanReverse64 is not available for 32bits builds.
+    unsigned int lo = (unsigned int)(val&0xffffffff);
+    unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
+    int n;
+    if(hi==0)
+      n = 32 + count_leading_zeros<unsigned int>(lo);  // upper word is empty: 32 zeros plus those of the lower word
+    else
+      n = count_leading_zeros<unsigned int>(hi);  // highest set bit lives in the upper word
+    return n;
+#else
+    EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return __builtin_clzll(static_cast<uint64_t>(val));
 #endif
   }
 
@@ -98,7 +120,9 @@ namespace {
       return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
       const uint64_t shift = 1ULL << log_div;
-      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
+                                               - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+                                               + TensorUInt128<static_val<0>, static_val<1> >(1);
       return static_cast<uint64_t>(result);
 #endif
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index 8ed71f838..ee0078bbc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -28,7 +28,7 @@
 
 // SFINAE requires variadic templates
 #ifndef __CUDACC__
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   // SFINAE doesn't work for gcc <= 4.7
   #ifdef EIGEN_COMP_GNUC
     #if EIGEN_GNUC_AT_LEAST(4,8)
@@ -44,7 +44,7 @@
     typename internal::enable_if< ( __condition__ ) , int >::type = 0
 
 
-#if defined(EIGEN_HAS_CONSTEXPR)
+#if EIGEN_HAS_CONSTEXPR
 #define EIGEN_CONSTEXPR constexpr
 #else
 #define EIGEN_CONSTEXPR
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 9ebd9172b..6fb4f4a31 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -57,7 +57,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
       EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
@@ -140,7 +140,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
       return m_data[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
@@ -227,7 +227,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
       return m_data[index];
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index cd04716bd..fdb5ee6b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -47,22 +47,39 @@ template <> struct max_n_1<0> {
 
 // Default packet types
 template <typename Scalar, typename Device>
-struct PacketType {
+struct PacketType : internal::packet_traits<Scalar> {
   typedef typename internal::packet_traits<Scalar>::type type;
-  enum { size = internal::unpacket_traits<type>::size };
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
 template <>
-struct PacketType<float, GpuDevice> {
-  typedef float4 type;
-  static const int size = 4;
-};
-template <>
-struct PacketType<double, GpuDevice> {
-  typedef double2 type;
+struct PacketType<half, GpuDevice> {
+  typedef half2 type;
   static const int size = 2;
+  enum {
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasArg    = 0,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasBlend  = 0,
+
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasLog    = 1,
+    HasLog1p  = 0,
+    HasLog10  = 0,
+    HasPow    = 1,
+  };
 };
 #endif
 
@@ -112,6 +129,20 @@ bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
 }
 
 
+// Device-usable (first, second) pair of indices; std::pair can't be used on CUDA devices.
+template <typename Idx> struct IndexPair {
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}  // both members start at 0
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+  // Overwrites both members with those of val (val is taken by value).
+  EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
+    first = val.first;
+    second = val.second;
+  }
+
+  Idx first;
+  Idx second;
+};
+
 
 #ifdef EIGEN_HAS_SFINAE
 namespace internal {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index bfa65a607..d34f1e328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
 
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
@@ -409,7 +409,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
 
     Index inputIndices[] = {0, 0};
@@ -603,6 +603,286 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
 };
 
 
+// Expression traits, eval type and nesting type for TensorStridingSlicingOp.
+namespace internal {
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = array_size<StartIndices>::value;  // rank is taken from the StartIndices type
+  static const int Layout = XprTraits::Layout;  // same layout as the sliced expression
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
+{
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;  // evaluated by const reference
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type>
+{
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;  // nested by value
+};
+
+}  // end namespace internal
+
+// Expression node for a strided slice of an expression, described by start indices, stop indices and strides.
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
+{
+  public:
+  typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
+  typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
+    const XprType& expr, const StartIndices& startIndices,
+    const StopIndices& stopIndices, const Strides& strides)
+      : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices),
+        m_strides(strides) {}
+    // Each accessor returns a reference to the stored member, typed as that member is declared.
+    EIGEN_DEVICE_FUNC
+    const StartIndices& startIndices() const { return m_startIndices; }
+    EIGEN_DEVICE_FUNC
+    const StopIndices& stopIndices() const { return m_stopIndices; }
+    EIGEN_DEVICE_FUNC
+    const Strides& strides() const { return m_strides; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
+    // Assignment is evaluated immediately through a TensorExecutor running on a DefaultDevice.
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
+    {
+      typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(
+          assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(
+          assign, DefaultDevice());
+      return *this;
+    }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const StartIndices m_startIndices;
+    const StopIndices m_stopIndices;
+    const Strides m_strides;
+};
+
+// Eval as rvalue: every output coefficient is read from the argument at the index computed by srcCoeff().
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+  static const int NumDims = internal::array_size<Strides>::value;
+
+  enum {
+    // Alignment can't be guaranteed at compile time since it depends on the
+    // slice offsets and sizes.
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+  {
+    // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
+    DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
+    for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
+      if(m_strides[i]>0){
+        startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+      }else{
+        /* implies m_strides[i]<0 by assert */
+        startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+      }
+      m_startIndices[i] = startIndicesClamped[i];
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    // check for degenerate intervals and compute output tensor shape
+    bool degenerate = false;
+    for(int i = 0; i < NumDims; i++){
+      Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
+      if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
+        m_dimensions[i] = 0;
+        degenerate = true;
+      }else{
+        m_dimensions[i] = interval / m_strides[i]  // interval and stride share a sign here, so this rounds up
+                          + (interval % m_strides[i] != 0 ? 1 : 0);
+        eigen_assert(m_dimensions[i] >= 0);
+      }
+    }
+    Strides output_dims = m_dimensions;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = m_strides[0];
+      m_offsets[0] = startIndicesClamped[0];
+      Index previousDimProduct = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        previousDimProduct *= input_dims[i-1];
+        m_inputStrides[i] = previousDimProduct * m_strides[i];
+        m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+      }
+
+      // m_fastOutputStrides[0] stays default-constructed although srcCoeff() divides by it at i == 0. NOTE(review): presumably a default TensorIntDivisor divides by 1 - confirm.
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+      }
+    } else {
+      m_inputStrides[NumDims-1] = m_strides[NumDims-1];
+      m_offsets[NumDims-1] = startIndicesClamped[NumDims-1];
+      Index previousDimProduct = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        previousDimProduct *= input_dims[i+1];
+        m_inputStrides[i] = previousDimProduct * m_strides[i];
+        m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+      }
+      // Likewise, m_fastOutputStrides[NumDims-1] is never assigned in this branch.
+      m_outputStrides[NumDims-1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+      }
+    }
+    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
+                                          device.lastLevelCacheSize() /
+                                          sizeof(Scalar));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Strides Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  // Forwards to the argument evaluator (passing NULL) and always returns true.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+    return NULL;
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+  {
+    Index inputIndex = 0;  // accumulates the linear index into the argument that is handed to m_impl
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i >= 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+        index -= idx * m_outputStrides[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+        index -= idx * m_outputStrides[i];
+      }
+    }
+    return inputIndex;
+  }
+  // Clamps value into [min, max]; marked EIGEN_DEVICE_FUNC because the EIGEN_DEVICE_FUNC constructor calls it.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+    return numext::maxi(min, numext::mini(max,value));
+  }
+
+  array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device& m_device;
+  DSizes<Index, NumDims> m_startIndices; // clamped startIndices
+  DSizes<Index, NumDims> m_dimensions;
+  DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
+  const Strides m_strides;
+  std::size_t m_block_total_size_max;
+};
+
+// Eval as lvalue: inherits the rvalue evaluator and adds coeffRef() so the slice can be written to.
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+  : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+  typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+  static const int NumDims = internal::array_size<Strides>::value;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)  // all index bookkeeping is set up by the rvalue evaluator's constructor
+    { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Strides Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+  {
+    return this->m_impl.coeffRef(this->srcCoeff(index));  // translate the output index, then hand out the argument's reference
+  }
+};
+
+
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 88b838b27..647bcf108 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = false,
+    IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = true,
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
     // of 1 element first and then pad.
-    EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
     // Compute dimensions
     m_dimensions = m_impl.dimensions();
@@ -150,27 +150,26 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
-        if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+        if (isPaddingAtIndexForDim(idx, i)) {
           return m_paddingValue;
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
-      if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) {
+      if (isPaddingAtIndexForDim(index, 0)) {
         return m_paddingValue;
       }
       inputIndex += (index - m_padding[0].first);
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
-        if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+        if (isPaddingAtIndexForDim(idx, i)) {
           return m_paddingValue;
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i+1];
       }
-      if (index < m_padding[NumDims-1].first ||
-          index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
+      if (isPaddingAtIndexForDim(index, NumDims-1)) {
         return m_paddingValue;
       }
       inputIndex += (index - m_padding[NumDims-1].first);
@@ -187,43 +186,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     return packetRowMajor(index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    Index inputIndex;
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      {
-        const Index idx = coords[0];
-        if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
-          return m_paddingValue;
-        }
-        inputIndex = idx - m_padding[0].first;
-      }
-      for (int i = 1; i < NumDims; ++i) {
-        const Index idx = coords[i];
-        if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return m_paddingValue;
-        }
-        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
-      }
-    } else {
-      {
-        const Index idx = coords[NumDims-1];
-        if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
-          return m_paddingValue;
-        }
-        inputIndex = idx - m_padding[NumDims-1].first;
-      }
-      for (int i = NumDims - 2; i >= 0; --i) {
-        const Index idx = coords[i];
-        if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return m_paddingValue;
-        }
-        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
-      }
-    }
-    return m_impl.coeff(inputIndex);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     TensorOpCost cost = m_impl.costPerCoeff(vectorized);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -239,6 +201,40 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  private:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
+      Index index, int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index < m_padding[dim_index].first) ||
+        (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+         index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#else
+    return (index < m_padding[dim_index].first) ||
+           (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
+      int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+    EIGEN_UNUSED_VARIABLE(dim_index);
+    return false;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
+      int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+    EIGEN_UNUSED_VARIABLE(dim_index);
+    return false;
+#endif
+  }
+
+
   void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
     const double in = static_cast<double>(m_impl.dimensions()[i]);
     const double out = in + m_padding[i].first + m_padding[i].second;
@@ -261,7 +257,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index initialIndex = index;
@@ -273,15 +269,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
       const Index lastPaddedRight = m_outputStrides[i+1];
 
-      if (last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -299,15 +295,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
     const Index lastPaddedRight = m_outputStrides[1];
 
-    if (last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[0].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -318,7 +314,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     const Index initialIndex = index;
@@ -331,15 +327,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
       const Index lastPaddedRight = m_outputStrides[i];
 
-      if (last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i+1];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -357,15 +353,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
     const Index lastPaddedRight = m_outputStrides[NumDims-1];
 
-    if (last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[NumDims-1].first);
       return m_impl.template packet<Unaligned>(inputIndex);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index a87e45330..886a254f6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -184,7 +184,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 000000000..1655a813e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
+#ifdef __CUDA_ARCH__
+  // We don't support 3d kernels since we currently only use 1 and
+  // 2d kernels.
+  assert(threadIdx.z == 0);
+  return clock64() +
+      blockIdx.x * blockDim.x + threadIdx.x +
+      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+
+#elif defined _WIN32
+  // Use the current time as a baseline.
+  SYSTEMTIME st;
+  GetSystemTime(&st);
+  int time = st.wSecond + 1000 * st.wMilliseconds;
+  // Mix in a random number to make sure that we get different seeds if
+  // we try to generate seeds faster than the clock resolution.
+  // We need 2 random values since the generator only generates 16 bits at
+  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
+  int rnd1 = ::rand();
+  int rnd2 = ::rand();
+  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
+  return rnd;
+
+#elif defined __APPLE__
+  // Same approach as for win32, except that the random number generator
+  // is better (https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
+  uint64_t rnd = ::random() ^ mach_absolute_time();
+  return rnd;
+
+#else
+  // Augment the current time with pseudo random number generation
+  // to ensure that we get different seeds if we try to generate seeds
+  // faster than the clock resolution.
+  timespec ts;
+  clock_gettime(CLOCK_REALTIME, &ts);
+  uint64_t rnd = ::random() ^ ts.tv_nsec;
+  return rnd;
+#endif
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+  // TODO: Unify with the implementation in the non blocking thread pool.
+  uint64_t current = *state;
+  // Update the internal state
+  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+  // Generate the random output (using the PCG-XSH-RS scheme)
+  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+  seed = seed ? seed : get_random_seed();
+  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+}  // namespace
+
+// Generic case: draws one generator output and converts it to T with static_cast.
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeUniform(uint64_t* state) {
+  unsigned rnd = PCG_XSH_RS_generator(state);
+  return static_cast<T>(rnd);
+}
+
+// Eigen::half case: random mantissa bits under a fixed exponent field, minus 1.
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+  Eigen::half result;
+  // Generate 10 random bits for the mantissa
+  unsigned rnd = PCG_XSH_RS_generator(state);
+  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
+  // Set the exponent
+  result.x |= (static_cast<uint16_t>(15) << 10);
+  // Return the final result
+  return result - Eigen::half(1.0f);
+}
+
+// float case: random mantissa bits under a fixed exponent field, minus 1.0f.
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float RandomToTypeUniform<float>(uint64_t* state) {
+  typedef union {
+    uint32_t raw;
+    float fp;
+  } internal;
+  internal result;
+  // Generate 23 random bits for the mantissa
+  const unsigned rnd = PCG_XSH_RS_generator(state);
+  result.raw = rnd & 0x7fffffu;
+  // Set the exponent
+  result.raw |= (static_cast<uint32_t>(127) << 23);
+  // Return the final result
+  return result.fp - 1.0f;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double RandomToTypeUniform<double>(uint64_t* state) {
+  typedef union {
+    uint64_t raw;
+    double dp;
+  } internal;
+  internal result;
+  result.raw = 0;
+  // Generate 52 random bits for the mantissa
+  // First generate the upper 20 bits
+  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+  // Then generate the lower 32 bits
+  unsigned rnd2 = PCG_XSH_RS_generator(state);
+  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+  // Set the exponent
+  result.raw |= (static_cast<uint64_t>(1023) << 52);
+  // Return the final result
+  return result.dp - 1.0;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeUniform<float>(state),
+                             RandomToTypeUniform<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeUniform<double>(state),
+                              RandomToTypeUniform<double>(state));
+}
+
+template <typename T> class UniformRandomGenerator {
+ public:
+  static const bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+      uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+      const UniformRandomGenerator& other) {
+    m_state = other.m_state;
+  }
+  // Draws one value from the stored state offset by i, then stores the advanced state back.
+  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T operator()(Index i) const {
+    uint64_t local_state = m_state + i;
+    T result = RandomToTypeUniform<T>(&local_state);
+    m_state = local_state;
+    return result;
+  }
+  // Draws packetSize values the same way and returns them loaded as one Packet.
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+    uint64_t local_state = m_state + i;
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeUniform<T>(&local_state);
+    }
+    m_state = local_state;
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+  enum {
+    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+    Cost = 12 * NumTraits<Scalar>::AddCost *
+           ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeNormal(uint64_t* state) {
+  // Use the ratio-of-uniforms method to generate numbers following a normal
+  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
+  // details.
+  T u, v, q;
+  do {
+    u = RandomToTypeUniform<T>(state);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+    const T x = u - T(0.449871);
+    const T y = numext::abs(v) + T(0.386595);
+    q = x*x + y * (T(0.196)*y - T(0.25472)*x);
+  } while (q > T(0.27597) &&
+           (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
+
+  return v/u;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeNormal<float>(state),
+                             RandomToTypeNormal<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeNormal<double>(state),
+                              RandomToTypeNormal<double>(state));
+}
+
+
+template <typename T> class NormalRandomGenerator {
+ public:
+  static const bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
+      const NormalRandomGenerator& other) {
+    m_state = other.m_state;
+  }
+  // Draws one value from the stored state offset by i, then stores the advanced state back.
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T operator()(Index i) const {
+    uint64_t local_state = m_state + i;
+    T result = RandomToTypeNormal<T>(&local_state);
+    m_state = local_state;
+    return result;
+  }
+  // Draws packetSize values the same way and returns them loaded as one Packet.
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+    uint64_t local_state = m_state + i;
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeNormal<T>(&local_state);
+    }
+    m_state = local_state;
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+};
+
+// Cost estimate and packet-access flag for NormalRandomGenerator.
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+  enum {
+    // On average, we need to generate about 3 random numbers, plus
+    // 15 mul (charged at MulCost), 8 add (charged at AddCost) and 1.5 logs.
+    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
+           15 * NumTraits<Scalar>::MulCost + 8 * NumTraits<Scalar>::AddCost +
+           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 885295f0a..a87777b22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -87,7 +87,7 @@ struct preserve_inner_most_dims {
   static const bool value = false;
 };
 
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
 template <typename ReducedDims, int NumTensorDims>
 struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
   static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
@@ -122,7 +122,7 @@ struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
 template <int DimIndex, typename Self, typename Op>
 struct GenericDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
-    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
       GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -183,7 +183,7 @@ struct InnerMostDimPreserver {
 template <int DimIndex, typename Self, typename Op>
 struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
-    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
       InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -248,16 +248,12 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-#ifdef EIGEN_USE_COST_MODEL
     const TensorOpCost cost =
         self.m_impl.costPerCoeff(Vectorizable) +
         TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                      PacketSize);
     const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         num_coeffs, cost, device.numThreads());
-#else
-    const int num_threads = device.numThreads();
-#endif
     if (num_threads == 1) {
       *output =
           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -268,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    Barrier barrier(numblocks);
+    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
     MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
       device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
@@ -320,7 +316,18 @@ struct OuterReducer {
 
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename S, typename R, typename I>
+__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+
+#endif
 
 template <int NPT, typename S, typename R, typename I>
 __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
@@ -396,7 +403,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
   {
-    EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                         YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -464,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
-    return total_size > 1024 * 1024;
-#else
-    return true || total_size;
-#endif
-  }
-
   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
-    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+    if (RunningFullReduction &&
+        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-         (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+         !RunningOnGPU)) {
       bool need_assign = false;
       if (!data) {
         m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
@@ -493,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
     }
 
     // Attempt to use an optimized reduction.
-    else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
+    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
       bool reducing_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -506,8 +505,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
           (reducing_inner_dims || ReducingInnerMostDims)) {
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
+        }
         Op reducer(m_reducer);
-        return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
       }
 
       bool preserving_inner_dims = true;
@@ -522,8 +538,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
           preserving_inner_dims) {
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
+        }
         Op reducer(m_reducer);
-        return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
       }
     }
     return true;
@@ -533,13 +566,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
     m_impl.cleanup();
     if (m_result) {
       m_device.deallocate(m_result);
+      m_result = NULL;
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if (RunningFullReduction && m_result) {
-      return *m_result;
+    if ((RunningFullReduction || RunningOnGPU) && m_result) {
+      return *(m_result + index);
     }
     Op reducer(m_reducer);
     if (ReducingInnerMostDims || RunningFullReduction) {
@@ -558,8 +592,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
+
+    if (RunningOnGPU && m_result) {
+      return internal::pload<PacketReturnType>(m_result + index);
+    }
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     if (ReducingInnerMostDims) {
@@ -617,11 +655,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+#ifdef EIGEN_HAS_CUDA_FP16
+  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#endif
   template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
+  template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index fd2587dd5..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,8 +67,41 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 #endif
 }
 
-template <typename T>
-__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {  // generic case; returns whatever atomicExch returns
+  return atomicExch(address, val);  // forwards to the atomicExch overload selected for Type
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {  // double specialization: exchange through a 64-bit integer view
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);  // same storage, viewed as an unsigned 64-bit integer
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));  // swap the raw bits, then reinterpret the value atomicExch returned as a double
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {  // folds accum into *output with a compare-and-swap retry loop
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);  // current half2 at *output, read as one unsigned int
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));  // newval now holds oldval reduced with accum
+  if (newval == oldval) {  // reducing accum does not change the stored bits: nothing to write
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {  // CAS failed: *output no longer held oldval
+    oldval = readback;  // restart from the value atomicCAS actually found
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+}
+#endif
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
 #if __CUDA_ARCH__ >= 300
   atomicAdd(output, accum);
 #else
@@ -86,16 +119,43 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
   }
 }
 
+
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output) {
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
+  // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    *output = reducer.initialize();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
   }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      }
+      else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+        while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
 
   typename Self::CoeffReturnType accum = reducer.initialize();
   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
@@ -108,50 +168,203 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 
 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
-    reducer.reduce(__shfl_down(accum, offset), &accum);
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
   }
 
   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(output, accum, reducer);
   }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
 }
 
 
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {  // writes the starting value of *scratch
+  eigen_assert(blockDim.x == 1);  // expects a launch with one thread per block ...
+  eigen_assert(gridDim.x == 1);  // ... and a single block
+  if (num_coeffs % 2 != 0) {  // odd count: pair the last input coefficient with the reducer's initial value
+    half last = input.m_impl.coeff(num_coeffs-1);
+    *scratch = __halves2half2(last, reducer.initialize());
+  } else {  // even count: start from the reducer's initial half2 packet
+    *scratch = reducer.template initializePacket<half2>();
+  }
+}
+
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {  // fills output[0..num_coeffs) with the reducer's initial value
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;  // index of this thread across the whole launch
+  const Index num_threads = blockDim.x * gridDim.x;  // total number of threads in the launch
+  const Index num_packets = num_coeffs / 2;  // number of whole half2 pairs in output
+  for (Index i = thread_id; i < num_packets; i += num_threads) {  // each thread writes every num_threads-th packet
+    ((half2*)output)[i] = reducer.template initializePacket<half2>();
+  }
+
+  if (thread_id == 0 && num_coeffs % 2 != 0) {  // odd count: thread 0 writes the unpaired last element
+    output[num_coeffs-1] = reducer.initialize();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                    half* output, half2* scratch) {
+  eigen_assert(NumPerThread % 2 == 0);  // coefficients are consumed two at a time as half2 packets
+
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;  // first coefficient read by this thread
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+  if (gridDim.x == 1 && first_index == 0) {
+    if (num_coeffs % 2 != 0) {  // odd count: the packet loop below never reads the last coefficient, so seed scratch with it
+      half last = input.m_impl.coeff(num_coeffs-1);
+      *scratch = __halves2half2(last, reducer.initialize());
+    } else {
+      *scratch = reducer.template initializePacket<half2>();
+    }
+  }
+  __syncthreads();  // executed by every thread: a barrier inside the one-thread branch above would not order the other threads after the write to *scratch
+
+  half2 accum = reducer.template initializePacket<half2>();  // per-thread partial result
+  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + 2*i;
+    eigen_assert(index + 1 < num_coeffs);  // both halves of the packet must lie inside the input
+    half2 val = input.m_impl.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {  // combine the partial results of a warp with shuffles
+    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {  // first lane of each warp publishes the warp's result
+    atomicReduce(scratch, accum, reducer);
+  }
+
+  __syncthreads();
+
+  if (gridDim.x == 1 && first_index == 0) {  // single-block launch: fold the two halves of scratch into the scalar output here
+    half tmp = __low2half(*scratch);
+    reducer.reduce(__high2half(*scratch), &tmp);
+    *output = tmp;
+  }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {  // reducer is taken by value: a reference parameter would refer to the caller's host-side object
+  eigen_assert(threadIdx.x == 0);  // the launcher starts this kernel with one block of one thread
+  half tmp = __low2half(*scratch);
+  reducer.reduce(__high2half(*scratch), &tmp);  // fold the high half of the scratch packet into the low half
+  *output = tmp;  // publish the scalar result
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {  // primary template: used when no specialization matches OutputType
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    assert(false && "Should only be called on doubles, floats and half floats");  // always fails when assertions are enabled
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+    typedef typename Self::CoeffReturnType Scalar;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);  // one block per block_size * num_per_thread coefficients, rounded up
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {  // FullReductionKernel only uses the semaphore when the grid has more than one block
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {  // half output, PacketAccess == false
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");  // always fails when assertions are enabled
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {  // half output, PacketAccess == true
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    half2* scratch = static_cast<half2*>(device.scratchpad());  // half2 accumulator that every block reduces into
+
+    if (num_blocks > 1) {
+      // We initialize the scratchpad outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
+    }
+
+    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {  // with several blocks the kernel above leaves the result in scratch; fold it into *output
+      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+                         1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif
+
+
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
+#ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
 
   template <typename OutputType>
-  static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
-    assert(false && "Should only be called on floats");
-  }
-
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
-    typedef typename Self::Index Index;
-
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
       return;
     }
 
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
-                         1, 32, 0, device, reducer.initialize(), 1, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
   }
 };
 
@@ -160,6 +373,8 @@ template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                          typename Self::CoeffReturnType* output) {
+#if __CUDA_ARCH__ >= 300
+  typedef typename Self::CoeffReturnType Type;
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
   eigen_assert(gridDim.y == 1);
@@ -179,6 +394,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
       output[i] = reducer.initialize();
     }
+    __syncthreads();
   }
 
   for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
@@ -188,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
       const Index col_block = i % input_col_blocks;
       const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
 
-      float reduced_val = reducer.initialize();
+      Type reduced_val = reducer.initialize();
 
       for (Index j = 0; j < NumPerThread; j += unroll_times) {
         const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
         if (last_col >= num_coeffs_to_reduce) {
-          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) {
-            const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
             reducer.reduce(val, &reduced_val);
           }
           break;
@@ -217,33 +433,128 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
         atomicReduce(&(output[row]), reduced_val, reducer);
       }
     }
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
 
+#ifdef EIGEN_HAS_CUDA_FP16
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output) {
+  eigen_assert(blockDim.y == 1);  // the kernel only indexes with the x dimension of the block and grid
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);  // column blocks per row; each thread covers NumPerThread packets of 2 columns
+  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);  // rows are processed two at a time
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = 2*thread_id;
+    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {  // whole half2 pairs of the output
+      half* loc = output + i;
+      *((half2*)loc) = reducer.template initializePacket<half2>();
+    }
+    if (i < num_preserved_coeffs) {  // unpaired last output element
+      output[i] = reducer.initialize();
+    }
     __syncthreads();
   }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = 2 * (i / input_col_blocks);  // first of the two rows handled in this iteration
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);  // first column of this thread's first packet
+
+      half2 reduced_val1 = reducer.template initializePacket<half2>();  // partial result for row
+      half2 reduced_val2 = reducer.template initializePacket<half2>();  // partial result for row + 1
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
+        if (last_col >= num_coeffs_to_reduce) {  // the unrolled batch would run past the end of the row: bounds-checked path
+          Index col = col_begin + blockDim.x * j;  // NOTE(review): the unrolled path below advances by blockDim.x * 2 per step, this path by blockDim.x - confirm the stride is intended
+          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val1, &reduced_val1);
+            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          if (col < num_coeffs_to_reduce) {
+            // Peel;
+            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            const half2 val1 = __halves2half2(last1, reducer.initialize());  // pad the single remaining coefficient with the initial value
+            reducer.reducePacket(val1, &reduced_val1);
+            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+            const half2 val2 = __halves2half2(last2, reducer.initialize());
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k) * 2;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {  // combine the partial results of a warp with shuffles
+        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+      }
+
+      half val1 =  __low2half(reduced_val1);
+      reducer.reduce(__high2half(reduced_val1), &val1);  // fold the two halves of the row packet into one scalar
+      half val2 =  __low2half(reduced_val2);
+      reducer.reduce(__high2half(reduced_val2), &val2);
+      half2 val = __halves2half2(val1, val2);  // (row, row + 1) results packed so one atomicReduce updates both outputs
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {  // first lane of each warp publishes the warp's result
+        half* loc = output + row;
+        atomicReduce((half2*)loc, val, reducer);
+      }
+    }
+  }
 }
 
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
+#endif
 
-  template <typename Device, typename OutputType>
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats on a gpu device");
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {  // primary template: used when no specialization matches OutputType
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");  // always fails when assertions are enabled
     return true;
   }
+};
 
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+  Self, Op, OutputType, PacketAccess,
+  typename internal::enable_if<
+    internal::is_same<float, OutputType>::value ||
+    internal::is_same<double, OutputType>::value,
+  void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
-    // It's faster to use the usual code.
-    if (num_coeffs_to_reduce <= 32) {
-      return true;
-    }
-
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 128;
@@ -259,7 +570,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
       const int max_blocks = device.getNumCudaMultiProcessors() *
                            device.maxCudaThreadsPerMultiProcessor() / 1024;
       const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                          num_blocks, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
@@ -271,6 +582,85 @@ struct InnerReducer<Self, Op, GpuDevice> {
   }
 };
 
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {  // half output, PacketAccess == false
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");  // always fails when assertions are enabled
+    return true;
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {  // half output, PacketAccess == true
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {  // returns false once the kernels are launched, true when nothing was launched
+    typedef typename Self::Index Index;
+
+    if (num_preserved_vals % 2 != 0) {
+      // Not supported yet, revert to the slower code path
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = /*256*/128;
+    const int num_per_thread = /*128*/64;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);  // launch geometry of the init kernel (shadows the reduction's values on purpose)
+      const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+                         num_blocks, 1024, 0, device, reducer, self, num_preserved_vals, output);
+    }
+
+    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+#endif
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+  template <typename OutputType>
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {  // true = nothing was launched here, otherwise the launcher's result
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());  // total number of input coefficients
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return true;
+    }
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 128) {
+      return true;
+    }
+
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);  // dispatch on output type and packet support
+  }
+};
 
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
@@ -283,6 +673,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu
     for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
       output[i] = reducer.initialize();
     }
+    __syncthreads();
   }
 
   // Do the reduction.
@@ -307,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats.
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
   template <typename Device, typename OutputType>
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats on a gpu device");
+    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
     return true;
   }
 
@@ -323,7 +714,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
       return true;
     }
 
-     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 16;
     const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index bc92d9e6d..99245f778 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -193,7 +193,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
       return m_evaluator->coeff(index);
     }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
     {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 1a59cc8f7..14e392e36 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
       : m_impl(op.expression(), device), m_reverse(op.reverse())
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
-    EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
     // Compute strides
     m_dimensions = m_impl.dimensions();
@@ -195,7 +195,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     // TODO(ndjaitly): write a better packing routine that uses
@@ -269,7 +269,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x) {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     // This code is pilfered from TensorMorphing.h
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
new file mode 100644
index 000000000..8501466ce
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -0,0 +1,287 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Op, typename XprType>
+struct traits<TensorScanOp<Op, XprType> >
+    : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Op, typename XprType>
+struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
+{
+  typedef const TensorScanOp<Op, XprType>& type;
+};
+
+template<typename Op, typename XprType>
+struct nested<TensorScanOp<Op, XprType>, 1,
+            typename eval<TensorScanOp<Op, XprType> >::type>
+{
+  typedef TensorScanOp<Op, XprType> type;
+};
+} // end namespace internal
+
+/** \class TensorScan
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor scan class.
+  */
+template <typename Op, typename XprType>
+class TensorScanOp
+    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
+public:
+  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
+      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
+      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Index axis() const { return m_axis; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const XprType& expression() const { return m_expr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Op accumulator() const { return m_accumulator; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  bool exclusive() const { return m_exclusive; }
+
+protected:
+  typename XprType::Nested m_expr;
+  const Index m_axis;
+  const Op m_accumulator;
+  const bool m_exclusive;
+};
+
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher;
+
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+
+  typedef TensorScanOp<Op, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_exclusive(op.exclusive()),
+        m_accumulator(op.accumulator()),
+        m_size(m_impl.dimensions()[op.axis()]),
+        m_stride(1),
+        m_output(NULL) {
+
+    // Accumulating a scalar isn't supported.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+    // Compute stride of scan axis
+    const Dimensions& dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < op.axis(); ++i) {
+        m_stride = m_stride * dims[i];
+      }
+    } else {
+      for (int i = NumDims - 1; i > op.axis(); --i) {
+        m_stride = m_stride * dims[i];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+    return m_stride;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+    return m_size;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+    return m_accumulator;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+    return m_exclusive;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+    return m_impl;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+    return m_device;
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    ScanLauncher<Self, Op, Device> launcher;
+    if (data) {
+      launcher(*this, data);
+      return false;
+    }
+
+    const Index total_size = internal::array_prod(dimensions());
+    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+    launcher(*this, m_output);
+    return true;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+  {
+    return m_output;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_output[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    if (m_output != NULL) {
+      m_device.deallocate(m_output);
+      m_output = NULL;
+    }
+    m_impl.cleanup();
+  }
+
+protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device& m_device;
+  const bool m_exclusive;
+  Op m_accumulator;
+  const Index m_size;
+  Index m_stride;
+  CoeffReturnType* m_output;
+};
+
+// CPU implementation of scan
+// TODO(ibab) This single-threaded implementation should be parallelized,
+// at least by running multiple scans at the same time.
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher {
+  void operator()(Self& self, typename Self::CoeffReturnType *data) {
+    Index total_size = internal::array_prod(self.dimensions());
+
+    // We fix the index along the scan axis to 0 and perform a
+    // scan per remaining entry. The iteration is split into two nested
+    // loops to avoid an integer division by keeping track of each idx1 and idx2.
+    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+        // Calculate the starting offset for the scan
+        Index offset = idx1 + idx2;
+
+        // Compute the scan along the axis, starting at the calculated offset
+        typename Self::CoeffReturnType accum = self.accumulator().initialize();
+        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+          Index curr = offset + idx3 * self.stride();
+
+          if (self.exclusive()) {
+            data[curr] = self.accumulator().finalize(accum);
+            self.accumulator().reduce(self.inner().coeff(curr), &accum);
+          } else {
+            self.accumulator().reduce(self.inner().coeff(curr), &accum);
+            data[curr] = self.accumulator().finalize(accum);
+          }
+        }
+      }
+    }
+  }
+};
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+  // Compute offset as in the CPU version
+  Index val = threadIdx.x + blockIdx.x * blockDim.x;
+  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+  if (offset + (self.size() - 1) * self.stride() < total_size) {
+    // Compute the scan along the axis, starting at the calculated offset
+    typename Self::CoeffReturnType accum = self.accumulator().initialize();
+    for (Index idx = 0; idx < self.size(); idx++) {
+      Index curr = offset + idx * self.stride();
+      if (self.exclusive()) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      } else {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+  __syncthreads();
+
+}
+
+template <typename Self, typename Reducer>
+struct ScanLauncher<Self, Reducer, GpuDevice> {
+  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+     Index total_size = internal::array_prod(self.dimensions());
+     Index num_blocks = (total_size / self.size() + 63) / 64;
+     Index block_size = 64;
+     LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+  }
+};
+#endif  // EIGEN_USE_GPU && __CUDACC__
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index e76533710..113c060e3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -166,7 +166,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@@ -248,7 +248,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   template <int StoreMode> EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 0e89033c4..f8121d17b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -85,7 +85,7 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
         : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
       { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
     template <typename... DenseIndex>
     EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
       m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 52b7d216a..6c35bfdb6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -164,7 +164,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     Index inputIndices[] = {0, 0};
@@ -289,7 +289,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
 
     Index inputIndices[] = {0, 0};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 5950f38e2..3523e7c94 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -20,6 +20,7 @@ struct static_val {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
     eigen_assert(v == n);
@@ -53,7 +54,7 @@ struct TensorUInt128
   template<typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   explicit TensorUInt128(const T& x) : high(0), low(x) {
-    eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest())));
+    eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
     eigen_assert(x >= 0);
   }
 
@@ -74,21 +75,21 @@ struct TensorUInt128
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   return (lhs.high == rhs.high) & (lhs.low == rhs.low);
 }
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   return (lhs.high != rhs.high) | (lhs.low != rhs.low);
 }
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   if (lhs.high != rhs.high) {
     return lhs.high > rhs.high;
@@ -98,7 +99,7 @@ static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<H
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   if (lhs.high != rhs.high) {
     return lhs.high < rhs.high;
@@ -108,7 +109,7 @@ static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
   if (result.low < rhs.low) {
@@ -119,7 +120,7 @@ static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>
 
 template <typename HL, typename LL, typename HR, typename LR>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
   if (result.low > lhs.low) {
@@ -130,8 +131,8 @@ static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>
 
 
 template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   // Split each 128-bit integer into 4 32-bit integers, and then do the
   // multiplications by hand as follow:
@@ -205,8 +206,8 @@ static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>
 }
 
 template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
 {
   if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
     return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index e735fc76f..0ca2cac84 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -187,7 +187,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
-    EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
     m_paddingValue = op.padding_value();
 
@@ -408,7 +408,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
deleted file mode 100644
index 6e871a8da..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_TensorSymmetry_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel
-  )
-
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
deleted file mode 100644
index dc9fc78ec..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_TensorSymmetry_util_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
deleted file mode 100644
index 88fef50c6..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_ThreadPool_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 6dd64f185..71d55552d 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -50,7 +50,7 @@ class EventCount {
  public:
   class Waiter;
 
-  EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
+  EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
     eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
     // Initialize epoch to something close to overflow to test overflow.
     state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
@@ -169,7 +169,8 @@ class EventCount {
 
   class Waiter {
     friend class EventCount;
-    std::atomic<Waiter*> next;
+    // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
+    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
     std::mutex mu;
     std::condition_variable cv;
     uint64_t epoch;
@@ -179,8 +180,6 @@ class EventCount {
       kWaiting,
       kSignaled,
     };
-    // Prevent false sharing with other Waiter objects in the same vector.
-    char pad_[128];
   };
 
  private:
@@ -200,7 +199,7 @@ class EventCount {
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
   std::atomic<uint64_t> state_;
-  std::vector<Waiter>& waiters_;
+  MaxSizeVector<Waiter>& waiters_;
 
   void Park(Waiter* w) {
     std::unique_lock<std::mutex> lock(w->mu);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 1c471a19f..354bce52a 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -23,18 +23,44 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
       : env_(env),
         threads_(num_threads),
         queues_(num_threads),
+        coprimes_(num_threads),
         waiters_(num_threads),
-        blocked_(),
-        spinning_(),
-        done_(),
+        blocked_(0),
+        spinning_(0),
+        done_(false),
         ec_(waiters_) {
-    for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue());
-    for (int i = 0; i < num_threads; i++)
+    waiters_.resize(num_threads);
+
+    // Calculate coprimes of num_threads.
+    // Coprimes are used for a random walk over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a walk starting at thread index t and calculate num_threads - 1 subsequent
+    // indices as (t + coprime) % num_threads, we will cover all threads without
+    // repetitions (effectively getting a pseudo-random permutation of thread
+    // indices).
+    for (int i = 1; i <= num_threads; i++) {
+      unsigned a = i;
+      unsigned b = num_threads;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes_.push_back(i);
+      }
+    }
+    for (int i = 0; i < num_threads; i++) {
+      queues_.push_back(new Queue());
+    }
+    for (int i = 0; i < num_threads; i++) {
       threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
   }
 
   ~NonBlockingThreadPoolTempl() {
-    done_.store(true, std::memory_order_relaxed);
+    done_ = true;
     // Now if all threads block without work, they will start exiting.
     // But note that threads can continue to work arbitrary long,
     // block, submit new work, unblock and otherwise live full life.
@@ -50,7 +76,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     PerThread* pt = GetPerThread();
     if (pt->pool == this) {
       // Worker thread of this pool, push onto the thread's queue.
-      Queue* q = queues_[pt->index];
+      Queue* q = queues_[pt->thread_id];
       t = q->PushFront(std::move(t));
     } else {
       // A free-standing thread (or worker of another pool), push onto a random
@@ -71,108 +97,111 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
       env_.ExecuteTask(t);  // Push failed, execute directly.
   }
 
+  int NumThreads() const final {
+    return static_cast<int>(threads_.size());
+  }
+
+  int CurrentThreadId() const final {
+    const PerThread* pt =
+        const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+    if (pt->pool == this) {
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
  private:
   typedef typename Environment::EnvThread Thread;
 
   struct PerThread {
-    bool inited;
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
     NonBlockingThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
-    unsigned index;         // Worker thread index in pool.
-    unsigned rand;          // Random generator state.
+    uint64_t rand;  // Random generator state.
+    int thread_id;  // Worker thread index in pool.
   };
 
   Environment env_;
   MaxSizeVector<Thread*> threads_;
   MaxSizeVector<Queue*> queues_;
-  std::vector<EventCount::Waiter> waiters_;
+  MaxSizeVector<unsigned> coprimes_;
+  MaxSizeVector<EventCount::Waiter> waiters_;
   std::atomic<unsigned> blocked_;
   std::atomic<bool> spinning_;
   std::atomic<bool> done_;
   EventCount ec_;
 
   // Main worker thread loop.
-  void WorkerLoop(unsigned index) {
+  void WorkerLoop(int thread_id) {
     PerThread* pt = GetPerThread();
     pt->pool = this;
-    pt->index = index;
-    Queue* q = queues_[index];
-    EventCount::Waiter* waiter = &waiters_[index];
-    std::vector<Task> stolen;
+    pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+    pt->thread_id = thread_id;
+    Queue* q = queues_[thread_id];
+    EventCount::Waiter* waiter = &waiters_[thread_id];
     for (;;) {
-      Task t;
-      if (!stolen.empty()) {
-        t = std::move(stolen.back());
-        stolen.pop_back();
-      }
-      if (!t.f) t = q->PopFront();
+      Task t = q->PopFront();
       if (!t.f) {
-        if (Steal(&stolen)) {
-          t = std::move(stolen.back());
-          stolen.pop_back();
-          while (stolen.size()) {
-            Task t1 = q->PushFront(std::move(stolen.back()));
-            stolen.pop_back();
-            if (t1.f) {
-              // There is not much we can do in this case. Just execute the
-              // remaining directly.
-              stolen.push_back(std::move(t1));
-              break;
+        t = Steal();
+        if (!t.f) {
+          // Leave one thread spinning. This reduces latency.
+          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
+          // Also, the time it takes to attempt to steal work 1000 times depends
+          // on the size of the thread pool. However the speed at which the user
+          // of the thread pool submits tasks is independent of the size of the
+          // pool. Consider a time based limit instead.
+          if (!spinning_ && !spinning_.exchange(true)) {
+            for (int i = 0; i < 1000 && !t.f; i++) {
+              t = Steal();
+            }
+            spinning_ = false;
+          }
+          if (!t.f) {
+            if (!WaitForWork(waiter, &t)) {
+              return;
             }
           }
         }
       }
       if (t.f) {
         env_.ExecuteTask(t);
-        continue;
       }
-      // Leave one thread spinning. This reduces latency.
-      if (!spinning_ && !spinning_.exchange(true)) {
-        bool nowork = true;
-        for (int i = 0; i < 1000; i++) {
-          if (!OutOfWork()) {
-            nowork = false;
-            break;
-          }
-        }
-        spinning_ = false;
-        if (!nowork) continue;
-      }
-      if (!WaitForWork(waiter)) return;
     }
   }
 
   // Steal tries to steal work from other worker threads in best-effort manner.
-  bool Steal(std::vector<Task>* stolen) {
-    if (queues_.size() == 1) return false;
+  Task Steal() {
     PerThread* pt = GetPerThread();
-    unsigned lastq = pt->index;
-    for (unsigned i = queues_.size(); i > 0; i--) {
-      unsigned victim = Rand(&pt->rand) % queues_.size();
-      if (victim == lastq && queues_.size() > 2) {
-        i++;
-        continue;
+    const size_t size = queues_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      Task t = queues_[victim]->PopBack();
+      if (t.f) {
+        return t;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
       }
-      // Steal half of elements from a victim queue.
-      // It is typical to steal just one element, but that assumes that work is
-      // recursively subdivided in halves so that the stolen element is exactly
-      // half of work. If work elements are equally-sized, then is makes sense
-      // to steal half of elements at once and then work locally for a while.
-      if (queues_[victim]->PopBackHalf(stolen)) return true;
-      lastq = victim;
     }
-    // Just to make sure that we did not miss anything.
-    for (unsigned i = queues_.size(); i > 0; i--)
-      if (queues_[i - 1]->PopBackHalf(stolen)) return true;
-    return false;
+    return Task();
   }
 
-  // WaitForWork blocks until new work is available, or if it is time to exit.
-  bool WaitForWork(EventCount::Waiter* waiter) {
-    // We already did best-effort emptiness check in Steal, so prepare blocking.
+  // WaitForWork blocks until new work is available (returns true), or until it
+  // is time to exit (returns false). Can optionally return a task to execute in t
+  // (in such case t.f != nullptr on return).
+  bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+    eigen_assert(!t->f);
+    // We already did best-effort emptiness check in Steal, so prepare for
+    // blocking.
     ec_.Prewait(waiter);
-    // Now do reliable emptiness check.
-    if (!OutOfWork()) {
+    // Now do a reliable emptiness check.
+    int victim = NonEmptyQueueIndex();
+    if (victim != -1) {
       ec_.CancelWait(waiter);
+      *t = queues_[victim]->PopBack();
       return true;
     }
     // Number of blocked threads is used as termination condition.
@@ -186,7 +215,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
       // right after incrementing blocked_ above. Now a free-standing thread
       // submits work and calls destructor (which sets done_). If we don't
       // re-check queues, we will exit leaving the work unexecuted.
-      if (!OutOfWork()) {
+      if (NonEmptyQueueIndex() != -1) {
         // Note: we must not pop from queues before we decrement blocked_,
         // otherwise the following scenario is possible. Consider that instead
         // of checking for emptiness we popped the only element from queues.
@@ -205,23 +234,36 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return true;
   }
 
-  bool OutOfWork() {
-    for (unsigned i = 0; i < queues_.size(); i++)
-      if (!queues_[i]->Empty()) return false;
-    return true;
+  int NonEmptyQueueIndex() {  // Returns the index of a queue observed non-empty, or -1 if every inspected queue looked empty.
+    PerThread* pt = GetPerThread();
+    const size_t size = queues_.size();
+    unsigned r = Rand(&pt->rand);  // One pseudo-random draw selects both the stride and the starting queue.
+    unsigned inc = coprimes_[r % coprimes_.size()];  // Stride; presumably coprime with size so that every queue is visited -- TODO confirm where coprimes_ is filled.
+    unsigned victim = r % size;  // Index of the first queue to inspect.
+    for (unsigned i = 0; i < size; i++) {  // Inspects at most `size` queues.
+      if (!queues_[victim]->Empty()) {
+        return victim;
+      }
+      victim += inc;
+      if (victim >= size) {  // Wraps with a single subtraction (assumes inc <= size -- TODO confirm).
+        victim -= size;
+      }
+    }
+    return -1;  // No non-empty queue was seen during this scan.
   }
 
-  PerThread* GetPerThread() {
+  static EIGEN_STRONG_INLINE PerThread* GetPerThread() {  // Returns the address of the EIGEN_THREAD_LOCAL PerThread object declared below; performs no initialization itself.
     EIGEN_THREAD_LOCAL PerThread per_thread_;
     PerThread* pt = &per_thread_;
-    if (pt->inited) return pt;
-    pt->inited = true;
-    pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
     return pt;
   }
 
-  static unsigned Rand(unsigned* state) {
-    return *state = *state * 1103515245 + 12345;
+  static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {  // Advances *state by one step and returns a value derived from the state as it was before the step.
+    uint64_t current = *state;
+    // Update the internal state (linear congruential step: multiply, then add a constant)
+    *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+    // Generate the random output (using the PCG-XSH-RS scheme); the top 3 bits of current vary the shift over 22..29
+    return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
   }
 };
 
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 0544a6e15..05ed76cbe 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -38,7 +38,7 @@ namespace Eigen {
 template <typename Work, unsigned kSize>
 class RunQueue {
  public:
-  RunQueue() : front_(), back_() {
+  RunQueue() : front_(0), back_(0) {
     // require power-of-two for fast masking
     eigen_assert((kSize & (kSize - 1)) == 0);
     eigen_assert(kSize > 2);            // why would you do this?
@@ -100,7 +100,7 @@ class RunQueue {
   // PopBack removes and returns the last elements in the queue.
   // Can fail spuriously.
   Work PopBack() {
-    if (Empty()) return 0;
+    if (Empty()) return Work();
     std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
     if (!lock) return Work();
     unsigned back = back_.load(std::memory_order_relaxed);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index 17fd1658b..e75d0f467 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
   explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
       : env_(env), threads_(num_threads), waiters_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
+      threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));  // i is captured by value and handed to WorkerLoop as the worker's thread id.
     }
   }
 
@@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
 
   // Schedule fn() for execution in the pool of threads. The functions are
   // executed in the order in which they are scheduled.
-  void Schedule(std::function<void()> fn) {
+  void Schedule(std::function<void()> fn) final {
     Task t = env_.CreateTask(std::move(fn));
     std::unique_lock<std::mutex> l(mu_);
     if (waiters_.empty()) {
@@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
     }
   }
 
+  int NumThreads() const final {  // Reports the number of entries currently held in threads_.
+    return static_cast<int>(threads_.size());
+  }
+
+  int CurrentThreadId() const final {  // Returns the calling thread's worker index if it is registered with this pool, otherwise -1.
+    const PerThread* pt = this->GetPerThread();
+    if (pt->pool == this) {  // True when the calling thread's record names this pool as its parent.
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
  protected:
-  void WorkerLoop() {
+  void WorkerLoop(int thread_id) {
     std::unique_lock<std::mutex> l(mu_);
+    PerThread* pt = GetPerThread();
+    pt->pool = this;
+    pt->thread_id = thread_id;
     Waiter w;
     Task t;
     while (!exiting_) {
@@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
     bool ready;
   };
 
+  struct PerThread {  // Per-thread bookkeeping record.
+    constexpr PerThread() : pool(NULL), thread_id(-1) { }  // Defaults describe a thread that belongs to no pool.
+    SimpleThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    int thread_id;                // Worker thread index in pool, or -1 when not a worker.
+  };
+
   Environment env_;
   std::mutex mu_;
   MaxSizeVector<Thread*> threads_;  // All threads
   MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
-  std::deque<Task> pending_;          // Queue of pending work
-  std::condition_variable empty_;          // Signaled on pending_.empty()
+  std::deque<Task> pending_;        // Queue of pending work
+  std::condition_variable empty_;   // Signaled on pending_.empty()
   bool exiting_ = false;
+
+  PerThread* GetPerThread() const {  // Returns the address of the function-local PerThread object declared below.
+    EIGEN_THREAD_LOCAL PerThread per_thread;  // Presumably one instance per thread (depends on how EIGEN_THREAD_LOCAL is defined -- TODO confirm).
+    return &per_thread;
+  }
 };
 
 typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d2204ad5b..399f95cc1 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -21,14 +21,14 @@ struct StlThreadEnvironment {
   // destructor must join the thread.
   class EnvThread {
    public:
-    EnvThread(std::function<void()> f) : thr_(f) {}
+    EnvThread(std::function<void()> f) : thr_(std::move(f)) {}  // f was taken by value, so it is moved into the thread instead of being copied again.
     ~EnvThread() { thr_.join(); }
 
    private:
     std::thread thr_;
   };
 
-  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
+  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }  // Moves f into a heap-allocated EnvThread and returns the pointer produced by new.
   Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
   void ExecuteTask(const Task& t) { t.f(); }
 };
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 38b40aceb..a65ee97c9 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -18,6 +18,13 @@ class ThreadPoolInterface {
  public:
   virtual void Schedule(std::function<void()> fn) = 0;
 
+  // Returns the number of threads in the pool.
+  virtual int NumThreads() const = 0;
+
+  // Returns a logical thread index between 0 and NumThreads() - 1 if called
+  // from one of the threads in the pool. Returns -1 otherwise.
+  virtual int CurrentThreadId() const = 0;
+
   virtual ~ThreadPoolInterface() {}
 };
 
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
deleted file mode 100644
index 7eab492d6..000000000
--- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_util_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 24159e54c..30d3ebcff 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -117,7 +117,7 @@ template <typename T, size_t n> class array {
     values[7] = v8;
   }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
     eigen_assert(l.size() == n);
@@ -167,7 +167,7 @@ template <typename T> class array<T, 0> {
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE array() : dummy() { }
 
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
     eigen_assert(l.size() == 0);
   }
diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 961456f10..4bc3dd1ba 100644
--- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -55,6 +55,17 @@ class MaxSizeVector {
     internal::aligned_free(data_);
   }
 
+  void resize(size_t n) {  // Sets the element count to n, constructing or destroying elements as needed.
+    eigen_assert(n <= reserve_);  // n may not exceed reserve_; nothing besides this assertion checks it.
+    for (size_t i = size_; i < n; ++i) {  // Growing: runs only when n > size_.
+      new (&data_[i]) T;  // Placement-new default-initializes a T in the existing storage.
+    }
+    for (size_t i = n; i < size_; ++i) {  // Shrinking: runs only when n < size_.
+      data_[i].~T();  // Explicit destructor call; the storage itself is kept.
+    }
+    size_ = n;
+  }
+
   // Append new elements (up to reserved size).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void push_back(const T& t) {
diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
new file mode 100644
index 000000000..521fa3f76
--- /dev/null
+++ b/unsupported/Eigen/EulerAngles
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+
+#include "Eigen/Core"
+#include "Eigen/Geometry"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup EulerAngles_Module EulerAngles module
+  * \brief This module provides generic Euler angles rotation.
+  *
+  * Euler angles are a way to represent 3D rotation.
+  *
+  * In order to use this module in your code, include this header:
+  * \code
+  * #include <unsupported/Eigen/EulerAngles>
+  * \endcode
+  *
+  * See \ref EulerAngles for more information.
+  *
+  */
+
+}
+
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct
index c932c06a6..5f5afb8cf 100644
--- a/unsupported/Eigen/KroneckerProduct
+++ b/unsupported/Eigen/KroneckerProduct
@@ -13,6 +13,8 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+#include "../../Eigen/src/SparseCore/SparseUtil.h"
+
 namespace Eigen {
 
 /**
diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
index 89036886b..7f0b70c63 100644
--- a/unsupported/Eigen/MPRealSupport
+++ b/unsupported/Eigen/MPRealSupport
@@ -67,27 +67,32 @@ int main()
       IsSigned = 1,
       IsComplex = 0,
       RequireInitialization = 1,
-      ReadCost = 10,
-      AddCost = 10,
-      MulCost = 40
+      ReadCost = HugeCost,
+      AddCost  = HugeCost,
+      MulCost  = HugeCost
     };
 
     typedef mpfr::mpreal Real;
     typedef mpfr::mpreal NonInteger;
     
-    inline static Real highest   (long Precision = mpfr::mpreal::get_default_prec())  { return  mpfr::maxval(Precision); }
-    inline static Real lowest    (long Precision = mpfr::mpreal::get_default_prec())  { return -mpfr::maxval(Precision); }
+    static inline Real highest  (long Precision = mpfr::mpreal::get_default_prec()) { return  mpfr::maxval(Precision); }
+    static inline Real lowest   (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
 
     // Constants
-    inline static Real Pi       (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_pi(Precision);        }
-    inline static Real Euler    (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_euler(Precision);     }
-    inline static Real Log2     (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_log2(Precision);      }
-    inline static Real Catalan  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_catalan(Precision);   }
+    static inline Real Pi      (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_pi(Precision);        }
+    static inline Real Euler   (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_euler(Precision);     }
+    static inline Real Log2    (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_log2(Precision);      }
+    static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::const_catalan(Precision);   }
 
-    inline static Real epsilon  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::machine_epsilon(Precision); }
-    inline static Real epsilon  (const Real& x)                                         {    return mpfr::machine_epsilon(x); }
+    static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec())  { return mpfr::machine_epsilon(Precision); }
+    static inline Real epsilon (const Real& x)                                      { return mpfr::machine_epsilon(x); }
 
-    inline static Real dummy_precision()   
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+    static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec())  { return std::numeric_limits<Real>::digits10(Precision); }
+    static inline int digits10 (const Real& x)                                      { return std::numeric_limits<Real>::digits10(x); }
+#endif
+
+    static inline Real dummy_precision()
     {
       mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
       return mpfr::machine_epsilon(weak_prec);
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
new file mode 100644
index 000000000..7c7493c56
--- /dev/null
+++ b/unsupported/Eigen/SpecialFunctions
@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_MODULE
+#define EIGEN_SPECIALFUNCTIONS_MODULE
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup SpecialFunctions_Module Special math functions module
+  *
+  * This module features additional coefficient-wise math functions available
+  * within the numext:: namespace for the scalar version, and as methods and/or free
+  * functions of Array. Those include:
+  *
+  * - erf
+  * - erfc
+  * - lgamma
+  * - igamma
+  * - igammac
+  * - digamma
+  * - polygamma
+  * - zeta
+  * - betainc
+  *
+  * \code
+  * #include <unsupported/Eigen/SpecialFunctions>
+  * \endcode
+  */
+//@{
+
+}
+
+#include "src/SpecialFunctions/SpecialFunctionsImpl.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
+#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
+
+#if defined EIGEN_VECTORIZE_CUDA
+  #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#endif
+
+namespace Eigen {
+//@}
+}
+
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPECIALFUNCTIONS_MODULE
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
index 1a61e3367..33b6c393f 100644
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
@@ -20,37 +20,60 @@ public:
   AutoDiffJacobian(const Functor& f) : Functor(f) {}
 
   // forward constructors
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template<typename... T>
+  AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
+#else
   template<typename T0>
   AutoDiffJacobian(const T0& a0) : Functor(a0) {}
   template<typename T0, typename T1>
   AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
   template<typename T0, typename T1, typename T2>
   AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
-
-  enum {
-    InputsAtCompileTime = Functor::InputsAtCompileTime,
-    ValuesAtCompileTime = Functor::ValuesAtCompileTime
-  };
+#endif
 
   typedef typename Functor::InputType InputType;
   typedef typename Functor::ValueType ValueType;
-  typedef typename Functor::JacobianType JacobianType;
-  typedef typename JacobianType::Scalar Scalar;
+  typedef typename ValueType::Scalar Scalar;
+
+  enum {
+    InputsAtCompileTime = InputType::RowsAtCompileTime,
+    ValuesAtCompileTime = ValueType::RowsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
   typedef typename JacobianType::Index Index;
 
-  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
+  typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
   typedef AutoDiffScalar<DerivativeType> ActiveScalar;
 
-
   typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
   typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  // Some compilers don't accept variadic parameters after a default parameter,
+  // i.e., we can't just write _jac=0 but we need to overload operator():
+  EIGEN_STRONG_INLINE
+  void operator() (const InputType& x, ValueType* v) const
+  {
+      this->operator()(x, v, 0);
+  }
+  template<typename... ParamsType>
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
+                   const ParamsType&... Params) const
+#else
   void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
+#endif
   {
     eigen_assert(v!=0);
+
     if (!_jac)
     {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+      Functor::operator()(x, v, Params...);
+#else
       Functor::operator()(x, v);
+#endif
       return;
     }
 
@@ -61,12 +84,16 @@ public:
 
     if(InputsAtCompileTime==Dynamic)
       for (Index j=0; j<jac.rows(); j++)
-        av[j].derivatives().resize(this->inputs());
+        av[j].derivatives().resize(x.rows());
 
     for (Index i=0; i<jac.cols(); i++)
-      ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
+      ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    Functor::operator()(ax, &av, Params...);
+#else
     Functor::operator()(ax, &av);
+#endif
 
     for (Index i=0; i<jac.rows(); i++)
     {
@@ -74,8 +101,6 @@ public:
       jac.row(i) = av[i].derivatives();
     }
   }
-protected:
-
 };
 
 }
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 481dfa91a..50fedf6ac 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -30,6 +30,13 @@ template<typename _DerType, bool Enable> struct auto_diff_special_op;
 
 } // end namespace internal
 
+template<typename _DerType> class AutoDiffScalar;
+
+template<typename NewDerType>  // Helper that builds an AutoDiffScalar whose derivative type is deduced from the der argument.
+inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
+  return AutoDiffScalar<NewDerType>(value,der);
+}
+
 /** \class AutoDiffScalar
   * \brief A scalar type replacement with automatic differentation capability
   *
@@ -60,7 +67,7 @@ template<typename _DerType>
 class AutoDiffScalar
   : public internal::auto_diff_special_op
             <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                                        typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
+                                          typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
 {
   public:
     typedef internal::auto_diff_special_op
@@ -101,7 +108,7 @@ class AutoDiffScalar
     template<typename OtherDerType>
     AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    , typename internal::enable_if<internal::is_same<Scalar,typename OtherDerType::Scalar>::value,void*>::type = 0
+    , typename internal::enable_if<internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value,void*>::type = 0
 #endif
     )
       : m_value(other.value()), m_derivatives(other.derivatives())
@@ -257,20 +264,16 @@ class AutoDiffScalar
         -m_derivatives);
     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator*(const Scalar& other) const
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        m_value * other,
-        (m_derivatives * other));
+      return MakeAutoDiffScalar(m_value * other, m_derivatives * other);  // Value and derivatives are both scaled by the constant.
     }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator*(const Scalar& other, const AutoDiffScalar& a)
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        a.value() * other,
-        a.derivatives() * other);
+      return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other);  // Constant-on-the-left overload; the constant is still applied on the right of both products.
     }
 
 //     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -289,20 +292,16 @@ class AutoDiffScalar
 //         a.derivatives() * other);
 //     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator/(const Scalar& other) const
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        m_value / other,
-        (m_derivatives * (Scalar(1)/other)));
+      return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other)));  // Derivatives are multiplied by the reciprocal 1/other, which keeps the expression a scalar product.
     }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+    friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
     operator/(const Scalar& other, const AutoDiffScalar& a)
     {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        other / a.value(),
-        a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
+      return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value())));  // d(c/a) = a' * (-c / a^2)
     }
 
 //     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -322,34 +321,29 @@ class AutoDiffScalar
 //     }
 
     template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
-        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >
+    inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(
+        CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) >
     operator/(const AutoDiffScalar<OtherDerType>& other) const
     {
       internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
-        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >(
+      return MakeAutoDiffScalar(  // Quotient rule: (a' * b - b' * a) / b^2
         m_value / other.value(),
-          ((m_derivatives * other.value()) - (m_value * other.derivatives()))
+          ((m_derivatives * other.value()) - (other.derivatives() * m_value))  // The scalar factor is written on the right in both products.
         * (Scalar(1)/(other.value()*other.value())));
     }
 
     template<typename OtherDerType>
     inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > >
+        const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product),
+        const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > >
     operator*(const AutoDiffScalar<OtherDerType>& other) const
     {
       internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >(
+      return MakeAutoDiffScalar(  // Product rule: a' * b + b' * a
         m_value * other.value(),
-        (m_derivatives * other.value()) + (m_value * other.derivatives()));
+        (m_derivatives * other.value()) + (other.derivatives() * m_value));  // The scalar factor is written on the right in both products.
     }
 
     inline AutoDiffScalar& operator*=(const Scalar& other)
@@ -426,18 +420,18 @@ struct auto_diff_special_op<_DerType, true>
   }
 
 
-  inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+  inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >
   operator*(const Real& other) const
   {
-    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+    return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >(  // The Real factor is bound as the second operand of the Scalar*Real product.
       derived().value() * other,
       derived().derivatives() * other);
   }
 
-  friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+  friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
   operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
   {
-    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+    return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(  // NOTE(review): the declared functor is bind1st (Real on the left) while the arguments below multiply by other on the right -- confirm the types agree.
       a.value() * other,
       a.derivatives() * other);
   }
@@ -501,43 +495,44 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows,
   }
 };
 
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar>
-{
-  enum { Defined = 1 };
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
-
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> >
-{
-  enum { Defined = 1 };
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
-
-template<typename DerType>
-struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar>
-{
-  enum { Defined = 1 };
-  typedef AutoDiffScalar<DerType> ReturnType;
-};
-
-template<typename DerType>
-struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> >
-{
-  enum { Defined = 1 };
-  typedef AutoDiffScalar<DerType> ReturnType;
-};
-
 } // end namespace internal
 
+template<typename DerType, typename BinOp>  // AutoDiffScalar (left) combined with its plain scalar (right): any BinOp yields AutoDiffScalar<DerType>.
+struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp>
+{
+  typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+template<typename DerType, typename BinOp>  // Plain scalar (left) combined with AutoDiffScalar (right): any BinOp yields AutoDiffScalar<DerType>.
+struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp>
+{
+  typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+
+// The following is an attempt to let Eigen know about expression templates, but that's more tricky!
+
+// template<typename DerType, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp>
+// {
+//   enum { Defined = 1 };
+//   typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType;
+// };
+//
+// template<typename DerType1,typename DerType2, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp>
+// {
+//   enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value };
+//   typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType;
+// };
+
 #define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
   template<typename DerType> \
-  inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \
+  inline const Eigen::AutoDiffScalar< \
+  EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
   FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
     using namespace Eigen; \
-    typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
-    typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \
+    EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
     CODE; \
   }
 
@@ -548,56 +543,75 @@ inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  {
 template<typename DerType>
 inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x <= y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x <= y ? ADS(x) : ADS(y));
+}
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x >= y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x >= y ? ADS(x) : ADS(y));
+}
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x < y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x < y ? ADS(x) : ADS(y));
+}
 template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x > y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+  return (x > y ? ADS(x) : ADS(y));
+}
+template<typename DerType> // minimum of two AutoDiffScalars sharing the same DerType
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() < y.value() ? x : y); // selection looks at value() only; when the values are equal, y is returned
+}
+template<typename DerType> // maximum of two AutoDiffScalars sharing the same DerType
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() >= y.value() ? x : y); // selection looks at value() only; when the values are equal, x is returned
+}
+
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
   using std::abs;
-  return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
+  return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
   using numext::abs2;
-  return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
+  return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
   using std::sqrt;
   Scalar sqrtx = sqrt(x.value());
-  return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
+  return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
   using std::cos;
   using std::sin;
-  return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));)
+  return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
   using std::sin;
   using std::cos;
-  return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));)
+  return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
   using std::exp;
   Scalar expx = exp(x.value());
-  return ReturnType(expx,x.derivatives() * expx);)
+  return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
   using std::log;
-  return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
+  return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
 
 template<typename DerType>
-inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar>, const typename internal::remove_all<DerType>::type> >
-pow(const Eigen::AutoDiffScalar<DerType>& x, typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar y)
+inline const Eigen::AutoDiffScalar<
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) >
+pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y)
 {
   using namespace Eigen;
-  typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
-  typedef typename Eigen::internal::traits<DerTypeCleaned>::Scalar Scalar;
-  return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerTypeCleaned> >(
-    std::pow(x.value(),y),
-    x.derivatives() * (y * std::pow(x.value(),y-1)));
+  using std::pow;
+  return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1)));
 }
 
 
@@ -622,27 +636,44 @@ atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
   using std::tan;
   using std::cos;
-  return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
+  return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
 
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
   using std::sqrt;
   using std::asin;
-  return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
+  return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
   
 EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
   using std::sqrt;
   using std::acos;
-  return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+  return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh, // d/dx tanh(x) = 1 / cosh(x)^2
+  using std::cosh;
+  using std::tanh;
+  return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh, // d/dx sinh(x) = cosh(x)
+  using std::sinh;
+  using std::cosh;
+  return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh, // d/dx cosh(x) = sinh(x)
+  using std::sinh;
+  using std::cosh;
+  return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));)
 
 #undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
 
 template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
-  : NumTraits< typename NumTraits<typename DerType::Scalar>::Real >
+  : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real >
 {
-  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime,
-                                DerType::Options, DerType::MaxRowsAtCompileTime, DerType::MaxColsAtCompileTime> > Real;
+  typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
+  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime,
+                                0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real;
   typedef AutoDiffScalar<DerType> NonInteger;
   typedef AutoDiffScalar<DerType> Nested;
+  typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal;
   enum{
     RequireInitialization = 1
   };
diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
deleted file mode 100644
index ad91fd9c4..000000000
--- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_AutoDiff_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_AutoDiff_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt
deleted file mode 100644
index b377d865c..000000000
--- a/unsupported/Eigen/src/BVH/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_BVH_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_BVH_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt
deleted file mode 100644
index a7e8c7553..000000000
--- a/unsupported/Eigen/src/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-ADD_SUBDIRECTORY(AutoDiff)
-ADD_SUBDIRECTORY(BVH)
-ADD_SUBDIRECTORY(Eigenvalues)
-ADD_SUBDIRECTORY(FFT)
-ADD_SUBDIRECTORY(IterativeSolvers)
-ADD_SUBDIRECTORY(LevenbergMarquardt)
-ADD_SUBDIRECTORY(MatrixFunctions)
-ADD_SUBDIRECTORY(MoreVectorization)
-ADD_SUBDIRECTORY(NonLinearOptimization)
-ADD_SUBDIRECTORY(NumericalDiff)
-ADD_SUBDIRECTORY(Polynomials)
-ADD_SUBDIRECTORY(Skyline)
-ADD_SUBDIRECTORY(SparseExtra)
-ADD_SUBDIRECTORY(KroneckerProduct)
-ADD_SUBDIRECTORY(Splines)
diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 3b6a69aff..866a8a460 100644
--- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -628,15 +628,15 @@ ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
       m_info = Success;
     }
 
-    delete select;
+    delete[] select;
   }
 
-  delete v;
-  delete iparam;
-  delete ipntr;
-  delete workd;
-  delete workl;
-  delete resid;
+  delete[] v;
+  delete[] iparam;
+  delete[] ipntr;
+  delete[] workd;
+  delete[] workl;
+  delete[] resid;
 
   m_isInitialized = true;
 
diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
deleted file mode 100644
index 1d4387c82..000000000
--- a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Eigenvalues_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Eigenvalues_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 000000000..40af550e8
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB Eigen_EulerAngles_SRCS "*.h")
+# Install the globbed EulerAngles headers as part of the Devel component.
+install(
+  FILES ${Eigen_EulerAngles_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles
+  COMPONENT Devel)
diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
new file mode 100644
index 000000000..13a0da1ab
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+namespace Eigen
+{
+  /*template<typename Other,
+         int OtherRows=Other::RowsAtCompileTime,
+         int OtherCols=Other::ColsAtCompileTime>
+  struct ei_eulerangles_assign_impl;*/
+
+  /** \class EulerAngles
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a rotation in a 3 dimensional space as three Euler angles.
+    *
+    * An Euler rotation is a sequence of three rotations, by three angles, about three fixed axes defined by the EulerSystem given as a template parameter.
+    * 
+    * Here is how intrinsic Euler angles work:
+    *  - first, rotate the axes system about the alpha axis by the angle alpha
+    *  - then, rotate the axes system about the beta axis (which was rotated in the first stage) by the angle beta
+    *  - then, rotate the axes system about the gamma axis (which was rotated in the two stages above) by the angle gamma
+    *
+    * \note This class supports only intrinsic Euler angles for simplicity;
+    *  see EulerSystem for how to easily overcome this for extrinsic systems.
+    *
+    * ### Rotation representation and conversions ###
+    *
+    * It has been proved (see the Wikipedia link below) that every rotation can be represented
+    *  by Euler angles, but the representation is not unique (unlike, e.g., rotation matrices).
+    * Therefore, you can convert from Eigen rotations to Euler angles and back
+    *  (including rotation matrices, which are not called "rotations" by Eigen design).
+    *
+    * Euler angles are usually used for:
+    *  - convenient human representation of rotation, especially in interactive GUI.
+    *  - gimbal systems and robotics
+    *  - efficient encoding(i.e. 3 floats only) of rotation for network protocols.
+    *
+    * However, Euler angles are slow compared to quaternions or matrices,
+    *  because of their unnatural mathematical definition, although they are simple for humans.
+    * To overcome this, this class provides easy conversion from the math-friendly representation
+    *  to the human-friendly representation, and vice versa.
+    *
+    * All the user needs to do is a safe, simple C++ type conversion,
+    *  and this class takes care of the math.
+    * Additionally, some axes-related computation is done at compile time.
+    *
+    * #### Euler angles ranges in conversions ####
+    *
+    * When converting a rotation to Euler angles, there are several ways to guarantee
+    *  the ranges of the resulting Euler angles.
+    *
+    * #### implicit ranges ####
+    * When using implicit ranges, all angles are guaranteed to be in the range [-PI, +PI],
+    *  unless you convert from some other Euler angles.
+    * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI).
+    * \sa EulerAngles(const MatrixBase<Derived>&)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&)
+    *
+    * #### explicit ranges ####
+    * When using explicit ranges, all angles are guaranteed to be in the range you choose.
+    * With the Boolean range parameter, you are asked whether you prefer the positive range or not:
+    * - _true_ - force the range between [0, +2*PI]
+    * - _false_ - force the range between [-PI, +PI]
+    *
+    * ##### compile time ranges #####
+    * Use this when the ranges are known at compile time and you prefer to
+    *  pass them as template parameters (e.g. for performance).
+    * \sa FromRotation()
+    *
+    * ##### run-time ranges #####
+    * Run-time ranges are also supported.
+    * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerAngles exist for float and double scalar,
+    *  in a form of EulerAngles{A}{B}{C}{scalar},
+    *  e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf.
+    *
+    * Convenient typedefs exist only for Euler systems with positive axes {+x,+y,+z}.
+    * If you need negative axes {-x,-y,-z}, it is recommended to create your own typedef with
+    *  a name that represents what you need.
+    *
+    * ### Example ###
+    *
+    * \include EulerAngles.cpp
+    * Output: \verbinclude EulerAngles.out
+    *
+    * ### Additional reading ###
+    *
+    * If you want to learn more about how Euler systems work in Eigen, see EulerSystem.
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _Scalar the scalar type, i.e., the type of the angles.
+    *
+    * \tparam _System the EulerSystem to use, which represents the axes of rotation.
+    */
+  template <typename _Scalar, class _System>
+  class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
+  {
+    public:
+      /** the scalar type of the angles */
+      typedef _Scalar Scalar;
+      
+      /** the EulerSystem to use, which represents the axes of rotation. */
+      typedef _System System;
+    
+      typedef Matrix<Scalar,3,3> Matrix3; /*!< the equivalent rotation matrix type */
+      typedef Matrix<Scalar,3,1> Vector3; /*!< the equivalent 3-dimensional vector type */
+      typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */
+      typedef AngleAxis<Scalar> AngleAxisType; /*!< the equivalent angle-axis type */
+      
+      /** \returns the axis vector of the first (alpha) rotation */
+      static Vector3 AlphaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1); // unit vector at index AlphaAxisAbs-1
+        return System::IsAlphaOpposite ? -u : u; // negated when the system flags the alpha axis as opposite
+      }
+      
+      /** \returns the axis vector of the second (beta) rotation */
+      static Vector3 BetaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1); // unit vector at index BetaAxisAbs-1
+        return System::IsBetaOpposite ? -u : u; // negated when the system flags the beta axis as opposite
+      }
+      
+      /** \returns the axis vector of the third (gamma) rotation */
+      static Vector3 GammaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1); // unit vector at index GammaAxisAbs-1
+        return System::IsGammaOpposite ? -u : u; // negated when the system flags the gamma axis as opposite
+      }
+
+    private:
+      Vector3 m_angles; // the three angles, stored in the order (alpha, beta, gamma)
+
+    public:
+      /** Default constructor without initialization. */
+      EulerAngles() {}
+      /** Constructs and initializes the Euler angles (\p alpha, \p beta, \p gamma). */
+      EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
+        m_angles(alpha, beta, gamma) {}
+      
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m.
+        *
+        * \note All angles will be in the range [-PI, PI].
+      */
+      template<typename Derived>
+      EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+      
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose the requested range for each angle.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const MatrixBase<Derived>& m,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
+      
+      /** Constructs and initializes Euler angles from a rotation \p rot.
+        *
+        * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
+        *  If rot is an EulerAngles, expected EulerAngles range is __undefined__.
+        *  (Use other functions here for enforcing range if this effect is desired)
+      */
+      template<typename Derived>
+      EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
+      
+      /** Constructs and initializes Euler angles from a rotation \p rot,
+        *  with options to choose the requested range for each angle.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The rotation to convert (it is first converted with toRotationMatrix())
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const RotationBase<Derived, 3>& rot,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
+
+      /** \returns The angle values stored in a vector (alpha, beta, gamma). */
+      const Vector3& angles() const { return m_angles; }
+      /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */
+      Vector3& angles() { return m_angles; }
+
+      /** \returns The value of the first angle. */
+      Scalar alpha() const { return m_angles[0]; }
+      /** \returns A read-write reference to the value of the first angle. */
+      Scalar& alpha() { return m_angles[0]; }
+
+      /** \returns The value of the second angle. */
+      Scalar beta() const { return m_angles[1]; }
+      /** \returns A read-write reference to the value of the second angle. */
+      Scalar& beta() { return m_angles[1]; }
+
+      /** \returns The value of the third angle. */
+      Scalar gamma() const { return m_angles[2]; }
+      /** \returns A read-write reference to the value of the third angle. */
+      Scalar& gamma() { return m_angles[2]; }
+
+      /** \returns The angles negated component-wise, (-alpha, -beta, -gamma).
+        *  NOTE(review): documented as the rotation inverse, but negation alone does not reverse the axis order -- confirm.
+      */
+      EulerAngles inverse() const
+      {
+        EulerAngles res;
+        res.m_angles = -m_angles;
+        return res;
+      }
+
+      /** \returns The same result as inverse(), i.e. the negated angles
+        *  (-alpha, -beta, -gamma).
+      */
+      EulerAngles operator -() const
+      {
+        return inverse();
+      }
+      
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose the requested range for each angle (__fixed at compile time__).
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \tparam PositiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const MatrixBase<Derived>& m)
+      {
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        
+        EulerAngles e;
+        System::template CalcEulerAngles<
+          PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
+        return e;
+      }
+      
+      /** Constructs and initializes Euler angles from a rotation \p rot,
+        *  with options to choose the requested range for each angle (__fixed at compile time__).
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The rotation to convert (it is first converted with toRotationMatrix())
+        * \tparam PositiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
+      {
+        return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
+      }
+      
+      /*EulerAngles& fromQuaternion(const QuaternionType& q)
+      {
+        // TODO: Implement it in a faster way for quaternions
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+        // Currently we compute all matrix cells from quaternion.
+
+        // Special case only for ZYX
+        //Scalar y2 = q.y() * q.y();
+        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+      }*/
+      
+      /** Sets \c *this from a rotation matrix (i.e. a pure orthogonal matrix with a determinant of +1). */
+      template<typename Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& m) {
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        
+        System::CalcEulerAngles(*this, m);
+        return *this;
+      }
+
+      // TODO: Assign and construct from another EulerAngles (with different system)
+      
+      /** Sets \c *this from a rotation, going through its rotation matrix. */
+      template<typename Derived>
+      EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
+        System::CalcEulerAngles(*this, rot.toRotationMatrix());
+        return *this;
+      }
+      
+      // TODO: Support isApprox function
+
+      /** \returns an equivalent 3x3 rotation matrix. */
+      Matrix3 toRotationMatrix() const
+      {
+        return static_cast<QuaternionType>(*this).toRotationMatrix(); // goes through the quaternion conversion operator below
+      }
+
+      /** Converts the Euler angles to a quaternion by composing the alpha, beta and gamma angle-axis rotations, in that order. */
+      operator QuaternionType() const
+      {
+        return
+          AngleAxisType(alpha(), AlphaAxisVector()) *
+          AngleAxisType(beta(), BetaAxisVector())   *
+          AngleAxisType(gamma(), GammaAxisVector());
+      }
+      
+      friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles)
+      {
+        s << eulerAngles.angles().transpose(); // the angle vector is transposed before being streamed
+        return s;
+      }
+  };
+
+#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) /* one typedef named EulerAngles<AXES><SCALAR_POSTFIX> */ \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX;
+// Emits the typedefs for all twelve AXES names listed below, for one scalar type.
+#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) /* AXES names starting with X */ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  /* AXES names starting with Y */ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  /* AXES names starting with Z */ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX)
+// Instantiate the typedefs for float (suffix f) and double (suffix d), e.g. EulerAnglesXYZf.
+EIGEN_EULER_ANGLES_TYPEDEFS(float, f)
+EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
+
+  namespace internal
+  {
+    template<typename _Scalar, class _System>
+    struct traits<EulerAngles<_Scalar, _System> > // traits specialization for EulerAngles
+    {
+      typedef _Scalar Scalar; // the scalar type of the angles
+    };
+  }
+  
+}
+
+#endif // EIGEN_EULERANGLESCLASS_H
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
new file mode 100644
index 000000000..98f9f647d
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -0,0 +1,326 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+namespace Eigen
+{
+  // Forward declarations
+  template <typename _Scalar, class _System>
+  class EulerAngles;
+  
+  namespace internal
+  {
+    // TODO: Check whether an equivalent helper already exists elsewhere in the API
+    template <int Num, bool IsPositive = (Num > 0)>
+    struct Abs
+    {
+      enum { value = Num }; // default case Num > 0: |Num| == Num
+    };
+  
+    template <int Num>
+    struct Abs<Num, false>
+    {
+      enum { value = -Num }; // default case Num <= 0: |Num| == -Num
+    };
+
+    template <int Axis>
+    struct IsValidAxis
+    {
+      enum { value = Axis != 0 && Abs<Axis>::value <= 3 }; // true only for Axis in {-3,-2,-1,1,2,3}
+    };
+  }
+  
+  #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
+  
+  /** \brief Representation of a fixed signed rotation axis for EulerSystem.
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * Values here represent:
+    *  - The axis of the rotation: X, Y or Z.
+    *  - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-)
+    *
+    * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z}
+    *
+    * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}.
+    */
+  enum EulerAxis
+  {
+    EULER_X = 1, /*!< the X axis */
+    EULER_Y = 2, /*!< the Y axis */
+    EULER_Z = 3  /*!< the Z axis */
+  };
+  
+  /** \class EulerSystem
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a fixed Euler rotation system.
+    *
+    * The goal of this meta-class is to represent the Euler system at compile time, for EulerAngles.
+    *
+    * You can use this class to get two things:
+    *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
+    *  - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan)
+    *
+    * An Euler rotation is a set of three rotations about fixed axes. (see \ref EulerAngles)
+    * This meta-class stores those signed axes as compile-time constants. (see \ref EulerAxis)
+    *
+    * ### Types of Euler systems ###
+    *
+    * All and only valid 3 dimension Euler rotation over standard
+    *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
+    *  - all axes X, Y, Z in each valid order (see below what order is valid)
+    *  - rotation over the axis is supported both over the positive and negative directions.
+    *  - both tait bryan and proper/classic Euler angles (i.e. the opposite).
+    *
+    * Since EulerSystem supports both positive and negative directions,
+    *  you may call this rotation distinction in other names:
+    *  - _right handed_ or _left handed_
+    *  - _counterclockwise_ or _clockwise_
+    *
+    * Notice that not all axes combinations are valid; an invalid one triggers a static assertion.
+    * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
+    * This yields two and only two classes:
+    *  - _tait bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+    *  - _proper/classic Euler angles_ - The first and the third unsigned axes are equal,
+    *     and the second is different, e.g. {X,Y,X}
+    *
+    * ### Intrinsic vs extrinsic Euler systems ###
+    *
+    * Only intrinsic Euler systems are supported for simplicity.
+    *  If you want to use extrinsic Euler systems,
+    *   just use the equivalent intrinsic system, with the order of axes and angles reversed.
+    *  I.e. axes (A,B,C) become (C,B,A), and angles (a,b,c) become (c,b,a).
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems),
+    *  in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ.
+    *
+    * ### Additional reading ###
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _AlphaAxis the first fixed EulerAxis
+    *
+    * \tparam _BetaAxis the second fixed EulerAxis
+    *
+    * \tparam _GammaAxis the third fixed EulerAxis
+    */
+  template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
+  class EulerSystem
+  {
+    public:
+    // It's defined this way and not as enum, because I think
+    //  that enum is not guaranteed to support negative numbers
+    
+    /** The first rotation axis */
+    static const int AlphaAxis = _AlphaAxis;
+    
+    /** The second rotation axis */
+    static const int BetaAxis = _BetaAxis;
+    
+    /** The third rotation axis */
+    static const int GammaAxis = _GammaAxis;
+
+    enum
+    {
+      AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */
+      BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
+      GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
+      
+      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
+      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+      
+      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+      IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
+
+      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is tait bryan */
+    };
+    
+    private:
+    
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value,
+      ALPHA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value,
+      BETA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value,
+      GAMMA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs,
+      ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
+      BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
+
+    enum
+    {
+      // I, J, K are the permuted pivot indices into the rotation matrix that match this Euler system.
+      // They are used by the converters of this class.
+      // They are always different from each other, and their possible values are: 0, 1, or 2.
+      I = AlphaAxisAbs - 1,
+      J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+      K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+    };
+    
+    // TODO: Take the @mat parameter in a form that avoids double evaluation.
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+      
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,K), mat(K,K));
+      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm(); // a 2-norm, hence non-negative
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) { // sign of res[0] checked against the parity of the system
+        if(res[0] > Scalar(0)) { // shift res[0] by pi in the direction that keeps it within [-pi, pi]
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
+        res[1] = atan2(-mat(I,K), -c2); // shifted branch: second argument is -c2
+      }
+      else
+        res[1] = atan2(-mat(I,K), c2); // unshifted branch: second argument is +c2
+      Scalar s1 = sin(res[0]); // sine and cosine of the (possibly shifted) first angle
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
+    }
+
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,I), mat(K,I));
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) // sign of res[0] checked against the parity of the system
+      {
+        if(res[0] > Scalar(0)) { // shift res[0] by pi in the direction that keeps it within [-pi, pi]
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm(); // a 2-norm, hence non-negative
+        res[1] = -atan2(s2, mat(I,I)); // shifted branch: the second angle is negated
+      }
+      else
+      {
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm(); // a 2-norm, hence non-negative
+        res[1] = atan2(s2, mat(I,I));
+      }
+
+      // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+      // we can compute their respective rotation, and apply its inverse to M. Since the result must
+      // be a rotation around x, we have:
+      //
+      //  c2  s1.s2 c1.s2                   1  0   0 
+      //  0   c1    -s1       *    M    =   0  c3  s3
+      //  -s2 s1.c2 c1.c2                   0 -s3  c3
+      //
+      //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+
+      Scalar s1 = sin(res[0]); // sine and cosine of the (possibly shifted) first angle
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, false, false, false); // no positive-range shift requested for any angle
+    }
+    
+    template<
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma,
+      typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma); // forward the template flags as run-time arguments
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma)
+    {
+      CalcEulerAngles_imp(
+        res.angles(), mat, // the overload is selected at compile time from IsTaitBryan
+        typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
+
+      if (IsAlphaOpposite == IsOdd) // negate each angle whose axis-sign flag equals the parity flag
+        res.alpha() = -res.alpha();
+        
+      if (IsBetaOpposite == IsOdd)
+        res.beta() = -res.beta();
+        
+      if (IsGammaOpposite == IsOdd)
+        res.gamma() = -res.gamma();
+      
+      // Shift negative angles by 2*pi where a positive range was requested
+      if (PositiveRangeAlpha && (res.alpha() < 0))
+        res.alpha() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeBeta && (res.beta() < 0))
+        res.beta() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeGamma && (res.gamma() < 0))
+        res.gamma() += Scalar(2 * EIGEN_PI);
+    }
+    
+    template <typename _Scalar, class _System>
+    friend class Eigen::EulerAngles;
+  };
+
+#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C;
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z)
+}
+
+#endif // EIGEN_EULERSYSTEM_H
diff --git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt
deleted file mode 100644
index edcffcb18..000000000
--- a/unsupported/Eigen/src/FFT/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_FFT_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_FFT_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
deleted file mode 100644
index 7986afc5e..000000000
--- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_IterativeSolvers_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index fbe21fc7e..5a82b0df6 100644
--- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -62,7 +62,7 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition
   typedef typename Dest::RealScalar RealScalar;
   typedef typename Dest::Scalar Scalar;
   typedef Matrix < Scalar, Dynamic, 1 > VectorType;
-  typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType;
+  typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
 
   RealScalar tol = tol_error;
   const Index maxIters = iters;
@@ -157,7 +157,8 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition
     // insert coefficients into upper matrix triangle
     H.col(k-1).head(k) = v.head(k);
 
-    bool stop = (k==m || abs(w(k)) < tol * r0Norm || iters == maxIters);
+    tol_error = abs(w(k)) / r0Norm;
+    bool stop = (k==m || tol_error < tol || iters == maxIters);
 
     if (stop || k == restart)
     {
diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
deleted file mode 100644
index 4daefebee..000000000
--- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_KroneckerProduct_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index 4d3e5358e..582fa8512 100644
--- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -203,7 +203,7 @@ struct traits<KroneckerProduct<_Lhs,_Rhs> >
 {
   typedef typename remove_all<_Lhs>::type Lhs;
   typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
   typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
   enum {
@@ -222,7 +222,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
   typedef MatrixXpr XprKind;
   typedef typename remove_all<_Lhs>::type Lhs;
   typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
   typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind;
   typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
@@ -239,7 +239,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
     RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
 
     Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeNestingBit | EvalBeforeAssigningBit,
+          | EvalBeforeNestingBit,
     CoeffReadCost = HugeCost
   };
 
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
deleted file mode 100644
index d9690854d..000000000
--- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_LevenbergMarquardt_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index b30e0a90a..995427978 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -304,7 +304,7 @@ LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType  &x)
 //     m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
     if (!m_useExternalScaling)
         m_diag.resize(n);
-    eigen_assert( (!m_useExternalScaling || m_diag.size()==n) || "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
+    eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
     m_qtf.resize(n);
 
     /* Function Body */
diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
deleted file mode 100644
index cdde64d2c..000000000
--- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MatrixFunctions_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_MatrixFunctions_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index bbb7e5776..4bb1852b6 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -65,7 +65,7 @@ template <typename MatrixType>
 void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V)
 {
   typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
-  const RealScalar b[] = {120., 60., 12., 1.};
+  const RealScalar b[] = {120.L, 60.L, 12.L, 1.L};
   const MatrixType A2 = A * A;
   const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
   U.noalias() = A * tmp;
@@ -81,7 +81,7 @@ template <typename MatrixType>
 void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V)
 {
   typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
-  const RealScalar b[] = {30240., 15120., 3360., 420., 30., 1.};
+  const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L};
   const MatrixType A2 = A * A;
   const MatrixType A4 = A2 * A2;
   const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
@@ -98,7 +98,7 @@ template <typename MatrixType>
 void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V)
 {
   typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
-  const RealScalar b[] = {17297280., 8648640., 1995840., 277200., 25200., 1512., 56., 1.};
+  const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L};
   const MatrixType A2 = A * A;
   const MatrixType A4 = A2 * A2;
   const MatrixType A6 = A4 * A2;
@@ -118,8 +118,8 @@ template <typename MatrixType>
 void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V)
 {
   typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
-  const RealScalar b[] = {17643225600., 8821612800., 2075673600., 302702400., 30270240.,
-                          2162160., 110880., 3960., 90., 1.};
+  const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L,
+                          2162160.L, 110880.L, 3960.L, 90.L, 1.L};
   const MatrixType A2 = A * A;
   const MatrixType A4 = A2 * A2;
   const MatrixType A6 = A4 * A2;
@@ -139,9 +139,9 @@ template <typename MatrixType>
 void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V)
 {
   typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
-  const RealScalar b[] = {64764752532480000., 32382376266240000., 7771770303897600.,
-                          1187353796428800., 129060195264000., 10559470521600., 670442572800.,
-                          33522128640., 1323241920., 40840800., 960960., 16380., 182., 1.};
+  const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L,
+                          1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L,
+                          33522128640.L, 1323241920.L, 40840800.L, 960960.L, 16380.L, 182.L, 1.L};
   const MatrixType A2 = A * A;
   const MatrixType A4 = A2 * A2;
   const MatrixType A6 = A4 * A2;
@@ -210,9 +210,9 @@ struct matrix_exp_computeUV<MatrixType, float>
     using std::pow;
     const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
     squarings = 0;
-    if (l1norm < 4.258730016922831e-001) {
+    if (l1norm < 4.258730016922831e-001f) {
       matrix_exp_pade3(arg, U, V);
-    } else if (l1norm < 1.880152677804762e+000) {
+    } else if (l1norm < 1.880152677804762e+000f) {
       matrix_exp_pade5(arg, U, V);
     } else {
       const float maxnorm = 3.925724783138660f;
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 8f7a6f3b0..db2449d02 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -132,6 +132,7 @@ template <typename EivalsType, typename Cluster>
 void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
 {
   typedef typename EivalsType::Index Index;
+  typedef typename EivalsType::RealScalar RealScalar;
   for (Index i=0; i<eivals.rows(); ++i) {
     // Find cluster containing i-th ei'val, adding a new cluster if necessary
     typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters);
@@ -145,7 +146,7 @@ void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<C
 
     // Look for other element to add to the set
     for (Index j=i+1; j<eivals.rows(); ++j) {
-      if (abs(eivals(j) - eivals(i)) <= matrix_function_separation
+      if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation)
           && std::find(qi->begin(), qi->end(), j) == qi->end()) {
         typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters);
         if (qj == clusters.end()) {
@@ -403,11 +404,10 @@ struct matrix_function_compute<MatrixType, 0>
     typedef internal::traits<MatrixType> Traits;
     typedef typename Traits::Scalar Scalar;
     static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime;
-    static const int Options = MatrixType::Options;
     static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime;
 
     typedef std::complex<Scalar> ComplexScalar;
-    typedef Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols> ComplexMatrix;
+    typedef Matrix<ComplexScalar, Rows, Cols, 0, MaxRows, MaxCols> ComplexMatrix;
 
     ComplexMatrix CA = A.template cast<ComplexScalar>();
     ComplexMatrix Cresult;
@@ -508,9 +508,8 @@ template<typename Derived> class MatrixFunctionReturnValue
       typedef internal::traits<NestedEvalTypeClean> Traits;
       static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
       static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-      static const int Options = NestedEvalTypeClean::Options;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
 
       typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
       AtomicType atomic(m_f);
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index e43e86e90..1acfbed9e 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -37,6 +37,7 @@ template <typename MatrixType>
 void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
 {
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
   using std::abs;
   using std::ceil;
   using std::imag;
@@ -54,14 +55,14 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   {
     result(0,1) = A(0,1) / A(0,0);
   }
-  else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+  else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
   {
     result(0,1) = A(0,1) * (logA11 - logA00) / y;
   }
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
+    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)));
     result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
   }
 }
@@ -232,8 +233,8 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
   MatrixType T = A, sqrtT;
 
   int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
-  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1:                     // single precision
-                                    maxPadeDegree<= 7? 2.6429608311114350e-1:                     // double precision
+  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
+                                    maxPadeDegree<= 7? 2.6429608311114350e-1L:                    // double precision
                                     maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
                                     maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
                                                        1.1880960220216759245467951592883642e-1L;  // quadruple precision
@@ -333,9 +334,8 @@ public:
     typedef internal::traits<DerivedEvalTypeClean> Traits;
     static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
     static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-    static const int Options = DerivedEvalTypeClean::Options;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
     typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
     AtomicType atomic;
     
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index f37d31c3f..ebc433d89 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -196,11 +196,11 @@ void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
 {
   using std::ldexp;
   const int digits = std::numeric_limits<RealScalar>::digits;
-  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1f:                           // sigle precision
-				    digits <=  53? 2.789358995219730e-1:                    // double precision
-				    digits <=  64? 2.4471944416607995472e-1L:               // extended precision
-				    digits <= 106? 1.1016843812851143391275867258512e-1L:   // double-double
-						   9.134603732914548552537150753385375e-2L; // quadruple precision
+  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1L                            // single precision
+                                  : digits <=  53? 2.789358995219730e-1L                    // double precision
+                                  : digits <=  64? 2.4471944416607995472e-1L                // extended precision
+                                  : digits <= 106? 1.1016843812851143391275867258512e-1L    // double-double
+                                  :                9.134603732914548552537150753385375e-2L; // quadruple precision
   MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
   RealScalar normIminusT;
   int degree, degree2, numberOfSquareRoots = 0;
@@ -264,7 +264,7 @@ inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT)
       1.999045567181744e-1L, 2.789358995219730e-1L };
 #elif LDBL_MANT_DIG <= 64
   const int maxPadeDegree = 8;
-  const double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
+  const long double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
       6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L };
 #elif LDBL_MANT_DIG <= 106
   const int maxPadeDegree = 10;
@@ -298,7 +298,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
 
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
+  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
   ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
@@ -383,7 +383,7 @@ class MatrixPower : internal::noncopyable
 
   private:
     typedef std::complex<RealScalar> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, MatrixType::Options,
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0,
               MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> ComplexMatrix;
 
     /** \brief Reference to the base of matrix power. */
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index 9f08c6162..afd88ec4d 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -65,21 +65,6 @@ void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, ty
   sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
 }
 
-// similar to compute1x1offDiagonalBlock()
-template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
-{
-  typedef typename traits<MatrixType>::Scalar Scalar;
-  Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
-  Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
-  Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
-  if (j-i > 2)
-    C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
-  Matrix<Scalar,2,2> X;
-  matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
-  sqrtT.template block<2,2>(i,j) = X;
-}
-
 // solves the equation A X + X B = C where all matrices are 2-by-2
 template <typename MatrixType>
 void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C)
@@ -98,13 +83,13 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
   coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
   coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
   coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
-  
+
   Matrix<Scalar,4,1> rhs;
   rhs.coeffRef(0) = C.coeff(0,0);
   rhs.coeffRef(1) = C.coeff(0,1);
   rhs.coeffRef(2) = C.coeff(1,0);
   rhs.coeffRef(3) = C.coeff(1,1);
-  
+
   Matrix<Scalar,4,1> result;
   result = coeffMatrix.fullPivLu().solve(rhs);
 
@@ -114,6 +99,20 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
   X.coeffRef(1,1) = result.coeff(3);
 }
 
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+{
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
+  Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
+  Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
+  if (j-i > 2)
+    C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
+  Matrix<Scalar,2,2> X;
+  matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
+  sqrtT.template block<2,2>(i,j) = X;
+}
 
 // pre:  T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
 // post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
deleted file mode 100644
index 1b887cc8e..000000000
--- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MoreVectorization_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_MoreVectorization_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
deleted file mode 100644
index 9322ddadf..000000000
--- a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_NonLinearOptimization_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index b8ba6ddcb..8fe3ed86b 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -150,7 +150,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType  &x)
     fjac.resize(n, n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
 
     /* Function Body */
     nfev = 0;
@@ -390,7 +390,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType  &
     fvec.resize(n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
 
     /* Function Body */
     nfev = 0;
diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
index 69106ddc5..fe3b79ca7 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -179,7 +179,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType  &x)
     fjac.resize(m, n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
     qtf.resize(n);
 
     /* Function Body */
@@ -215,7 +215,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType  &x)
 {
     using std::abs;
     using std::sqrt;
-    
+
     eigen_assert(x.size()==n); // check the caller is not cheating us
 
     /* calculate the jacobian matrix. */
@@ -398,7 +398,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType
     fjac.resize(n, n);
     if (!useExternalScaling)
         diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+    eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
     qtf.resize(n);
 
     /* Function Body */
diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
deleted file mode 100644
index 1199aca2f..000000000
--- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NumericalDiff_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_NumericalDiff_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt
deleted file mode 100644
index 51f13f3cb..000000000
--- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Polynomials_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Polynomials_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt
deleted file mode 100644
index 3bf1b0dd4..000000000
--- a/unsupported/Eigen/src/Skyline/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Skyline_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Skyline_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
deleted file mode 100644
index 7ea32ca5e..000000000
--- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseExtra_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseExtra_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
new file mode 100644
index 000000000..ed415db99
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -0,0 +1,124 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
+  *
+  * This function computes the coefficient-wise incomplete gamma function.
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of igamma(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igammac(), Eigen::lgamma()
+  */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived()
+  );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
+  *
+  * This function computes the coefficient-wise complementary incomplete gamma function.
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived()
+  );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays.
+  *
+  * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x.
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::digamma()
+  */
+// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+// * \sa ArrayBase::polygamma()
+template<typename DerivedN,typename DerivedX>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
+    n.derived(),
+    x.derived()
+  );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a a, \a b, \a x) to the given arrays.
+  *
+  * This function computes the regularized incomplete beta function (integral).
+  *
+  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::betainc(), Eigen::lgamma()
+  */
+template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
+inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
+{
+  return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
+    a.derived(),
+    b.derived(),
+    x.derived()
+  );
+}
+
+
+/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays.
+  *
+  * It returns the Riemann zeta function of two arguments \a x and \a q:
+  *
+  * \param x is the exponent, it must be > 1
+  * \param q is the shift, it must be > 0
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types, the user has
+  * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+  *
+  * \sa ArrayBase::zeta()
+  */
+template<typename DerivedX,typename DerivedQ>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
+{
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
+    x.derived(),
+    q.derived()
+  );
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
new file mode 100644
index 000000000..d8f2363be
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/** \internal
+  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igamma
+  */
+template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igamma; return igamma(a, x);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigamma(a, x);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igamma_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGamma
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igammac
+  */
+template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igammac; return igammac(a, x);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
+  {
+    return internal::pigammac(a, x);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igammac_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammac
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
+  *
+  */
+template<typename Scalar> struct scalar_betainc_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const {
+    using numext::betainc; return betainc(x, a, b);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const
+  {
+    return internal::pbetainc(x, a, b);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_betainc_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBetaInc
+  };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the natural log of the absolute
+ * value of Gamma of a scalar
+ * \sa class CwiseUnaryOp, Cwise::lgamma()
+ */
+template<typename Scalar> struct scalar_lgamma_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::lgamma; return lgamma(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasLGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
+ * \sa class CwiseUnaryOp, Cwise::digamma()
+ */
+template<typename Scalar> struct scalar_digamma_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::digamma; return digamma(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasDiGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Riemann Zeta function of two arguments.
+ * \sa class CwiseBinaryOp, Cwise::zeta()
+ */
+template<typename Scalar> struct scalar_zeta_op {
+    EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
+    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
+        using numext::zeta; return zeta(x, q);
+    }
+    typedef typename packet_traits<Scalar>::type Packet;
+    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_zeta_op<Scalar> >
+{
+    enum {
+        // Guesstimate
+        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+        PacketAccess = packet_traits<Scalar>::HasZeta
+    };
+};
+
+/** \internal
+ * \brief Template functor to compute the polygamma function.
+ * \sa class CwiseBinaryOp, Cwise::polygamma()
+ */
+template<typename Scalar> struct scalar_polygamma_op {
+    EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
+    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
+        using numext::polygamma; return polygamma(n, x);
+    }
+    typedef typename packet_traits<Scalar>::type Packet;
+    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_polygamma_op<Scalar> >
+{
+    enum {
+        // Guesstimate
+        Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+        PacketAccess = packet_traits<Scalar>::HasPolygamma
+    };
+};
+
+/** \internal
+ * \brief Template functor to compute the Gauss error function of a
+ * scalar
+ * \sa class CwiseUnaryOp, Cwise::erf()
+ */
+template<typename Scalar> struct scalar_erf_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::erf; return erf(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErf
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Complementary Error Function
+ * of a scalar
+ * \sa class CwiseUnaryOp, Cwise::erfc()
+ */
+template<typename Scalar> struct scalar_erfc_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::erfc; return erfc(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErfc
+  };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
new file mode 100644
index 000000000..553bcda6a
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H
+#define EIGEN_SPECIALFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_HALF_H
diff --git a/Eigen/src/Core/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
similarity index 66%
rename from Eigen/src/Core/SpecialFunctions.h
rename to unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 3513a5c63..52619fc0c 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -98,7 +98,7 @@ struct polevl<Scalar, 0> {
 }  // end namespace cephes
 
 /****************************************************************************
- * Implementation of lgamma                                                 *
+ * Implementation of lgamma, requires C++11/C99                             *
  ****************************************************************************/
 
 template <typename Scalar>
@@ -116,7 +116,7 @@ struct lgamma_retval {
   typedef Scalar type;
 };
 
-#ifdef EIGEN_HAS_C99_MATH
+#if EIGEN_HAS_C99_MATH
 template <>
 struct lgamma_impl<float> {
   EIGEN_DEVICE_FUNC
@@ -131,7 +131,7 @@ struct lgamma_impl<double> {
 #endif
 
 /****************************************************************************
- * Implementation of digamma (psi)                                          *
+ * Implementation of digamma (psi), based on Cephes                         *
  ****************************************************************************/
 
 template <typename Scalar>
@@ -139,20 +139,6 @@ struct digamma_retval {
   typedef Scalar type;
 };
 
-#ifndef EIGEN_HAS_C99_MATH
-
-template <typename Scalar>
-struct digamma_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(Scalar x) {
-    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return Scalar(0);
-  }
-};
-
-#else
-
 /*
  *
  * Polynomial evaluation helper for the Psi (digamma) function.
@@ -284,7 +270,7 @@ struct digamma_impl {
     bool negative = false;
 
     const Scalar maxnum = NumTraits<Scalar>::infinity();
-    const Scalar m_pi(EIGEN_PI);
+    const Scalar m_pi = Scalar(EIGEN_PI);
 
     const Scalar zero = Scalar(0);
     const Scalar one = Scalar(1);
@@ -331,10 +317,8 @@ struct digamma_impl {
   }
 };
 
-#endif  // EIGEN_HAS_C99_MATH
-
 /****************************************************************************
- * Implementation of erf                                                    *
+ * Implementation of erf, requires C++11/C99                                *
  ****************************************************************************/
 
 template <typename Scalar>
@@ -352,7 +336,7 @@ struct erf_retval {
   typedef Scalar type;
 };
 
-#ifdef EIGEN_HAS_C99_MATH
+#if EIGEN_HAS_C99_MATH
 template <>
 struct erf_impl<float> {
   EIGEN_DEVICE_FUNC
@@ -367,7 +351,7 @@ struct erf_impl<double> {
 #endif  // EIGEN_HAS_C99_MATH
 
 /***************************************************************************
-* Implementation of erfc                                                   *
+* Implementation of erfc, requires C++11/C99                               *
 ****************************************************************************/
 
 template <typename Scalar>
@@ -385,7 +369,7 @@ struct erfc_retval {
   typedef Scalar type;
 };
 
-#ifdef EIGEN_HAS_C99_MATH
+#if EIGEN_HAS_C99_MATH
 template <>
 struct erfc_impl<float> {
   EIGEN_DEVICE_FUNC
@@ -399,16 +383,62 @@ struct erfc_impl<double> {
 };
 #endif  // EIGEN_HAS_C99_MATH
 
-/****************************************************************************
- * Implementation of igammac (complemented incomplete gamma integral)       *
- ****************************************************************************/
+/**************************************************************************************************************
+ * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ **************************************************************************************************************/
 
 template <typename Scalar>
 struct igammac_retval {
   typedef Scalar type;
 };
 
-#ifndef EIGEN_HAS_C99_MATH
+// NOTE: cephes_helper is also used to implement zeta
+template <typename Scalar>
+struct cephes_helper {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; }
+};
+
+template <>
+struct cephes_helper<float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float machep() {
+    return NumTraits<float>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float big() {
+    // use epsneg (1.0 - epsneg == 1.0)
+    return 1.0f / (NumTraits<float>::epsilon() / 2);
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float biginv() {
+    // epsneg
+    return machep();
+  }
+};
+
+template <>
+struct cephes_helper<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double machep() {
+    return NumTraits<double>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double big() {
+    return 1.0 / NumTraits<double>::epsilon();
+  }
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double biginv() {
+    // inverse of big(), i.e. eps
+    return NumTraits<double>::epsilon();
+  }
+};
+
+#if !EIGEN_HAS_C99_MATH
 
 template <typename Scalar>
 struct igammac_impl {
@@ -424,39 +454,6 @@ struct igammac_impl {
 
 template <typename Scalar> struct igamma_impl;  // predeclare igamma_impl
 
-template <typename Scalar>
-struct igamma_helper {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
-};
-
-template <>
-struct igamma_helper<float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float machep() {
-    return NumTraits<float>::epsilon() / 2;  // 1.0 - machep == 1.0
-  }
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float big() {
-    // use epsneg (1.0 - epsneg == 1.0)
-    return 1.0 / (NumTraits<float>::epsilon() / 2);
-  }
-};
-
-template <>
-struct igamma_helper<double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double machep() {
-    return NumTraits<double>::epsilon() / 2;  // 1.0 - machep == 1.0
-  }
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double big() {
-    return 1.0 / NumTraits<double>::epsilon();
-  }
-};
-
 template <typename Scalar>
 struct igammac_impl {
   EIGEN_DEVICE_FUNC
@@ -553,10 +550,10 @@ struct igammac_impl {
     const Scalar zero = 0;
     const Scalar one = 1;
     const Scalar two = 2;
-    const Scalar machep = igamma_helper<Scalar>::machep();
+    const Scalar machep = cephes_helper<Scalar>::machep();
     const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
-    const Scalar big = igamma_helper<Scalar>::big();
-    const Scalar biginv = 1 / big;
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
     const Scalar inf = NumTraits<Scalar>::infinity();
 
     Scalar ans, ax, c, yc, r, t, y, z;
@@ -605,7 +602,9 @@ struct igammac_impl {
         qkm2 *= biginv;
         qkm1 *= biginv;
       }
-      if (t <= machep) break;
+      if (t <= machep) {
+        break;
+      }
     }
 
     return (ans * ax);
@@ -614,16 +613,16 @@ struct igammac_impl {
 
 #endif  // EIGEN_HAS_C99_MATH
 
-/****************************************************************************
- * Implementation of igamma (incomplete gamma integral)                     *
- ****************************************************************************/
+/************************************************************************************************
+ * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
 
 template <typename Scalar>
 struct igamma_retval {
   typedef Scalar type;
 };
 
-#ifndef EIGEN_HAS_C99_MATH
+#if !EIGEN_HAS_C99_MATH
 
 template <typename Scalar>
 struct igamma_impl {
@@ -739,10 +738,10 @@ struct igamma_impl {
   EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
     const Scalar zero = 0;
     const Scalar one = 1;
-    const Scalar machep = igamma_helper<Scalar>::machep();
+    const Scalar machep = cephes_helper<Scalar>::machep();
     const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
 
-    double ans, ax, c, r;
+    Scalar ans, ax, c, r;
 
     /* Compute  x**a * exp(-x) / gamma(a)  */
     ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
@@ -761,7 +760,9 @@ struct igamma_impl {
       r += one;
       c *= x/r;
       ans += c;
-      if (c/ans <= machep) break;
+      if (c/ans <= machep) {
+        break;
+      }
     }
 
     return (ans * ax / a);
@@ -770,28 +771,14 @@ struct igamma_impl {
 
 #endif  // EIGEN_HAS_C99_MATH
 
-/****************************************************************************
- * Implementation of Riemann zeta function of two arguments                 *
- ****************************************************************************/
+/*****************************************************************************
+ * Implementation of Riemann zeta function of two arguments, based on Cephes *
+ *****************************************************************************/
 
 template <typename Scalar>
 struct zeta_retval {
     typedef Scalar type;
 };
-    
-#ifndef EIGEN_HAS_C99_MATH
-    
-template <typename Scalar>
-struct zeta_impl {
-    EIGEN_DEVICE_FUNC
-    static EIGEN_STRONG_INLINE Scalar run(Scalar x, Scalar q) {
-        EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
-                            THIS_TYPE_IS_NOT_SUPPORTED);
-        return Scalar(0);
-    }
-};
-    
-#else
 
 template <typename Scalar>
 struct zeta_impl_series {
@@ -928,7 +915,7 @@ struct zeta_impl {
             
         const Scalar maxnum = NumTraits<Scalar>::infinity();
         const Scalar zero = 0.0, half = 0.5, one = 1.0;
-        const Scalar machep = igamma_helper<Scalar>::machep();
+        const Scalar machep = cephes_helper<Scalar>::machep();
         const Scalar nan = NumTraits<Scalar>::quiet_NaN();
         
         if( x == one )
@@ -976,8 +963,9 @@ struct zeta_impl {
             t = a*b/A[i];
             s = s + t;
             t = numext::abs(t/s);
-            if( t < machep )
-                return s;
+            if( t < machep ) {
+              break;
+            }
             k += one;
             a *= x + k;
             b /= w;
@@ -986,11 +974,9 @@ struct zeta_impl {
         return s;
   }
 };
-    
-#endif  // EIGEN_HAS_C99_MATH
 
 /****************************************************************************
- * Implementation of polygamma function                                     *
+ * Implementation of polygamma function, requires C++11/C99                 *
  ****************************************************************************/
 
 template <typename Scalar>
@@ -998,7 +984,7 @@ struct polygamma_retval {
     typedef Scalar type;
 };
     
-#ifndef EIGEN_HAS_C99_MATH
+#if !EIGEN_HAS_C99_MATH
     
 template <typename Scalar>
 struct polygamma_impl {
@@ -1038,6 +1024,467 @@ struct polygamma_impl {
     
 #endif  // EIGEN_HAS_C99_MATH
 
+/************************************************************************************************
+ * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct betainc_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct betainc_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
+
+template <typename Scalar>
+struct betainc_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) {
+    /*	betaincf.c
+     *
+     *	Incomplete beta integral
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float a, b, x, y, betaincf();
+     *
+     * y = betaincf( a, b, x );
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns incomplete beta integral of the arguments, evaluated
+     * from zero to x.  The function is defined as
+     *
+     *                  x
+     *     -            -
+     *    | (a+b)      | |  a-1     b-1
+     *  -----------    |   t   (1-t)   dt.
+     *   -     -     | |
+     *  | (a) | (b)   -
+     *                 0
+     *
+     * The domain of definition is 0 <= x <= 1.  In this
+     * implementation a and b are restricted to positive values.
+     * The integral from x to 1 may be obtained by the symmetry
+     * relation
+     *
+     *    1 - betainc( a, b, x )  =  betainc( b, a, 1-x ).
+     *
+     * The integral is evaluated by a continued fraction expansion.
+     * If a < 1, the function calls itself recursively after a
+     * transformation to increase a to a+1.
+     *
+     * ACCURACY (float):
+     *
+     * Tested at random points (a,b,x) with a and b in the indicated
+     * interval and x between 0 and 1.
+     *
+     * arithmetic   domain     # trials      peak         rms
+     * Relative error:
+     *    IEEE       0,30       10000       3.7e-5      5.1e-6
+     *    IEEE       0,100      10000       1.7e-4      2.5e-5
+     * The useful domain for relative error is limited by underflow
+     * of the single precision exponential function.
+     * Absolute error:
+     *    IEEE       0,30      100000       2.2e-5      9.6e-7
+     *    IEEE       0,100      10000       6.5e-5      3.7e-6
+     *
+     * Larger errors may occur for extreme ratios of a and b.
+     *
+     * ACCURACY (double):
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,5         10000       6.9e-15     4.5e-16
+     *    IEEE      0,85       250000       2.2e-13     1.7e-14
+     *    IEEE      0,1000      30000       5.3e-12     6.3e-13
+     *    IEEE      0,10000    250000       9.3e-11     7.1e-12
+     *    IEEE      0,100000    10000       8.7e-10     4.8e-11
+     * Outputs smaller than the IEEE gradual underflow threshold
+     * were excluded from these statistics.
+     *
+     * ERROR MESSAGES:
+     *   message         condition      value returned
+     * incbet domain      x<0, x>1          nan
+     * incbet underflow                     nan
+     */
+
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True)
+ * Continued fraction expansion #2 for incomplete beta integral (small_branch = False)
+ */
+template <typename Scalar>
+struct incbeta_cfe {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value ||
+                         internal::is_same<Scalar, double>::value),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+
+    Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+    Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update;
+    Scalar ans;
+    int n;
+
+    const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300;
+    const Scalar thresh =
+        (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep;
+    Scalar r = (internal::is_same<Scalar, float>::value) ? zero : one;
+
+    if (small_branch) {
+      k1 = a;
+      k2 = a + b;
+      k3 = a;
+      k4 = a + one;
+      k5 = one;
+      k6 = b - one;
+      k7 = k4;
+      k8 = a + two;
+      k26update = one;
+    } else {
+      k1 = a;
+      k2 = b - one;
+      k3 = a;
+      k4 = a + one;
+      k5 = one;
+      k6 = a + b;
+      k7 = a + one;
+      k8 = a + two;
+      k26update = -one;
+      x = x / (one - x);
+    }
+
+    pkm2 = zero;
+    qkm2 = one;
+    pkm1 = one;
+    qkm1 = one;
+    ans = one;
+    n = 0;
+
+    do {
+      xk = -(x * k1 * k2) / (k3 * k4);
+      pk = pkm1 + pkm2 * xk;
+      qk = qkm1 + qkm2 * xk;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      xk = (x * k5 * k6) / (k7 * k8);
+      pk = pkm1 + pkm2 * xk;
+      qk = qkm1 + qkm2 * xk;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      if (qk != zero) {
+        r = pk / qk;
+        if (numext::abs(ans - r) < numext::abs(r) * thresh) {
+          return r;
+        }
+        ans = r;
+      }
+
+      k1 += one;
+      k2 += k26update;
+      k3 += two;
+      k4 += two;
+      k5 += one;
+      k6 -= k26update;
+      k7 += two;
+      k8 += two;
+
+      if ((numext::abs(qk) + numext::abs(pk)) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+      }
+      if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) {
+        pkm2 *= big;
+        pkm1 *= big;
+        qkm2 *= big;
+        qkm1 *= big;
+      }
+    } while (++n < num_iters);
+
+    return ans;
+  }
+};
+
+/* Helper functions depending on the Scalar type */
+template <typename Scalar>
+struct betainc_helper {};
+
+template <>
+struct betainc_helper<float> {
+  /* Core implementation, assumes a is large (> 1.0) */
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb,
+                                                            float xx) {
+    float ans, a, b, t, x, onemx;
+    bool reversed_a_b = false;
+
+    onemx = 1.0f - xx;
+
+    /* see if x is greater than the mean */
+    if (xx > (aa / (aa + bb))) {
+      reversed_a_b = true;
+      a = bb;
+      b = aa;
+      t = xx;
+      x = onemx;
+    } else {
+      a = aa;
+      b = bb;
+      t = onemx;
+      x = xx;
+    }
+
+    /* Choose expansion for optimal convergence */
+    if (b > 10.0f) {
+      if (numext::abs(b * x / a) < 0.3f) {
+        t = betainc_helper<float>::incbps(a, b, x);
+        if (reversed_a_b) t = 1.0f - t;
+        return t;
+      }
+    }
+
+    ans = x * (a + b - 2.0f) / (a - 1.0f);
+    if (ans < 1.0f) {
+      ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */);
+      t = b * numext::log(t);
+    } else {
+      ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */);
+      t = (b - 1.0f) * numext::log(t);
+    }
+
+    t += a * numext::log(x) + lgamma_impl<float>::run(a + b) -
+         lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b);
+    t += numext::log(ans / a);
+    t = numext::exp(t);
+
+    if (reversed_a_b) t = 1.0f - t;
+    return t;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) {
+    float t, u, y, s;
+    const float machep = cephes_helper<float>::machep();
+
+    y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a);
+    y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b);
+    y += lgamma_impl<float>::run(a + b);
+
+    t = x / (1.0f - x);
+    s = 0.0f;
+    u = 1.0f;
+    do {
+      b -= 1.0f;
+      if (b == 0.0f) {
+        break;
+      }
+      a += 1.0f;
+      u *= t * b / a;
+      s += u;
+    } while (numext::abs(u) > machep);
+
+    return numext::exp(y) * (1.0f + s);
+  }
+};
+
+template <>
+struct betainc_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static float run(float a, float b, float x) {
+    const float nan = NumTraits<float>::quiet_NaN();
+    float ans, t;
+
+    if (a <= 0.0f) return nan;
+    if (b <= 0.0f) return nan;
+    if ((x <= 0.0f) || (x >= 1.0f)) {
+      if (x == 0.0f) return 0.0f;
+      if (x == 1.0f) return 1.0f;
+      // mtherr("betaincf", DOMAIN);
+      return nan;
+    }
+
+    /* transformation for small aa */
+    if (a <= 1.0f) {
+      ans = betainc_helper<float>::incbsa(a + 1.0f, b, x);
+      t = a * numext::log(x) + b * numext::log1p(-x) +
+          lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) -
+          lgamma_impl<float>::run(b);
+      return (ans + numext::exp(t));
+    } else {
+      return betainc_helper<float>::incbsa(a, b, x);
+    }
+  }
+};
+
+template <>
+struct betainc_helper<double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) {
+    const double machep = cephes_helper<double>::machep();
+
+    double s, t, u, v, n, t1, z, ai;
+
+    ai = 1.0 / a;
+    u = (1.0 - b) * x;
+    v = u / (a + 1.0);
+    t1 = v;
+    t = u;
+    n = 2.0;
+    s = 0.0;
+    z = machep * ai;
+    while (numext::abs(v) > z) {
+      u = (n - b) * x / n;
+      t *= u;
+      v = t / (a + n);
+      s += v;
+      n += 1.0;
+    }
+    s += t1;
+    s += ai;
+
+    u = a * numext::log(x);
+    // TODO: gamma() is not directly implemented in Eigen.
+    /*
+    if ((a + b) < maxgam && numext::abs(u) < maxlog) {
+      t = gamma(a + b) / (gamma(a) * gamma(b));
+      s = s * t * pow(x, a);
+    } else {
+    */
+    t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+        lgamma_impl<double>::run(b) + u + numext::log(s);
+    return s = numext::exp(t);
+  }
+};
+
+template <>
+struct betainc_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static double run(double aa, double bb, double xx) {
+    const double nan = NumTraits<double>::quiet_NaN();
+    const double machep = cephes_helper<double>::machep();
+    // const double maxgam = 171.624376956302725;
+
+    double a, b, t, x, xc, w, y;
+    bool reversed_a_b = false;
+
+    if (aa <= 0.0 || bb <= 0.0) {
+      return nan;  // goto domerr;
+    }
+
+    if ((xx <= 0.0) || (xx >= 1.0)) {
+      if (xx == 0.0) return (0.0);
+      if (xx == 1.0) return (1.0);
+      // mtherr("incbet", DOMAIN);
+      return nan;
+    }
+
+    if ((bb * xx) <= 1.0 && xx <= 0.95) {
+      return betainc_helper<double>::incbps(aa, bb, xx);
+    }
+
+    w = 1.0 - xx;
+
+    /* Reverse a and b if x is greater than the mean. */
+    if (xx > (aa / (aa + bb))) {
+      reversed_a_b = true;
+      a = bb;
+      b = aa;
+      xc = xx;
+      x = w;
+    } else {
+      a = aa;
+      b = bb;
+      xc = w;
+      x = xx;
+    }
+
+    if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) {
+      t = betainc_helper<double>::incbps(a, b, x);
+      if (t <= machep) {
+        t = 1.0 - machep;
+      } else {
+        t = 1.0 - t;
+      }
+      return t;
+    }
+
+    /* Choose expansion for better convergence. */
+    y = x * (a + b - 2.0) - (a - 1.0);
+    if (y < 0.0) {
+      w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */);
+    } else {
+      w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc;
+    }
+
+    /* Multiply w by the factor
+         a      b   _             _     _
+        x  (1-x)   | (a+b) / ( a | (a) | (b) ) .   */
+
+    y = a * numext::log(x);
+    t = b * numext::log(xc);
+    // TODO: gamma is not directly implemented in Eigen.
+    /*
+    if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog)
+    {
+      t = pow(xc, b);
+      t *= pow(x, a);
+      t /= a;
+      t *= w;
+      t *= gamma(a + b) / (gamma(a) * gamma(b));
+    } else {
+    */
+    /* Resort to logarithms.  */
+    y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+         lgamma_impl<double>::run(b);
+    y += numext::log(w / a);
+    t = numext::exp(y);
+
+    /* } */
+    // done:
+
+    if (reversed_a_b) {
+      if (t <= machep) {
+        t = 1.0 - machep;
+      } else {
+        t = 1.0 - t;
+      }
+    }
+    return t;
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
 }  // end namespace internal
 
 namespace numext {
@@ -1053,7 +1500,7 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar)
     digamma(const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
 }
-    
+
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar)
 zeta(const Scalar& x, const Scalar& q) {
@@ -1090,6 +1537,12 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
   return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
+    betainc(const Scalar& a, const Scalar& b, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x);
+}
+
 }  // end namespace numext
 
 
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
new file mode 100644
index 000000000..46d60d323
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
+/** \internal \returns the incomplete beta function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
new file mode 100644
index 000000000..ec4fa8448
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
+#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  using numext::lgamma;
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+    using numext::zeta;
+    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+    using numext::zeta;
+    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+    using numext::polygamma;
+    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+    using numext::polygamma;
+    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  using numext::erf;
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+  using numext::erfc;
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  using numext::erfc;
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::igamma;
+  return make_float4(
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+  using numext::betainc;
+  return make_float4(
+      betainc(a.x, b.x, x.x),
+      betainc(a.y, b.y, x.y),
+      betainc(a.z, b.z, x.z),
+      betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+  using numext::betainc;
+  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt
deleted file mode 100644
index 55c6271e9..000000000
--- a/unsupported/Eigen/src/Splines/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Splines_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Splines_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
index d1636f466..627f6e482 100644
--- a/unsupported/Eigen/src/Splines/Spline.h
+++ b/unsupported/Eigen/src/Splines/Spline.h
@@ -94,7 +94,7 @@ namespace Eigen
     const KnotVectorType& knots() const { return m_knots; }
     
     /**
-     * \brief Returns the knots of the underlying spline.
+     * \brief Returns the control points of the underlying spline.
      **/    
     const ControlPointVectorType& ctrls() const { return m_ctrls; }
 
@@ -394,7 +394,7 @@ namespace Eigen
 
     Matrix<Scalar,Order,Order> ndu(p+1,p+1);
 
-    double saved, temp;
+    Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that?
 
     ndu(0,0) = 1.0;
 
@@ -433,7 +433,7 @@ namespace Eigen
       // Compute the k-th derivative
       for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
       {
-        double d = 0.0;
+        Scalar d = 0.0;
         DenseIndex rk,pk,j1,j2;
         rk = r-k; pk = p-k;
 
diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp
index 6b6fac075..afb0c94c2 100644
--- a/unsupported/doc/examples/BVH_Example.cpp
+++ b/unsupported/doc/examples/BVH_Example.cpp
@@ -6,9 +6,7 @@ using namespace Eigen;
 typedef AlignedBox<double, 2> Box2d;
 
 namespace Eigen {
-    namespace internal {
-        Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
-    }
+  Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
 }
 
 struct PointPointMinimizer //how to compute squared distances between points and rectangles
diff --git a/unsupported/doc/examples/EulerAngles.cpp b/unsupported/doc/examples/EulerAngles.cpp
new file mode 100644
index 000000000..1ef6aee18
--- /dev/null
+++ b/unsupported/doc/examples/EulerAngles.cpp
@@ -0,0 +1,46 @@
+#include <unsupported/Eigen/EulerAngles>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  // A common Euler system used by many armies around the world,
+  //  where the first angle is the azimuth (the angle from the north -
+  //   the same angle that is shown on a compass),
+  //  the second one is the elevation (the angle from the horizon),
+  //  and the third one is the roll (the angle between the horizontal body
+  //   direction and the plane ground surface).
+  // Keep in mind that we're using angles in radians here!
+  typedef EulerSystem<-EULER_Z, EULER_Y, EULER_X> MyArmySystem;
+  typedef EulerAngles<double, MyArmySystem> MyArmyAngles;
+  
+  MyArmyAngles vehicleAngles(
+    3.14/*PI*/ / 2, /* heading to east, notice that this angle is counter-clockwise */
+    -0.3, /* going down from a mountain */
+    0.1); /* slightly rolled to the right */
+  
+  // Some Euler angles representation that our plane uses.
+  EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
+  
+  MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles);
+  
+  std::cout << "vehicle angles(MyArmy):     " << vehicleAngles << std::endl;
+  std::cout << "plane angles(ZYZ):        " << planeAngles << std::endl;
+  std::cout << "plane angles(MyArmy):     " << planeAnglesInMyArmyAngles << std::endl;
+  
+  // Now let's rotate the plane a little bit
+  std::cout << "==========================================================\n";
+  std::cout << "rotating plane now!\n";
+  std::cout << "==========================================================\n";
+  
+  Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
+  
+  planeAngles = planeRotated;
+  planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated);
+  
+  std::cout << "new plane angles(ZYZ):     " << planeAngles << std::endl;
+  std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
+  
+  return 0;
+}
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 22442b394..a1823beaa 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -59,6 +59,8 @@ ei_add_test(alignedvector3)
 
 ei_add_test(FFT)
 
+ei_add_test(EulerAngles)
+
 find_package(MPFR 2.3.0)
 find_package(GMP)
 if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
@@ -109,10 +111,14 @@ ei_add_test(gmres)
 ei_add_test(minres)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
+ei_add_test(special_functions)
 
 # TODO: The following test names are prefixed with the cxx11 string, since historically
 # the tests depended on c++11. This isn't the case anymore so we ought to rename them.
-ei_add_test(cxx11_float16)
+# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
+# when using visual studio. We should make the check more strict to enable the tests for
+# newer versions of MSVC. Test the MSVC variable: a quoted "MSVC" can be dereferenced by if().
+if (NOT MSVC)
 ei_add_test(cxx11_tensor_dimension)
 ei_add_test(cxx11_tensor_map)
 ei_add_test(cxx11_tensor_assign)
@@ -130,7 +136,8 @@ ei_add_test(cxx11_tensor_io)
 if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
   # This test requires __uint128_t which is only available on 64bit systems 
   ei_add_test(cxx11_tensor_uint128)
-endif() 
+endif()
+endif()
 
 if(EIGEN_TEST_CXX11)
   # It should be safe to always run these tests as there is some fallback code for
@@ -139,6 +146,8 @@ if(EIGEN_TEST_CXX11)
 
   ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+
   ei_add_test(cxx11_meta)
   ei_add_test(cxx11_tensor_simple)
 #  ei_add_test(cxx11_tensor_symmetry)
@@ -174,6 +183,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_custom_index)
   ei_add_test(cxx11_tensor_fft)
   ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_scan)
 
 endif()
 
@@ -183,37 +193,58 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
   # and -fno-check-new flags since they trigger thousands of compilation warnings
   # in the CUDA runtime
+  # Also remove -ansi, which is incompatible with -std=c++11.
   string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 
   message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
 
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
   endif()
 
-  set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"")
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if ("${CUDA_VERSION}" STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+
+  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
+    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
+  else()
+    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
+    set(EIGEN_CUDA_CXX11_FLAG "")
+  endif()
+
+  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
-  ei_add_test(cxx11_tensor_device)
-  ei_add_test(cxx11_tensor_cuda)
-  ei_add_test(cxx11_tensor_contract_cuda)
+  ei_add_test(cxx11_tensor_complex_cuda)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
   ei_add_test(cxx11_tensor_reduction_cuda)
   ei_add_test(cxx11_tensor_argmax_cuda)
   ei_add_test(cxx11_tensor_cast_float16_cuda)
+  ei_add_test(cxx11_tensor_scan_cuda)
+
+  # Contractions require arch 3.0 or higher
+  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+    ei_add_test(cxx11_tensor_device)
+    ei_add_test(cxx11_tensor_cuda)
+    ei_add_test(cxx11_tensor_contract_cuda)
+    ei_add_test(cxx11_tensor_of_float16_cuda)
+  endif()
 
   # The random number generation code requires arch 3.5 or greater.
   if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
     ei_add_test(cxx11_tensor_random_cuda)
   endif()
 
-  ei_add_test(cxx11_tensor_of_float16_cuda)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
new file mode 100644
index 000000000..a8cb52864
--- /dev/null
+++ b/unsupported/test/EulerAngles.cpp
@@ -0,0 +1,208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/EulerAngles>
+
+using namespace Eigen;
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
+  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+{
+  typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+  typedef AngleAxis<Scalar> AngleAxisType;
+  using std::abs;
+  
+  Scalar alphaRangeStart, alphaRangeEnd;
+  Scalar betaRangeStart, betaRangeEnd;
+  Scalar gammaRangeStart, gammaRangeEnd;
+  
+  if (positiveRangeAlpha)
+  {
+    alphaRangeStart = Scalar(0);
+    alphaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    alphaRangeStart = -Scalar(EIGEN_PI);
+    alphaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeBeta)
+  {
+    betaRangeStart = Scalar(0);
+    betaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    betaRangeStart = -Scalar(EIGEN_PI);
+    betaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeGamma)
+  {
+    gammaRangeStart = Scalar(0);
+    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    gammaRangeStart = -Scalar(EIGEN_PI);
+    gammaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  const int i = EulerSystem::AlphaAxisAbs - 1;
+  const int j = EulerSystem::BetaAxisAbs - 1;
+  const int k = EulerSystem::GammaAxisAbs - 1;
+  
+  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
+  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
+  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
+  
+  const Vector3 I = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J = EulerAnglesType::BetaAxisVector();
+  const Vector3 K = EulerAnglesType::GammaAxisVector();
+  
+  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  
+  Matrix3 m(e);
+  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  
+  // Check that eabis in range
+  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
+  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
+  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  
+  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  
+  // Invert the relevant axes
+  eabis2[0] *= iFactor;
+  eabis2[1] *= jFactor;
+  eabis2[2] *= kFactor;
+  
+  // Saturate the angles to the correct range
+  if (positiveRangeAlpha && (eabis2[0] < 0))
+    eabis2[0] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeBeta && (eabis2[1] < 0))
+    eabis2[1] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeGamma && (eabis2[2] < 0))
+    eabis2[2] += Scalar(2 * EIGEN_PI);
+  
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  
+  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
+  VERIFY_IS_APPROX(m,  mbis);
+  
+  // Tests that are only relevant when no positive range is requested
+  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  {
+    /* If I==K and ea[1]==0, then there is no unique solution. */
+    /* The same remark applies when I!=K and |ea[1]| is close to pi/2. */
+    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
+    
+    // approx_or_less_than does not work for 0
+    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+  }
+  
+  // Quaternions
+  QuaternionType q(e);
+  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
+}
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_ranged<EulerSystem>(ea, false, false, false);
+  verify_euler_ranged<EulerSystem>(ea, false, false, true);
+  verify_euler_ranged<EulerSystem>(ea, false, true, false);
+  verify_euler_ranged<EulerSystem>(ea, false, true, true);
+  verify_euler_ranged<EulerSystem>(ea, true, false, false);
+  verify_euler_ranged<EulerSystem>(ea, true, false, true);
+  verify_euler_ranged<EulerSystem>(ea, true, true, false);
+  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler<EulerSystemXYZ>(ea);
+  verify_euler<EulerSystemXYX>(ea);
+  verify_euler<EulerSystemXZY>(ea);
+  verify_euler<EulerSystemXZX>(ea);
+  
+  verify_euler<EulerSystemYZX>(ea);
+  verify_euler<EulerSystemYZY>(ea);
+  verify_euler<EulerSystemYXZ>(ea);
+  verify_euler<EulerSystemYXY>(ea);
+  
+  verify_euler<EulerSystemZXY>(ea);
+  verify_euler<EulerSystemZXZ>(ea);
+  verify_euler<EulerSystemZYX>(ea);
+  verify_euler<EulerSystemZYZ>(ea);
+}
+
+template<typename Scalar> void eulerangles()
+{
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Array<Scalar,3,1> Array3;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Quaternionx q1;
+  q1 = AngleAxisType(a, Vector3::Random().normalized());
+  Matrix3 m;
+  m = q1;
+  
+  Vector3 ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with purely random Quaternion:
+  q1.coeffs() = Quaternionx::Coefficients::Random().normalized();
+  m = q1;
+  ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
+  ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
+  check_all_var(ea);
+  
+  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[1] = 0;
+  check_all_var(ea);
+  
+  ea.head(2).setZero();
+  check_all_var(ea);
+  
+  ea.setZero();
+  check_all_var(ea);
+}
+
+void test_EulerAngles()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( eulerangles<float>() );
+    CALL_SUBTEST_2( eulerangles<double>() );
+  }
+}
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index d3718e2d2..8b7528fb7 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -18,11 +18,11 @@ using namespace Eigen;
 
 
 template < typename T>
-complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
+complex<long double>  promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
 
-complex<long double>  promote(float x) { return complex<long double>( x); }
-complex<long double>  promote(double x) { return complex<long double>( x); }
-complex<long double>  promote(long double x) { return complex<long double>( x); }
+complex<long double>  promote(float x) { return complex<long double>((long double)x); }
+complex<long double>  promote(double x) { return complex<long double>((long double)x); }
+complex<long double>  promote(long double x) { return complex<long double>((long double)x); }
     
 
     template <typename VT1,typename VT2>
@@ -33,7 +33,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
         long double pi = acos((long double)-1 );
         for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) {
             complex<long double> acc = 0;
-            long double phinc = -2.*k0* pi / timebuf.size();
+            long double phinc = (long double)(-2.)*k0* pi / timebuf.size();
             for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
                 acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
             }
@@ -54,8 +54,8 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
         long double difpower=0;
         size_t n = (min)( buf1.size(),buf2.size() );
         for (size_t k=0;k<n;++k) {
-            totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2.;
-            difpower += numext::abs2(buf1[k] - buf2[k]);
+            totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
+            difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
         }
         return sqrt(difpower/totalpower);
     }
@@ -93,19 +93,19 @@ void test_scalar_generic(int nfft)
     fft.SetFlag(fft.HalfSpectrum );
     fft.fwd( freqBuf,tbuf);
     VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
-    VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>()  );// gross check
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
 
     fft.ClearFlag(fft.HalfSpectrum );
     fft.fwd( freqBuf,tbuf);
     VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
-    VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>()  );// gross check
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
 
     if (nfft&1)
         return; // odd FFTs get the wrong size inverse FFT
 
     ScalarVector tbuf2;
     fft.inv( tbuf2 , freqBuf);
-    VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
 
 
     // verify that the Unscaled flag takes effect
@@ -121,12 +121,12 @@ void test_scalar_generic(int nfft)
     //for (size_t i=0;i<(size_t) tbuf.size();++i)
     //    cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) <<  endl;
 
-    VERIFY( dif_rmse(tbuf,tbuf3) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>()  );// gross check
 
     // verify that ClearFlag works
     fft.ClearFlag(fft.Unscaled);
     fft.inv( tbuf2 , freqBuf);
-    VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
 }
 
 template <typename T>
@@ -152,10 +152,10 @@ void test_complex_generic(int nfft)
         inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
     fft.fwd( outbuf , inbuf);
 
-    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+    VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>()  );// gross check
     fft.inv( buf3 , outbuf);
 
-    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
 
     // verify that the Unscaled flag takes effect
     ComplexVector buf4;
@@ -163,12 +163,12 @@ void test_complex_generic(int nfft)
     fft.inv( buf4 , outbuf);
     for (int k=0;k<nfft;++k)
         buf4[k] *= T(1./nfft);
-    VERIFY( dif_rmse(inbuf,buf4) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>()  );// gross check
 
     // verify that ClearFlag works
     fft.ClearFlag(fft.Unscaled);
     fft.inv( buf3 , outbuf);
-    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
 }
 
 template <typename T>
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 374f86df9..85743137e 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -16,7 +16,8 @@ EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y)
   using namespace std;
 //   return x+std::sin(y);
   EIGEN_ASM_COMMENT("mybegin");
-  return static_cast<Scalar>(x*2 - 1 + pow(1+x,2) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(-0.5*x*x+0));
+  // pow(float, int) promotes to pow(double, double)
+  return x*2 - 1 + static_cast<Scalar>(pow(1+x,2)) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(Scalar(-0.5)*x*x+0);
   //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2;
   EIGEN_ASM_COMMENT("myend");
 }
@@ -104,6 +105,89 @@ struct TestFunc1
   }
 };
 
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+/* Test functor for the C++11 features. */
+template <typename Scalar>
+struct integratorFunctor
+{
+    typedef Matrix<Scalar, 2, 1> InputType;
+    typedef Matrix<Scalar, 2, 1> ValueType;
+
+    /*
+     * Implementation starts here.
+     */
+    integratorFunctor(const Scalar gain) : _gain(gain) {}
+    integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
+    const Scalar _gain;
+
+    template <typename T1, typename T2>
+    void operator() (const T1 &input, T2 *output, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+    }
+
+    /* Only needed for the test */
+    template <typename T1, typename T2, typename T3>
+    void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+
+        if (jacobian)
+        {
+            T3 &j = *jacobian;
+
+            j(0, 0) = 1;
+            j(0, 1) = dt * _gain;
+            j(1, 0) = 0;
+            j(1, 1) = _gain;
+        }
+    }
+
+};
+
+template<typename Func> void forward_jacobian_cpp11(const Func& f)
+{
+    typedef typename Func::ValueType::Scalar Scalar;
+    typedef typename Func::ValueType ValueType;
+    typedef typename Func::InputType InputType;
+    typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;
+
+    InputType x = InputType::Random(InputType::RowsAtCompileTime);
+    ValueType y, yref;
+    JacobianType j, jref;
+
+    const Scalar dt = internal::random<Scalar>(); // extra argument forwarded to the functor, drawn in its own scalar type
+
+    jref.setZero();
+    yref.setZero();
+    f(x, &yref, &jref, dt); // reference output and Jacobian, computed by the functor itself
+
+    //std::cerr << "y, yref, jref: " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << yref << "\n\n";
+    //std::cerr << jref << "\n\n";
+
+    AutoDiffJacobian<Func> autoj(f);
+    autoj(x, &y, &j, dt); // same evaluation through AutoDiffJacobian
+
+    //std::cerr << "y j (via autodiff): " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << j << "\n\n";
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+#endif
+
 template<typename Func> void forward_jacobian(const Func& f)
 {
     typename Func::InputType x = Func::InputType::Random(f.inputs());
@@ -127,7 +211,6 @@ template<typename Func> void forward_jacobian(const Func& f)
     VERIFY_IS_APPROX(j, jref);
 }
 
-
 // TODO also check actual derivatives!
 template <int>
 void test_autodiff_scalar()
@@ -140,6 +223,7 @@ void test_autodiff_scalar()
   VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
 }
 
+
 // TODO also check actual derivatives!
 template <int>
 void test_autodiff_vector()
@@ -150,7 +234,7 @@ void test_autodiff_vector()
   VectorAD ap = p.cast<AD>();
   ap.x().derivatives() = Vector2f::UnitX();
   ap.y().derivatives() = Vector2f::UnitY();
-  
+
   AD res = foo<VectorAD>(ap);
   VERIFY_IS_APPROX(res.value(), foo(p));
 }
@@ -163,6 +247,9 @@ void test_autodiff_jacobian()
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
+#endif
 }
 
 
@@ -204,9 +291,64 @@ void test_autodiff_hessian()
   VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4));
   VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3));
   VERIFY_IS_APPROX(y.derivatives()(1).derivatives(),  -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4));
+
+  ADD z = x(0)*x(1);
+  VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1));
+  VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0));
 }
 
+double bug_1222() {
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  // this line did not work, because operator+ returns ADS<DerType&>, which then cannot be converted to ADS<DerType>
+  const AD denom = chi_3 + _cv1_3;
+  return denom.value();
+}
 
+double bug_1223() {
+  using std::min;
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  const AD denom = 1.0;
+
+  // failed because implementation of min attempts to construct ADS<DerType&> via constructor AutoDiffScalar(const Real& value)
+  // without initializing m_derivatives (which is a reference in this case)
+  #define EIGEN_TEST_SPACE
+  const AD t = min EIGEN_TEST_SPACE (denom / chi_3, 1.0);
+
+  const AD t2 = min EIGEN_TEST_SPACE (denom / (chi_3 * _cv1_3), 1.0);
+
+  return t.value() + t2.value();
+}
+
+// regression test for some compilation issues with specializations of ScalarBinaryOpTraits
+void bug_1260() {
+  Matrix4d A;
+  Vector4d v;
+  A*v;
+}
+
+// check a compilation issue with numext::max
+double bug_1261() {
+  typedef AutoDiffScalar<Matrix2d> AD;
+  typedef Matrix<AD,2,1> VectorAD;
+
+  VectorAD v;
+  const AD maxVal = v.maxCoeff();
+  const AD minVal = v.minCoeff();
+  return maxVal.value() + minVal.value();
+}
+
+double bug_1264() {
+  typedef AutoDiffScalar<Vector2d> AD;
+  const AD s;
+  const Matrix<AD, 3, 1> v1;
+  const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
+  return v2(0).value();
+}
 
 void test_autodiff()
 {
@@ -216,5 +358,10 @@ void test_autodiff()
     CALL_SUBTEST_3( test_autodiff_jacobian<1>() );
     CALL_SUBTEST_4( test_autodiff_hessian<1>() );
   }
+
+  bug_1222();
+  bug_1223();
+  bug_1260();
+  bug_1261();
 }
 
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index c631c734a..4df2f5c57 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -36,13 +36,48 @@ template<typename Scalar> void check_atan2()
   VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
 }
 
+template<typename Scalar> void check_hyperbolic_functions()
+{
+  using std::sinh;
+  using std::cosh;
+  using std::tanh;
+  typedef Matrix<Scalar, 1, 1> Deriv1;
+  typedef AutoDiffScalar<Deriv1> AD;
+  Deriv1 p = Deriv1::Random();
+  AD val(p.x(),Deriv1::UnitX());
 
+  Scalar cosh_px = std::cosh(p.x());
+  AD res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.value(), std::tanh(p.x()));
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px));
 
+  AD res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.value(), std::sinh(p.x()));
+  VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px);
+
+  AD res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.value(), cosh_px);
+  VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x()));
+
+  // Check constant values.
+  const Scalar sample_point = Scalar(1) / Scalar(3); 
+  val = AD(sample_point,Deriv1::UnitX());
+  res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914));
+
+  res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939));
+
+  res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150));
+}
 
 void test_autodiff_scalar()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_atan2<float>() );
     CALL_SUBTEST_2( check_atan2<double>() );
+    CALL_SUBTEST_3( check_hyperbolic_functions<float>() );
+    CALL_SUBTEST_4( check_hyperbolic_functions<double>() );
   }
 }
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index f16cc6f07..3b598bf42 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -25,7 +25,8 @@ int rand_reentrant(unsigned int* s) {
 
 static void test_basic_eventcount()
 {
-  std::vector<EventCount::Waiter> waiters(1);
+  MaxSizeVector<EventCount::Waiter> waiters(1);
+  waiters.resize(1);
   EventCount ec(waiters);
   EventCount::Waiter& w = waiters[0];
   ec.Notify(false);
@@ -81,7 +82,8 @@ static void test_stress_eventcount()
   static const int kEvents = 1 << 16;
   static const int kQueues = 10;
 
-  std::vector<EventCount::Waiter> waiters(kThreads);
+  MaxSizeVector<EventCount::Waiter> waiters(kThreads);
+  waiters.resize(kThreads);
   EventCount ec(waiters);
   TestQueue queues[kQueues];
 
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
new file mode 100644
index 000000000..5f9bb938b
--- /dev/null
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -0,0 +1,107 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include "Eigen/CXX11/ThreadPool"
+
+static void test_create_destroy_empty_pool()
+{
+  // Just create and destroy the pool. This will wind up and tear down worker
+  // threads. Ensure there are no issues in that logic.
+  for (int i = 0; i < 16; ++i) {
+    NonBlockingThreadPool tp(i);
+  }
+}
+
+
+static void test_parallelism()
+{
+  // Test we never-ever fail to match available tasks with idle threads.
+  const int kThreads = 16;  // code below expects that this is a multiple of 4
+  NonBlockingThreadPool tp(kThreads);
+  VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
+  VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
+  for (int iter = 0; iter < 100; ++iter) {
+    std::atomic<int> running(0);
+    std::atomic<int> done(0);
+    std::atomic<int> phase(0);
+    // Schedule kThreads tasks and ensure that they all are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&]() {
+        const int thread_id = tp.CurrentThreadId();
+        VERIFY_GE(thread_id, 0);
+        VERIFY_LE(thread_id, kThreads - 1);
+        running++;
+        while (phase < 1) {
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    running = 0;
+    phase = 1;
+    // Now, while the previous tasks exit, schedule another kThreads tasks and
+    // ensure that they are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&, i]() {
+        running++;
+        while (phase < 2) {
+        }
+        // When all tasks are running, half of tasks exit, quarter of tasks
+        // continue running and quarter of tasks schedule another 2 tasks each.
+        // Concurrently main thread schedules another quarter of tasks.
+        // This gives us another kThreads tasks and we ensure that they all
+        // are running.
+        if (i < kThreads / 2) {
+        } else if (i < 3 * kThreads / 4) {
+          running++;
+          while (phase < 3) {
+          }
+          done++;
+        } else {
+          for (int j = 0; j < 2; ++j) {
+            tp.Schedule([&]() {
+              running++;
+              while (phase < 3) {
+              }
+              done++;
+            });
+          }
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    running = 0;
+    phase = 2;
+    for (int i = 0; i < kThreads / 4; ++i) {
+      tp.Schedule([&]() {
+        running++;
+        while (phase < 3) {
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {
+    }
+    phase = 3;
+    while (done != 3 * kThreads) {
+    }
+  }
+}
+
+void test_cxx11_non_blocking_thread_pool()
+{
+  CALL_SUBTEST(test_create_destroy_empty_pool());
+  CALL_SUBTEST(test_parallelism());
+}
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index d1770ee1b..91f690114 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -33,73 +33,81 @@ void test_basic_runqueue()
   VERIFY_IS_EQUAL(0u, q.Size());
   VERIFY_IS_EQUAL(0, q.PopFront());
   std::vector<int> stolen;
-  VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
   VERIFY_IS_EQUAL(0u, stolen.size());
   // Push one front, pop one front.
   VERIFY_IS_EQUAL(0, q.PushFront(1));
-  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(1u, q.Size());
   VERIFY_IS_EQUAL(1, q.PopFront());
-  VERIFY_IS_EQUAL(0, q.Size());
+  VERIFY_IS_EQUAL(0u, q.Size());
   // Push front to overflow.
   VERIFY_IS_EQUAL(0, q.PushFront(2));
-  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(1u, q.Size());
   VERIFY_IS_EQUAL(0, q.PushFront(3));
-  VERIFY_IS_EQUAL(2, q.Size());
+  VERIFY_IS_EQUAL(2u, q.Size());
   VERIFY_IS_EQUAL(0, q.PushFront(4));
-  VERIFY_IS_EQUAL(3, q.Size());
+  VERIFY_IS_EQUAL(3u, q.Size());
   VERIFY_IS_EQUAL(0, q.PushFront(5));
-  VERIFY_IS_EQUAL(4, q.Size());
+  VERIFY_IS_EQUAL(4u, q.Size());
   VERIFY_IS_EQUAL(6, q.PushFront(6));
-  VERIFY_IS_EQUAL(4, q.Size());
+  VERIFY_IS_EQUAL(4u, q.Size());
   VERIFY_IS_EQUAL(5, q.PopFront());
-  VERIFY_IS_EQUAL(3, q.Size());
+  VERIFY_IS_EQUAL(3u, q.Size());
   VERIFY_IS_EQUAL(4, q.PopFront());
-  VERIFY_IS_EQUAL(2, q.Size());
+  VERIFY_IS_EQUAL(2u, q.Size());
   VERIFY_IS_EQUAL(3, q.PopFront());
-  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(1u, q.Size());
   VERIFY_IS_EQUAL(2, q.PopFront());
-  VERIFY_IS_EQUAL(0, q.Size());
+  VERIFY_IS_EQUAL(0u, q.Size());
   VERIFY_IS_EQUAL(0, q.PopFront());
   // Push one back, pop one back.
   VERIFY_IS_EQUAL(0, q.PushBack(7));
-  VERIFY_IS_EQUAL(1, q.Size());
-  VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen));
-  VERIFY_IS_EQUAL(1, stolen.size());
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
   VERIFY_IS_EQUAL(7, stolen[0]);
-  VERIFY_IS_EQUAL(0, q.Size());
+  VERIFY_IS_EQUAL(0u, q.Size());
   stolen.clear();
   // Push back to overflow.
   VERIFY_IS_EQUAL(0, q.PushBack(8));
-  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(1u, q.Size());
   VERIFY_IS_EQUAL(0, q.PushBack(9));
-  VERIFY_IS_EQUAL(2, q.Size());
+  VERIFY_IS_EQUAL(2u, q.Size());
   VERIFY_IS_EQUAL(0, q.PushBack(10));
-  VERIFY_IS_EQUAL(3, q.Size());
+  VERIFY_IS_EQUAL(3u, q.Size());
   VERIFY_IS_EQUAL(0, q.PushBack(11));
-  VERIFY_IS_EQUAL(4, q.Size());
+  VERIFY_IS_EQUAL(4u, q.Size());
   VERIFY_IS_EQUAL(12, q.PushBack(12));
-  VERIFY_IS_EQUAL(4, q.Size());
+  VERIFY_IS_EQUAL(4u, q.Size());
   // Pop back in halves.
-  VERIFY_IS_EQUAL(2, q.PopBackHalf(&stolen));
-  VERIFY_IS_EQUAL(2, stolen.size());
+  VERIFY_IS_EQUAL(2u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(2u, stolen.size());
   VERIFY_IS_EQUAL(10, stolen[0]);
   VERIFY_IS_EQUAL(11, stolen[1]);
-  VERIFY_IS_EQUAL(2, q.Size());
+  VERIFY_IS_EQUAL(2u, q.Size());
   stolen.clear();
-  VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen));
-  VERIFY_IS_EQUAL(1, stolen.size());
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
   VERIFY_IS_EQUAL(9, stolen[0]);
-  VERIFY_IS_EQUAL(1, q.Size());
+  VERIFY_IS_EQUAL(1u, q.Size());
   stolen.clear();
-  VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen));
-  VERIFY_IS_EQUAL(1, stolen.size());
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(1u, stolen.size());
   VERIFY_IS_EQUAL(8, stolen[0]);
   stolen.clear();
-  VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen));
-  VERIFY_IS_EQUAL(0, stolen.size());
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
   // Empty again.
   VERIFY(q.Empty());
-  VERIFY_IS_EQUAL(0, q.Size());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(1, q.PopBack());
+  VERIFY_IS_EQUAL(2, q.PopBack());
+  VERIFY_IS_EQUAL(3, q.PopBack());
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
 }
 
 // Empty tests that the queue is not claimed to be empty when is is in fact not.
@@ -130,7 +138,7 @@ void test_empty_runqueue()
             stolen.clear();
             break;
           }
-          VERIFY_IS_EQUAL(0, stolen.size());
+          VERIFY_IS_EQUAL(0u, stolen.size());
         }
       }
     }
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 41ccbe974..6fe8982f2 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -12,6 +12,9 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
 
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 70500)
+#include <cuda_fp16.h>
+#endif  // nvcc >= 7.5 only; avoids __CUDACC_VER__, which CUDA 9+ redefines as an error message
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index e5cf61fe1..8fe85d83c 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -286,7 +286,7 @@ static void test_compound_assign()
 }
 
 static void test_std_initializers_tensor() {
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   Tensor<int, 1> a(3);
   a.setValues({0, 1, 2});
   VERIFY_IS_EQUAL(a(0), 0);
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index 2ddf47234..5c0ea5889 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -115,7 +115,7 @@ static void test_static_broadcasting()
   Tensor<float, 3, DataLayout> tensor(8,3,5);
   tensor.setRandom();
 
-#ifdef EIGEN_HAS_CONSTEXPR
+#if EIGEN_HAS_CONSTEXPR
   Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
 #else
   Eigen::array<int, 3> broadcasts;
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index f22b99de8..88c233994 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -13,7 +13,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 70500)
+#include <cuda_fp16.h>
+#endif  // nvcc >= 7.5 only; avoids __CUDACC_VER__, which CUDA 9+ redefines as an error message
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
new file mode 100644
index 000000000..f895efd01
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex
+#define EIGEN_USE_GPU
+
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 70500)
+#include <cuda_fp16.h>
+#endif  // nvcc >= 7.5 only; avoids __CUDACC_VER__, which CUDA 9+ redefines as an error message
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_nullary() {  // constant() and abs() on complex<float> tensors living on the GPU
+  Tensor<std::complex<float>, 1, 0, int> in1(2);
+  Tensor<std::complex<float>, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+  // abs() yields real values, so the output buffer is sized in floats, not complex<float>.
+  std::size_t float_bytes = in1.size() * sizeof(float);
+  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);
+
+  std::complex<float>* d_in1;
+  std::complex<float>* d_in2;
+  float* d_out2;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out2), float_bytes);
+  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
+      d_out2, 2);
+  // Overwrite the first input with a constant; store the modulus of the second input.
+  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
+  gpu_out2.device(gpu_device) = gpu_in2.abs();
+
+  Tensor<std::complex<float>, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+  // VERIFY instead of assert: the copies and the sync must still execute if assert is compiled out.
+  VERIFY(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
+    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out2);
+}
+
+
+static void test_cuda_sum_reductions() {
+  // Full sum-reduction of a random complex matrix: the GPU result is compared with the CPU one.
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+  typedef std::complex<float> Cplx;
+  Tensor<Cplx, 2> host_in(num_rows, num_cols);
+  host_in.setRandom();
+
+  // Reference value, reduced on the host.
+  Tensor<Cplx, 0> cpu_sum = host_in.sum();
+
+  const std::size_t in_bytes = host_in.size() * sizeof(Cplx);
+  const std::size_t out_bytes = cpu_sum.size() * sizeof(Cplx);
+  Cplx* dev_in = static_cast<Cplx*>(gpu_device.allocate(in_bytes));
+  Cplx* dev_out = static_cast<Cplx*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(dev_in, host_in.data(), in_bytes);
+
+  TensorMap<Tensor<Cplx, 2> > in_gpu(dev_in, num_rows, num_cols);
+  TensorMap<Tensor<Cplx, 0> > out_gpu(dev_out);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+
+  Tensor<Cplx, 0> gpu_sum;
+  gpu_device.memcpyDeviceToHost(gpu_sum.data(), dev_out, out_bytes);
+  gpu_device.synchronize();
+
+  // The host and device reductions must agree approximately.
+  VERIFY_IS_APPROX(cpu_sum(), gpu_sum());
+
+  gpu_device.deallocate(dev_in);
+  gpu_device.deallocate(dev_out);
+}
+
+
+void test_cxx11_tensor_complex()  // Test entry point; name is test_ + the EIGEN_TEST_FUNC value defined above.
+{
+  CALL_SUBTEST(test_cuda_nullary());  // constant() and abs() on complex<float> GPU tensors
+  CALL_SUBTEST(test_cuda_sum_reductions());  // full sum reduction, host result vs device result
+}
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
new file mode 100644
index 000000000..2baf5eaad
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -0,0 +1,97 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+#define EIGEN_USE_GPU
+
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 70500)
+#include <cuda_fp16.h>
+#endif  // nvcc >= 7.5 only; avoids __CUDACC_VER__, which CUDA 9+ redefines as an error message
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename T>
+void test_cuda_complex_cwise_ops() {  // +, -, *, / on std::complex<T> tensors: device result vs host result
+  const int kNumItems = 2;
+  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
+
+  std::complex<T>* d_in1;
+  std::complex<T>* d_in2;
+  std::complex<T>* d_out;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out), complex_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
+      d_out, kNumItems);
+
+  const std::complex<T> a(3.14f, 2.7f);
+  const std::complex<T> b(-10.6f, 1.4f);
+  // Fill both device inputs on the device itself; no host-to-device copy is needed.
+  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
+  gpu_in2.device(gpu_device) = gpu_in2.constant(b);
+
+  enum CwiseOp {
+    Add = 0,
+    Sub,
+    Mul,
+    Div
+  };
+  // For each operator: evaluate on the GPU, compute the same expression on the host, compare.
+  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
+  for (int op = Add; op <= Div; op++) {
+    std::complex<T> expected;
+    switch (static_cast<CwiseOp>(op)) {
+      case Add:
+        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+        expected = a + b;
+        break;
+      case Sub:
+        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
+        expected = a - b;
+        break;
+      case Mul:
+        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
+        expected = a * b;
+        break;
+      case Div:
+        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
+        expected = a / b;
+        break;
+    }
+    VERIFY(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
+                           gpu_device.stream()) == cudaSuccess);
+    VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);  // VERIFY, not assert: must run even if assert is compiled out
+
+    for (int i = 0; i < kNumItems; ++i) {
+      VERIFY_IS_APPROX(actual(i), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_complex_cwise_ops()  // Test entry point; name is test_ + the EIGEN_TEST_FUNC value defined above.
+{
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());  // std::complex<float> operands
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());  // std::complex<double> operands
+}
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 6d1ef07f9..767e9c678 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -14,7 +14,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 70500)
+#include <cuda_fp16.h>
+#endif  // nvcc >= 7.5 only; avoids __CUDACC_VER__, which CUDA 9+ redefines as an error message
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
@@ -84,6 +86,65 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
   cudaFree((void*)d_t_result);
 }
 
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // Contracting both dimensions of t_left against both dimensions of t_right
+  // leaves no free index, so the result is a rank-0 (scalar) tensor. The paired
+  // dimensions must have equal extents, i.e. m_size == k_size == n_size.
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 0, DataLayout> t_result;
+  Tensor<float, 0, DataLayout> t_result_gpu;
+  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, m_size, k_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, k_size, n_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+      gpu_t_result(d_t_result);
+  // Same contraction evaluated on the device and, as the reference, on the host.
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+    std::cout << "mismatch detected: " << t_result()
+              << " vs " <<  t_result_gpu() << std::endl;
+    VERIFY(false);  // not assert(false): the failure must be reported even if assert is compiled out
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
 template<int DataLayout>
 void test_cuda_contraction_m() {
   for (int k = 32; k < 256; k++) {
@@ -138,6 +199,9 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
   CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
 
+  CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
+
   CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
   CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
 
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 0e16308a2..ace97057f 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -87,19 +87,14 @@ static void test_scalar()
   vec1.setRandom();
   vec2.setRandom();
 
-  Tensor<float, 1, DataLayout> scalar(1);
-  scalar.setZero();
   Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
-  typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator;
-  Evaluator eval(vec1.contract(vec2, dims), DefaultDevice());
-  eval.evalTo(scalar.data());
-  EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  Tensor<float, 0, DataLayout> scalar = vec1.contract(vec2, dims);
 
   float expected = 0.0f;
   for (int i = 0; i < 6; ++i) {
     expected += vec1(i) * vec2(i);
   }
-  VERIFY_IS_APPROX(scalar(0), expected);
+  VERIFY_IS_APPROX(scalar(), expected);
 }
 
 template<int DataLayout>
@@ -494,6 +489,27 @@ static void test_tensor_product()
 }
 
 
+template<int DataLayout>
+static void test_const_inputs()
+{
+  // Contraction whose operands are read-only views (TensorMap over const float).
+  Tensor<float, 2, DataLayout> lhs_storage(2, 3);
+  Tensor<float, 2, DataLayout> rhs_storage(3, 2);
+  lhs_storage.setRandom();
+  rhs_storage.setRandom();
+  TensorMap<Tensor<const float, 2, DataLayout> > mat1(lhs_storage.data(), 2, 3);
+  TensorMap<Tensor<const float, 2, DataLayout> > mat2(rhs_storage.data(), 3, 2);
+  // Pair dimension 1 of mat1 with dimension 0 of mat2, i.e. an ordinary matrix product.
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  Tensor<float, 2, DataLayout> mat3(2,2);
+  mat3 = mat1.contract(mat2, dims);
+
+  for (int row = 0; row < 2; ++row) {
+    for (int col = 0; col < 2; ++col) {
+      VERIFY_IS_APPROX(mat3(row,col), mat1(row,0)*mat2(0,col) + mat1(row,1)*mat2(1,col) + mat1(row,2)*mat2(2,col));
+    }
+  }
+}
+
 void test_cxx11_tensor_contraction()
 {
   CALL_SUBTEST(test_evals<ColMajor>());
@@ -524,4 +540,6 @@ void test_cxx11_tensor_contraction()
   CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
   CALL_SUBTEST(test_tensor_product<ColMajor>());
   CALL_SUBTEST(test_tensor_product<RowMajor>());
+  CALL_SUBTEST(test_const_inputs<ColMajor>());
+  CALL_SUBTEST(test_const_inputs<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 4026f48f0..bf216587a 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -10,19 +10,65 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 70500)
+#include <cuda_fp16.h>
+#endif  // nvcc >= 7.5 only; avoids __CUDACC_VER__, which CUDA 9+ redefines as an error message
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
 
+void test_cuda_nullary() {  // constant() and random() nullary expressions evaluated on the GPU
+  Tensor<float, 1, 0, int> in1(2);
+  Tensor<float, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  cudaMalloc((void**)(&d_in1), tensor_bytes);
+  cudaMalloc((void**)(&d_in2), tensor_bytes);
+  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+  // Both buffers are overwritten in place on the device.
+  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+  gpu_in2.device(gpu_device) = gpu_in2.random();
+
+  Tensor<float, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+  // VERIFY instead of assert: the copies and the sync must still execute if assert is compiled out.
+  VERIFY(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  // new2 must differ from the values uploaded from in2, showing random() replaced them.
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), 3.14f);
+    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
 void test_cuda_elementwise_small() {
-  Tensor<float, 1> in1(Eigen::array<int, 1>(2));
-  Tensor<float, 1> in2(Eigen::array<int, 1>(2));
-  Tensor<float, 1> out(Eigen::array<int, 1>(2));
+  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
   in1.setRandom();
   in2.setRandom();
 
@@ -44,11 +90,11 @@ void test_cuda_elementwise_small() {
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
-      d_in1, Eigen::array<int, 1>(2));
+      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
-      d_in2, Eigen::array<int, 1>(2));
+      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
-      d_out, Eigen::array<int, 1>(2));
+      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
 
@@ -58,8 +104,8 @@ void test_cuda_elementwise_small() {
 
   for (int i = 0; i < 2; ++i) {
     VERIFY_IS_APPROX(
-        out(Eigen::array<int, 1>(i)),
-        in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
+        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
+        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
   }
 
   cudaFree(d_in1);
@@ -69,10 +115,10 @@ void test_cuda_elementwise_small() {
 
 void test_cuda_elementwise()
 {
-  Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
   in1.setRandom();
   in2.setRandom();
   in3.setRandom();
@@ -98,10 +144,10 @@ void test_cuda_elementwise()
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
 
@@ -111,7 +157,7 @@ void test_cuda_elementwise()
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 53; ++j) {
       for (int k = 0; k < 97; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k)));
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
       }
     }
   }
@@ -181,7 +227,7 @@ void test_cuda_reduction()
   Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
 
-  array<int, 2> reduction_axis;
+  array<Eigen::DenseIndex, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
 
@@ -214,8 +260,8 @@ void test_cuda_contraction()
   // more than 30 * 1024, which is the number of threads in blocks on
   // a 15 SM GK110 GPU
   Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
-  Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
-  Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
 
   t_left.setRandom();
   t_right.setRandom();
@@ -299,7 +345,7 @@ void test_cuda_convolution_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
 
-  Eigen::array<int, 1> dims(1);
+  Eigen::array<Eigen::DenseIndex, 1> dims(1);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -352,7 +398,7 @@ void test_cuda_convolution_inner_dim_col_major_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
 
-  Eigen::array<int, 1> dims(0);
+  Eigen::array<Eigen::DenseIndex, 1> dims(0);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -405,7 +451,7 @@ void test_cuda_convolution_inner_dim_row_major_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
 
-  Eigen::array<int, 1> dims(3);
+  Eigen::array<Eigen::DenseIndex, 1> dims(3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -459,7 +505,7 @@ void test_cuda_convolution_2d()
   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
 
-  Eigen::array<int, 2> dims(1,2);
+  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -496,9 +542,9 @@ void test_cuda_convolution_2d()
 template<int DataLayout>
 void test_cuda_convolution_3d()
 {
-  Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
+  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
   Tensor<float, 3, DataLayout> kernel(3,4,2);
-  Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17));
+  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
   input = input.constant(10.0f) + input.random();
   kernel = kernel.constant(7.0f) + kernel.random();
 
@@ -523,7 +569,7 @@ void test_cuda_convolution_3d()
   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
 
-  Eigen::array<int, 3> dims(1,2,3);
+  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -1019,8 +1065,156 @@ void test_cuda_erfc(const Scalar stddev)
   cudaFree(d_out);
 }
 
+template <typename Scalar>
+void test_cuda_betainc()
+{
+  Tensor<Scalar, 1> in_x(125);
+  Tensor<Scalar, 1> in_a(125);
+  Tensor<Scalar, 1> in_b(125);
+  Tensor<Scalar, 1> out(125);
+  Tensor<Scalar, 1> expected_out(125);
+  out.setZero();
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  // Host tables on a 5x5x5 grid (a slowest, x fastest): inputs a, b, x and reference values v.
+  Array<Scalar, 1, Dynamic> x(125);
+  Array<Scalar, 1, Dynamic> a(125);
+  Array<Scalar, 1, Dynamic> b(125);
+  Array<Scalar, 1, Dynamic> v(125);
+
+  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
+
+  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999;
+
+  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
+
+  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
+      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
+      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
+      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
+      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
+      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
+      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
+      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
+      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
+      0.5000000000004947, 0.9999999999999999, nan;
+
+  for (int i = 0; i < 125; ++i) {
+    in_x(i) = x(i);
+    in_a(i) = a(i);
+    in_b(i) = b(i);
+    expected_out(i) = v(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_a;
+  Scalar* d_in_b;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in_x), bytes);
+  cudaMalloc((void**)(&d_in_a), bytes);
+  cudaMalloc((void**)(&d_in_b), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
+
+  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
+  // VERIFY instead of assert: the copy and the sync must still execute if assert is compiled out.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  // Check every entry, starting at index 0; NaN references are matched with isnan.
+  for (int i = 0; i < 125; ++i) {
+    if ((std::isnan)(expected_out(i))) {
+      VERIFY((std::isnan)(out(i)));
+    } else {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_a);
+  cudaFree(d_in_b);
+  cudaFree(d_out);
+}
+
+
 void test_cxx11_tensor_cuda()
 {
+  CALL_SUBTEST_1(test_cuda_nullary());
   CALL_SUBTEST_1(test_cuda_elementwise_small());
   CALL_SUBTEST_1(test_cuda_elementwise());
   CALL_SUBTEST_1(test_cuda_props());
@@ -1086,5 +1280,8 @@ void test_cxx11_tensor_cuda()
 
   CALL_SUBTEST_5(test_cuda_igamma<double>());
   CALL_SUBTEST_5(test_cuda_igammac<double>());
+
+  CALL_SUBTEST_6(test_cuda_betainc<float>());
+  CALL_SUBTEST_6(test_cuda_betainc<double>());
 #endif
 }
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index cbe9e6449..fde20ddf2 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -13,7 +13,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
@@ -241,7 +243,7 @@ void test_cpu() {
         const float result = out(i,j,k);
         const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
                                (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
-        if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
           continue;
         }
         VERIFY_IS_APPROX(expected, result);
@@ -258,7 +260,7 @@ void test_cpu() {
                                 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
                                (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
                                 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
-        if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
           continue;
         }
         VERIFY_IS_APPROX(expected, result);
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 421e73693..16f168ed4 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -21,7 +21,7 @@ static void test_dynamic_size()
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
-  VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
   VERIFY_IS_EQUAL((int)dimensions[0], 2);
   VERIFY_IS_EQUAL((int)dimensions[1], 3);
   VERIFY_IS_EQUAL((int)dimensions[2], 7);
@@ -34,12 +34,12 @@ static void test_fixed_size()
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
   VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
-  VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
 }
 
 static void test_match()
 {
-  Eigen::DSizes<int, 3> dyn(2,3,7);
+  Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7);
   Eigen::Sizes<2,3,7> stat;
   VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
 
@@ -51,13 +51,13 @@ static void test_match()
 static void test_rank_zero()
 {
   Eigen::Sizes<> scalar;
-  VERIFY_IS_EQUAL(scalar.TotalSize(), 1);
-  VERIFY_IS_EQUAL(scalar.rank(), 0);
-  VERIFY_IS_EQUAL(internal::array_prod(scalar), 1);
+  VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)scalar.rank(), 0);
+  VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1);
 
   Eigen::DSizes<ptrdiff_t, 0> dscalar;
-  VERIFY_IS_EQUAL(dscalar.TotalSize(), 1);
-  VERIFY_IS_EQUAL(dscalar.rank(), 0);
+  VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
 }
 
 void test_cxx11_tensor_dimension()
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 8389e9840..77e24cb67 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -16,8 +16,8 @@ using Eigen::RowMajor;
 
 static void test_1d()
 {
-  Tensor<float, 1> vec1({6});
-  Tensor<float, 1, RowMajor> vec2({6});
+  Tensor<float, 1> vec1(6);
+  Tensor<float, 1, RowMajor> vec2(6);
 
   vec1(0) = 4.0;  vec2(0) = 0.0;
   vec1(1) = 8.0;  vec2(1) = 1.0;
@@ -112,13 +112,13 @@ static void test_3d()
   Tensor<float, 3> mat1(2,3,7);
   Tensor<float, 3, RowMajor> mat2(2,3,7);
 
-  float val = 1.0;
+  float val = 1.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         mat1(i,j,k) = val;
         mat2(i,j,k) = val;
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
@@ -142,7 +142,7 @@ static void test_3d()
   Tensor<float, 3, RowMajor> mat11(2,3,7);
   mat11 = mat2 / 3.14f;
 
-  val = 1.0;
+  val = 1.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
@@ -155,7 +155,7 @@ static void test_3d()
         VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
         VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
         VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
@@ -167,25 +167,25 @@ static void test_constants()
   Tensor<float, 3> mat2(2,3,7);
   Tensor<float, 3> mat3(2,3,7);
 
-  float val = 1.0;
+  float val = 1.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         mat1(i,j,k) = val;
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
   mat2 = mat1.constant(3.14f);
   mat3 = mat1.cwiseMax(7.3f).exp();
 
-  val = 1.0;
+  val = 1.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         VERIFY_IS_APPROX(mat2(i,j,k), 3.14f);
         VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f)));
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
@@ -228,25 +228,25 @@ static void test_functors()
   Tensor<float, 3> mat2(2,3,7);
   Tensor<float, 3> mat3(2,3,7);
 
-  float val = 1.0;
+  float val = 1.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         mat1(i,j,k) = val;
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
   mat2 = mat1.inverse().unaryExpr(&asinf);
   mat3 = mat1.unaryExpr(&tanhf);
 
-  val = 1.0;
+  val = 1.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k)));
         VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k)));
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index 89874349f..2f14ebc62 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -205,15 +205,15 @@ static void test_fft_real_input_energy() {
     VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i));
   }
 
-  float energy_original = 0.0;
-  float energy_after_fft = 0.0;
+  RealScalar energy_original = 0.0;
+  RealScalar energy_after_fft = 0.0;
 
   for (int i = 0; i < total_size; ++i) {
-    energy_original += pow(std::abs(input(i)), 2);
+    energy_original += numext::abs2(input(i));
   }
 
   for (int i = 0; i < total_size; ++i) {
-    energy_after_fft += pow(std::abs(output(i)), 2);
+    energy_after_fft += numext::abs2(output(i));
   }
 
   if(FFTDirection == FFT_FORWARD) {
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 46d741b05..4c660de65 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -188,13 +188,13 @@ static void test_3d()
   //  VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
   //  VERIFY_IS_EQUAL((mat1.dimension(2)), 7);
 
-  float val = 0.0;
+  float val = 0.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         mat1(i,j,k) = val;
         mat2(i,j,k) = val;
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
@@ -210,13 +210,13 @@ static void test_3d()
   //  VERIFY_IS_EQUAL((mat3.dimension(2)), 7);
 
 
-  val = 0.0;
+  val = 0.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val));
         VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val));
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
@@ -226,12 +226,12 @@ static void test_3d()
 static void test_array()
 {
   TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
-  float val = 0.0;
+  float val = 0.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         mat1(i,j,k) = val;
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
@@ -239,12 +239,12 @@ static void test_array()
   TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
   mat3 = mat1.pow(3.5f);
 
-  val = 0.0;
+  val = 0.0f;
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
         VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f));
-        val += 1.0;
+        val += 1.0f;
       }
     }
   }
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 5d6a49181..988b01481 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -568,13 +568,7 @@ static void test_imagenet_patches()
   VERIFY_IS_EQUAL(l_out.dimension(4), 16);
 
   // RowMajor
-  Tensor<float, 4, RowMajor> l_in_row_major = l_in.swap_layout();
-  VERIFY_IS_EQUAL(l_in.dimension(0), l_in_row_major.dimension(3));
-  VERIFY_IS_EQUAL(l_in.dimension(1), l_in_row_major.dimension(2));
-  VERIFY_IS_EQUAL(l_in.dimension(2), l_in_row_major.dimension(1));
-  VERIFY_IS_EQUAL(l_in.dimension(3), l_in_row_major.dimension(0));
-
-  Tensor<float, 5, RowMajor> l_out_row_major = l_in_row_major.extract_image_patches(11, 11);
+  Tensor<float, 5, RowMajor> l_out_row_major = l_in.swap_layout().extract_image_patches(11, 11);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 16);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 128*128);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
@@ -589,10 +583,8 @@ static void test_imagenet_patches()
           for (int r = 0; r < 11; ++r) {
             for (int d = 0; d < 3; ++d) {
               float expected = 0.0f;
-              float expected_row_major = 0.0f;
               if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
                 expected = l_in(d, r-5+i, c-5+j, b);
-                expected_row_major = l_in_row_major(b, c-5+j, r-5+i, d);
               }
               // ColMajor
               if (l_out(d, r, c, patchId, b) != expected) {
@@ -601,15 +593,13 @@ static void test_imagenet_patches()
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
               // RowMajor
               if (l_out_row_major(b, patchId, c, r, d) !=
-                  expected_row_major) {
+                  expected) {
                 std::cout << "Mismatch detected at index i=" << i << " j=" << j
                      << " r=" << r << " c=" << c << " d=" << d << " b=" << b
                      << std::endl;
               }
               VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
-                              expected_row_major);
-              // Check that ColMajor and RowMajor agree.
-              VERIFY_IS_EQUAL(expected, expected_row_major);
+                              expected);
             }
           }
         }
@@ -628,8 +618,7 @@ static void test_imagenet_patches()
   VERIFY_IS_EQUAL(l_out.dimension(4), 32);
 
   // RowMajor
-  l_in_row_major = l_in.swap_layout();
-  l_out_row_major = l_in_row_major.extract_image_patches(9, 9);
+  l_out_row_major = l_in.swap_layout().extract_image_patches(9, 9);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
@@ -644,10 +633,8 @@ static void test_imagenet_patches()
           for (int r = 0; r < 9; ++r) {
             for (int d = 0; d < 16; ++d) {
               float expected = 0.0f;
-              float expected_row_major = 0.0f;
               if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
                 expected = l_in(d, r-4+i, c-4+j, b);
-                expected_row_major = l_in_row_major(b, c-4+j, r-4+i, d);
               }
               // ColMajor
               if (l_out(d, r, c, patchId, b) != expected) {
@@ -655,12 +642,10 @@ static void test_imagenet_patches()
               }
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
               // RowMajor
-              if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) {
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
                 std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
               }
-              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major);
-              // Check that ColMajor and RowMajor agree.
-              VERIFY_IS_EQUAL(expected, expected_row_major);
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
             }
           }
         }
@@ -679,8 +664,7 @@ static void test_imagenet_patches()
   VERIFY_IS_EQUAL(l_out.dimension(4), 32);
 
   // RowMajor
-  l_in_row_major = l_in.swap_layout();
-  l_out_row_major = l_in_row_major.extract_image_patches(7, 7);
+  l_out_row_major = l_in.swap_layout().extract_image_patches(7, 7);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
@@ -695,10 +679,8 @@ static void test_imagenet_patches()
           for (int r = 0; r < 7; ++r) {
             for (int d = 0; d < 32; ++d) {
               float expected = 0.0f;
-              float expected_row_major = 0.0f;
               if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
                 expected = l_in(d, r-3+i, c-3+j, b);
-                expected_row_major = l_in_row_major(b, c-3+j, r-3+i, d);
               }
               // ColMajor
               if (l_out(d, r, c, patchId, b) != expected) {
@@ -706,12 +688,10 @@ static void test_imagenet_patches()
               }
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
               // RowMajor
-              if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) {
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
                 std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
               }
-              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major);
-              // Check that ColMajor and RowMajor agree.
-              VERIFY_IS_EQUAL(expected, expected_row_major);
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
             }
           }
         }
@@ -730,8 +710,7 @@ static void test_imagenet_patches()
   VERIFY_IS_EQUAL(l_out.dimension(4), 32);
 
   // RowMajor
-  l_in_row_major = l_in.swap_layout();
-  l_out_row_major = l_in_row_major.extract_image_patches(3, 3);
+  l_out_row_major = l_in.swap_layout().extract_image_patches(3, 3);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
   VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
@@ -746,10 +725,8 @@ static void test_imagenet_patches()
           for (int r = 0; r < 3; ++r) {
             for (int d = 0; d < 64; ++d) {
               float expected = 0.0f;
-              float expected_row_major = 0.0f;
               if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
                 expected = l_in(d, r-1+i, c-1+j, b);
-                expected_row_major = l_in_row_major(b, c-1+j, r-1+i, d);
               }
               // ColMajor
               if (l_out(d, r, c, patchId, b) != expected) {
@@ -757,12 +734,10 @@ static void test_imagenet_patches()
               }
               VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
               // RowMajor
-              if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) {
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
                 std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
               }
-              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major);
-              // Check that ColMajor and RowMajor agree.
-              VERIFY_IS_EQUAL(expected, expected_row_major);
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
             }
           }
         }
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
index 4ce8dea20..4cf5df666 100644
--- a/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -159,6 +159,111 @@ static void test_type2index_list()
 }
 
 
+static void test_type2indexpair_list()  // Checks IndexPairList built from compile-time (type2indexpair) and runtime (IndexPair) entries.
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);  // NOTE(review): tensor is not referenced by any check below
+  tensor.setRandom();
+  tensor += tensor.constant(10.0f);
+
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0;  // one compile-time pair
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a;  // all three pairs compile-time
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b;  // slot 1 is a runtime pair
+  typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c;  // slots 0 and 2 are runtime pairs
+
+  Dims0 d0;  // NOTE(review): d0 is only declared; the Dims0 checks below go through the type
+  Dims2_a d2_a;
+
+  Dims2_b d2_b;
+  d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11));  // fill the runtime slot with the value the checks expect
+
+  Dims2_c d2_c;
+  d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10)));
+  d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11));  // setting type2indexpair to correct value.
+  d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12));
+
+  VERIFY_IS_EQUAL(d2_a[0].first, 0);  // runtime reads: all three lists must report (0,10), (1,11), (2,12)
+  VERIFY_IS_EQUAL(d2_a[0].second, 10);
+  VERIFY_IS_EQUAL(d2_a[1].first, 1);
+  VERIFY_IS_EQUAL(d2_a[1].second, 11);
+  VERIFY_IS_EQUAL(d2_a[2].first, 2);
+  VERIFY_IS_EQUAL(d2_a[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_b[0].first, 0);
+  VERIFY_IS_EQUAL(d2_b[0].second, 10);
+  VERIFY_IS_EQUAL(d2_b[1].first, 1);
+  VERIFY_IS_EQUAL(d2_b[1].second, 11);
+  VERIFY_IS_EQUAL(d2_b[2].first, 2);
+  VERIFY_IS_EQUAL(d2_b[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_c[0].first, 0);
+  VERIFY_IS_EQUAL(d2_c[0].second, 10);
+  VERIFY_IS_EQUAL(d2_c[1].first, 1);
+  VERIFY_IS_EQUAL(d2_c[1].second, 11);
+  VERIFY_IS_EQUAL(d2_c[2].first, 2);
+  VERIFY_IS_EQUAL(d2_c[2].second, 12);
+
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);  // expected true exactly for the type2indexpair slots
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);  // .first checks: arguments are (slot, candidate value)
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);  // runtime slot: expected false even though the stored value matches
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);  // .second checks mirror the .first checks above
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 10) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+}
+
+
 static void test_dynamic_index_list()
 {
   Tensor<float, 4> tensor(2,3,5,7);
@@ -273,6 +378,7 @@ void test_cxx11_tensor_index_list()
 #ifdef EIGEN_HAS_INDEX_LIST
   CALL_SUBTEST(test_static_index_list());
   CALL_SUBTEST(test_type2index_list());
+  CALL_SUBTEST(test_type2indexpair_list());
   CALL_SUBTEST(test_dynamic_index_list());
   CALL_SUBTEST(test_mixed_index_list());
   CALL_SUBTEST(test_dim_check());
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
index 48aa6d368..8e2b70b75 100644
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -128,7 +128,7 @@ void test_powers_64bit() {
 void test_specific() {
   // A particular combination that was previously failing
   int64_t div = 209715200;
-  int64_t num = 3238002688;
+  int64_t num = 3238002688ll;
   Eigen::internal::TensorIntDivisor<int64_t> divider(div);
   int64_t result = num/div;
   int64_t result_op = divider.divide(num);
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
index 8bbcf7089..489960529 100644
--- a/unsupported/test/cxx11_tensor_io.cpp
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -13,6 +13,20 @@
 #include <Eigen/CXX11/Tensor>
 
 
+template<int DataLayout>  // Checks that streaming a rank-0 tensor prints just its single value.
+static void test_output_0d()
+{
+  Tensor<int, 0, DataLayout> tensor;  // rank 0: accessed with an empty index list
+  tensor() = 123;
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("123");  // the value only, with no separator or trailing newline
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
 template<int DataLayout>
 static void test_output_1d()
 {
@@ -26,6 +40,12 @@ static void test_output_1d()
 
   std::string expected("0\n1\n2\n3\n4");
   VERIFY_IS_EQUAL(std::string(os.str()), expected);
+
+  Eigen::Tensor<double,1,DataLayout> empty_tensor(0);
+  std::stringstream empty_os;
+  empty_os << empty_tensor;
+  std::string empty_string;
+  VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string);
 }
 
 
@@ -101,6 +121,8 @@ static void test_output_const()
 
 void test_cxx11_tensor_io()
 {
+  CALL_SUBTEST(test_output_0d<ColMajor>());
+  CALL_SUBTEST(test_output_0d<RowMajor>());
   CALL_SUBTEST(test_output_1d<ColMajor>());
   CALL_SUBTEST(test_output_1d<RowMajor>());
   CALL_SUBTEST(test_output_2d<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index eb3b891fd..f7de43110 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -13,6 +13,7 @@
 
 using Eigen::Tensor;
 
+template<typename>
 static void test_simple_reshape()
 {
   Tensor<float, 5> tensor1(2,3,1,7,1);
@@ -40,7 +41,7 @@ static void test_simple_reshape()
   }
 }
 
-
+template<typename>
 static void test_reshape_in_expr() {
   MatrixXf m1(2,3*5*7*11);
   MatrixXf m2(3*5*7*11,13);
@@ -65,7 +66,7 @@ static void test_reshape_in_expr() {
   }
 }
 
-
+template<typename>
 static void test_reshape_as_lvalue()
 {
   Tensor<float, 3> tensor(2,3,7);
@@ -114,6 +115,7 @@ static void test_simple_slice()
   }
 }
 
+template<typename=void>
 static void test_const_slice()
 {
   const float b[1] = {42};
@@ -315,6 +317,128 @@ static void test_slice_raw_data()
   VERIFY_IS_EQUAL(slice6.data(), tensor.data());
 }
 
+
+template<int DataLayout>  // Exercises stridedSlice() as an rvalue on rank-2 and rank-5 tensors of the given layout.
+static void test_strided_slice()
+{
+  typedef Tensor<float, 5, DataLayout> Tensor5f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
+  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);  // rank-5 source for the later cases
+  Tensor<float, 2, DataLayout> tensor2(7,11);  // rank-2 source for the first three cases
+  tensor.setRandom();
+  tensor2.setRandom();
+
+  if (true) {  // negative strides on both axes
+    Tensor2f slice(2,3);
+    Index2 strides(-2,-1);
+    Index2 indicesStart(5,7);
+    Index2 indicesStop(0,4);  // NOTE(review): start 5, stop 0, stride -2 looks like rows 5,3,1 (three rows); only j<2 is checked - confirm
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(5-2*j,7-k));
+      }
+    }
+  }
+
+  if(true) {  // start == stop on axis 0; the result is assigned but nothing is verified
+    Tensor2f slice(0,1);
+    Index2 strides(1,1);
+    Index2 indicesStart(5,4);
+    Index2 indicesStop(5,5);
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+  }
+
+  if(true) { // test clamped degenerate intervals
+    Tensor2f slice(7,11);
+    Index2 strides(1,-1);
+    Index2 indicesStart(-3,20); // should become 0,10
+    Index2 indicesStop(20,-11); // should become 11, -1 - NOTE(review): axis 0 of tensor2 has size 7, so 7 looks more likely than 11; confirm
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(j,10-k));  // axis 1 is read back in reverse order
+      }
+    }
+  }
+
+  if(true) {  // unit strides selecting a single element of the rank-5 tensor
+    Tensor5f slice1(1,1,1,1,1);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStart(1, 2, 3, 4, 5);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStop(2, 3, 4, 5, 6);
+    Eigen::DSizes<Eigen::DenseIndex, 5> strides(1, 1, 1, 1, 1);
+    slice1 = tensor.stridedSlice(indicesStart, indicesStop, strides);
+    VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+  }
+
+  if(true) {  // unit strides selecting a 1x1x2x2x3 block
+    Tensor5f slice(1,1,2,2,3);
+    Index5 start(1, 1, 3, 4, 5);
+    Index5 stop(2, 2, 5, 6, 8);
+    Index5 strides(1, 1, 1, 1, 1);
+    slice = tensor.stridedSlice(start, stop, strides);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+        }
+      }
+    }
+  }
+
+  if(true) {  // mixed strides: -2 on axis 2 and -1 on axis 4
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, -2, 1, -1);
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,4-2*i,4+j,7-k));
+        }
+      }
+    }
+  }
+
+  if(false) { // tests degenerate interval (disabled: this branch never executes)
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, 2, 1, 1);  // positive strides with stop < start on axes 2 and 4
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+  }
+}
+
+template<int DataLayout>  // Checks that assigning through stridedSlice() matches assigning through slice().
+static void test_strided_slice_write()
+{
+  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+
+  Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11);
+  tensor.setRandom();
+  tensor2=tensor;  // both targets start from identical contents
+  Tensor2f slice(2,3);
+
+  slice.setRandom();  // the 2x3 block written into both targets
+
+  Index2 strides(1,1);
+  Index2 indicesStart(3,4);
+  Index2 indicesStop(5,7);  // equals indicesStart + lengths, so both calls address the same region
+  Index2 lengths(2,3);
+
+  tensor.slice(indicesStart,lengths)=slice;  // reference write via (start, extent)
+  tensor2.stridedSlice(indicesStart,indicesStop,strides)=slice;  // write under test via (start, stop, stride)
+
+  for(int i=0;i<7;i++) for(int j=0;j<11;j++){  // compare every element, inside and outside the written block
+    VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+  }
+}
+
+
 template<int DataLayout>
 static void test_composition()
 {
@@ -337,20 +461,25 @@ static void test_composition()
 
 void test_cxx11_tensor_morphing()
 {
-  CALL_SUBTEST(test_simple_reshape());
-  CALL_SUBTEST(test_reshape_in_expr());
-  CALL_SUBTEST(test_reshape_as_lvalue());
+  CALL_SUBTEST_1(test_simple_reshape<void>());  // reshape tests are instantiated with a dummy <void> argument
+  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
 
-  CALL_SUBTEST(test_simple_slice<ColMajor>());
-  CALL_SUBTEST(test_simple_slice<RowMajor>());
-  CALL_SUBTEST(test_const_slice());
-  CALL_SUBTEST(test_slice_in_expr<ColMajor>());
-  CALL_SUBTEST(test_slice_in_expr<RowMajor>());
-  CALL_SUBTEST(test_slice_as_lvalue<ColMajor>());
-  CALL_SUBTEST(test_slice_as_lvalue<RowMajor>());
-  CALL_SUBTEST(test_slice_raw_data<ColMajor>());
-  CALL_SUBTEST(test_slice_raw_data<RowMajor>());
+  CALL_SUBTEST_1(test_simple_slice<ColMajor>());
+  CALL_SUBTEST_1(test_simple_slice<RowMajor>());
+  CALL_SUBTEST_1(test_const_slice());
+  CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
+  CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
+  CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
+  CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
+  CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
+  CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
 
-  CALL_SUBTEST(test_composition<ColMajor>());
-  CALL_SUBTEST(test_composition<RowMajor>());
+  CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());  // all strided-slice tests share subtest 6
+  CALL_SUBTEST_6(test_strided_slice<ColMajor>());
+  CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
+  CALL_SUBTEST_6(test_strided_slice<RowMajor>());
+
+  CALL_SUBTEST_7(test_composition<ColMajor>());
+  CALL_SUBTEST_7(test_composition<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 37fe3e9a4..2f86980a2 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -13,14 +13,55 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
 
+template<typename>  // dummy parameter; the test driver instantiates this with <void>
+void test_cuda_numext() {  // checks that scalar_isnan_op gives the same answer on float data and on its half cast
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  // Device buffers: the float input and one bool result buffer per precision.
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+  bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+  // Fill the input with random() - 0.5, then evaluate isnan once in float and once after casting to half.
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+  // Copy both result buffers to the host and synchronize before reading them.
+  Tensor<bool, 1> half_prec(num_elem);
+  Tensor<bool, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
+  gpu_device.synchronize();
+  // The float and half results must agree element by element.
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking numext " << i << std::endl;
+    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+
 #ifdef EIGEN_HAS_CUDA_FP16
 
+template<typename>
 void test_cuda_conversion() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -55,7 +96,7 @@ void test_cuda_conversion() {
   gpu_device.deallocate(d_conv);
 }
 
-
+template<typename>
 void test_cuda_unary() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -92,7 +133,7 @@ void test_cuda_unary() {
   gpu_device.deallocate(d_res_float);
 }
 
-
+template<typename>
 void test_cuda_elementwise() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -124,8 +165,8 @@ void test_cuda_elementwise() {
   gpu_device.synchronize();
 
   for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking elemwise " << i << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
   }
 
   gpu_device.deallocate(d_float1);
@@ -134,6 +175,7 @@ void test_cuda_elementwise() {
   gpu_device.deallocate(d_res_float);
 }
 
+template<typename>
 void test_cuda_trancendental() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -141,43 +183,58 @@ void test_cuda_trancendental() {
 
   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res1_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res1_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res2_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res2_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
 
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
-      d_float1, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
-      d_float2, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res1_half(
-      d_res1_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res1_float(
-      d_res1_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res2_half(
-      d_res2_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res2_float(
-      d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
 
-  gpu_float1.device(gpu_device) = gpu_float1.random();
-  gpu_float2.device(gpu_device) = gpu_float2.random();
-  gpu_res1_float.device(gpu_device) = gpu_float1.exp();
-  gpu_res2_float.device(gpu_device) = gpu_float2.log();
-  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp().cast<float>();
-  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log().cast<float>();
+  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+  gpu_float3.device(gpu_device) = gpu_float3.random();
+  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
+  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
+  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+
+  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
 
   Tensor<float, 1> input1(num_elem);
-  Tensor<float, 1> half_prec1(num_elem);
-  Tensor<float, 1> full_prec1(num_elem);
+  Tensor<Eigen::half, 1> half_prec1(num_elem);
+  Tensor<Eigen::half, 1> full_prec1(num_elem);
   Tensor<float, 1> input2(num_elem);
-  Tensor<float, 1> half_prec2(num_elem);
-  Tensor<float, 1> full_prec2(num_elem);
+  Tensor<Eigen::half, 1> half_prec2(num_elem);
+  Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
   gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
   gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
   gpu_device.synchronize();
 
   for (int i = 0; i < num_elem; ++i) {
@@ -186,17 +243,27 @@ void test_cuda_trancendental() {
   }
   for (int i = 0; i < num_elem; ++i) {
     std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
-    VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+    else
+      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
   }
   gpu_device.deallocate(d_float1);
   gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float3);
   gpu_device.deallocate(d_res1_half);
   gpu_device.deallocate(d_res1_float);
   gpu_device.deallocate(d_res2_half);
   gpu_device.deallocate(d_res2_float);
+  gpu_device.deallocate(d_res3_float);
+  gpu_device.deallocate(d_res3_half);
 }
 
-
+template<typename>
 void test_cuda_contractions() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -206,36 +273,38 @@ void test_cuda_contractions() {
 
   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
 
   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
       d_float1, rows, cols);
   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
       d_float2, rows, cols);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_half(
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
       d_res_half, rows, cols);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_float(
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
       d_res_float, rows, cols);
 
   gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
-  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
 
   typedef Tensor<float, 2>::DimensionPair DimPair;
   Eigen::array<DimPair, 1> dims(DimPair(1, 0));
-  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims);
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims).cast<float>();
+  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
 
-  Tensor<float, 2> half_prec(rows, cols);
-  Tensor<float, 2> full_prec(rows, cols);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
   gpu_device.synchronize();
 
   for (int i = 0; i < rows; ++i) {
     for (int j = 0; j < cols; ++j) {
-      std::cout << "Checking contract " << i << " " << j << std::endl;
-      VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      }
     }
   }
 
@@ -245,42 +314,46 @@ void test_cuda_contractions() {
   gpu_device.deallocate(d_res_float);
 }
 
+template<typename>
+void test_cuda_reductions(int size1, int size2, int redux) {
+
+   std::cout << "Reducing " << size1 << " by " << size2
+             << " tensor along dim " << redux << std::endl; 
 
-void test_cuda_reductions() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
-  int size = 13;
-  int num_elem = size*size;
+  int num_elem = size1*size2;
+  int result_size = (redux == 1 ? size1 : size2);
 
   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float));
-  float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
 
   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, size, size);
+      d_float1, size1, size2);
   Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, size, size);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
-      d_res_half, size);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
-      d_res_float, size);
+      d_float2, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
 
-  gpu_float1.device(gpu_device) = gpu_float1.random();
-  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
+  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
 
-  Eigen::array<int, 1> redux_dim = {{0}};
-  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim);
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim).cast<float>();
+  Eigen::array<int, 1> redux_dim = {{redux}};
+  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
 
-  Tensor<float, 1> half_prec(size);
-  Tensor<float, 1> full_prec(size);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float));
-  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float));
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
   gpu_device.synchronize();
 
-  for (int i = 0; i < size; ++i) {
-    std::cout << "Checking redux " << i << std::endl;
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
     VERIFY_IS_APPROX(full_prec(i), half_prec(i));
   }
 
@@ -290,6 +363,68 @@ void test_cuda_reductions() {
   gpu_device.deallocate(d_res_float);
 }
 
+// Partial reductions on one square and two non-square shapes, along each of the two dimensions.
+template<typename>
+void test_cuda_reductions() {
+  const int shapes[3][2] = { {13, 13}, {35, 36}, {36, 35} };
+
+  for (int s = 0; s < 3; ++s) {
+    for (int redux = 0; redux < 2; ++redux) {
+      test_cuda_reductions<void>(shapes[s][0], shapes[s][1], redux);
+    }
+  }
+}
+
+template<typename>  // dummy parameter; the test driver instantiates this with <void>
+void test_cuda_full_reductions() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int size = 13;
+  int num_elem = size*size;
+  // Two size x size float inputs, plus a single Eigen::half slot for each result.
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+  // The results are viewed through rank-0 TensorMaps.
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, size, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, size, size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();  // NOTE(review): gpu_float2 is filled but not read by the reductions below
+  // Full sum: reduce in float and cast the result to half, versus cast to half and then reduce.
+  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+  // Same comparison for the maximum over all elements, reusing the same result buffers.
+  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>  // dummy parameter; the test driver instantiates this with <void>
 void test_cuda_forced_evals() {
 
   Eigen::CudaStreamDevice stream;
@@ -297,59 +432,62 @@ void test_cuda_forced_evals() {
   int num_elem = 101;
 
   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
   float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
       d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
-      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
       d_res_float, num_elem);
 
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+  // half1: abs in half with a forced eval; half2: the same expression with a factor-1 broadcast before the eval.
   gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
   gpu_res_float.device(gpu_device) = gpu_float.abs();
-  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
 
-  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
   Tensor<float, 1> full_prec(num_elem);
-  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));  // must read d_res_half2, otherwise the broadcast result is never checked
   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
   gpu_device.synchronize();
 
   for (int i = 0; i < num_elem; ++i) {
-    std::cout << "Checking unary " << i << std::endl;
-    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+    std::cout << "Checking forced eval " << i << " " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
   }
 
   gpu_device.deallocate(d_float);
-  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_half1);
+  gpu_device.deallocate(d_res_half2);
   gpu_device.deallocate(d_res_float);
 }
-
 #endif
 
 
 void test_cxx11_tensor_of_float16_cuda()
 {
-#ifdef EIGEN_HAS_CUDA_FP16
-  Eigen::CudaStreamDevice stream;
-  Eigen::GpuDevice device(&stream);
-  if (device.majorDeviceVersion() > 5 ||
-      (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) {
-    std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl;
+  CALL_SUBTEST_1(test_cuda_numext<void>());
 
-    CALL_SUBTEST_1(test_cuda_conversion());
-    CALL_SUBTEST_1(test_cuda_unary());
-    CALL_SUBTEST_1(test_cuda_elementwise());
-    CALL_SUBTEST_1(test_cuda_trancendental());
-    CALL_SUBTEST_2(test_cuda_contractions());
-    CALL_SUBTEST_3(test_cuda_reductions());
-    CALL_SUBTEST_4(test_cuda_forced_evals());
-  }
-  else {
-   std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl;
-  }
+#ifdef EIGEN_HAS_CUDA_FP16
+  CALL_SUBTEST_1(test_cuda_conversion<void>());
+  CALL_SUBTEST_1(test_cuda_unary<void>());
+  CALL_SUBTEST_1(test_cuda_elementwise<void>());
+  CALL_SUBTEST_1(test_cuda_trancendental<void>());
+  CALL_SUBTEST_2(test_cuda_contractions<void>());
+  CALL_SUBTEST_3(test_cuda_reductions<void>());
+  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
+  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
 #else
   std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
 #endif
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index 5d091de15..b3be199e1 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -13,10 +13,61 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
-static void test_default()
+
+// Smoke test: fills a 72x97 float tensor on the GPU with random() and copies it back to the host.
+void test_cuda_random_uniform()
+{
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out = 0;
+  VERIFY_IS_EQUAL(cudaMalloc((void**)(&d_out), out_bytes), cudaSuccess);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  gpu_out.device(gpu_device) = gpu_out.random();
+
+  // The CUDA calls are checked outside assert() so they still run when NDEBUG is defined.
+  VERIFY_IS_EQUAL(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()), cudaSuccess);
+  VERIFY_IS_EQUAL(cudaStreamSynchronize(gpu_device.stream()), cudaSuccess);
+  VERIFY_IS_EQUAL(cudaFree(d_out), cudaSuccess);  // release the buffer obtained from cudaMalloc
+  // For now we just check this code doesn't crash.
+  // TODO: come up with a valid test of randomness
+}
+
+
+// Smoke test: fills a 72x97 float tensor on the GPU via NormalRandomGenerator and copies it back to the host.
+void test_cuda_random_normal()
+{
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out = 0;
+  VERIFY_IS_EQUAL(cudaMalloc((void**)(&d_out), out_bytes), cudaSuccess);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  Eigen::internal::NormalRandomGenerator<float> gen(true);
+  gpu_out.device(gpu_device) = gpu_out.random(gen);
+  // The CUDA calls are checked outside assert() so they still run when NDEBUG is defined.
+  VERIFY_IS_EQUAL(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()), cudaSuccess);
+  VERIFY_IS_EQUAL(cudaStreamSynchronize(gpu_device.stream()), cudaSuccess);
+  VERIFY_IS_EQUAL(cudaFree(d_out), cudaSuccess);  // release the buffer obtained from cudaMalloc
+}
+
+static void test_complex()
 {
   Tensor<std::complex<float>, 1> vec(6);
   vec.setRandom();
@@ -31,5 +82,7 @@ static void test_default()
 
 void test_cxx11_tensor_random_cuda()
 {
-  CALL_SUBTEST(test_default());
+  CALL_SUBTEST(test_cuda_random_uniform());
+  CALL_SUBTEST(test_cuda_random_normal());
+  CALL_SUBTEST(test_complex());
 }
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 6a128901a..1490ec3da 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -239,6 +239,33 @@ static void test_simple_reductions() {
   }
 }
 
+
+// Uses a sum over dimensions 1 and 3 as an operand of a larger expression (1 - sum).
+template <int DataLayout>
+static void test_reductions_in_expr() {
+  Tensor<float, 4, DataLayout> input(2, 3, 5, 7);
+  input.setRandom();
+  array<ptrdiff_t, 2> summed_dims;
+  summed_dims[0] = 1;
+  summed_dims[1] = 3;
+
+  Tensor<float, 2, DataLayout> diff(2, 5);
+  diff = diff.constant(1.0f) - input.sum(summed_dims);
+  VERIFY_IS_EQUAL(diff.dimension(0), 2);
+  VERIFY_IS_EQUAL(diff.dimension(1), 5);
+  // Recompute every entry with plain loops over the two reduced dimensions.
+  for (int d0 = 0; d0 < 2; ++d0) {
+    for (int d2 = 0; d2 < 5; ++d2) {
+      float acc = 0.0f;
+      for (int d1 = 0; d1 < 3; ++d1)
+        for (int d3 = 0; d3 < 7; ++d3)
+          acc += input(d0, d1, d2, d3);
+      VERIFY_IS_APPROX(diff(d0, d2), 1.0f - acc);
+    }
+  }
+}
+
+
 template <int DataLayout>
 static void test_full_reductions() {
   Tensor<float, 2, DataLayout> tensor(2, 3);
@@ -341,7 +368,7 @@ static void test_static_dims() {
   Tensor<float, 2, DataLayout> out(72, 97);
   in.setRandom();
 
-#ifndef EIGEN_HAS_CONSTEXPR 
+#if !EIGEN_HAS_CONSTEXPR 
   array<int, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
@@ -371,7 +398,7 @@ static void test_innermost_last_dims() {
   in.setRandom();
 
 // Reduce on the innermost dimensions.
-#ifndef EIGEN_HAS_CONSTEXPR
+#if !EIGEN_HAS_CONSTEXPR
   array<int, 2> reduction_axis;
   reduction_axis[0] = 0;
   reduction_axis[1] = 1;
@@ -402,7 +429,7 @@ static void test_innermost_first_dims() {
   in.setRandom();
 
 // Reduce on the innermost dimensions.
-#ifndef EIGEN_HAS_CONSTEXPR
+#if !EIGEN_HAS_CONSTEXPR
   array<int, 2> reduction_axis;
   reduction_axis[0] = 2;
   reduction_axis[1] = 3;
@@ -433,7 +460,7 @@ static void test_reduce_middle_dims() {
   in.setRandom();
 
 // Reduce on the innermost dimensions.
-#ifndef EIGEN_HAS_CONSTEXPR
+#if !EIGEN_HAS_CONSTEXPR
   array<int, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 2;
@@ -462,6 +489,8 @@ void test_cxx11_tensor_reduction() {
   CALL_SUBTEST(test_trivial_reductions<RowMajor>());
   CALL_SUBTEST(test_simple_reductions<ColMajor>());
   CALL_SUBTEST(test_simple_reductions<RowMajor>());
+  CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
+  CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
   CALL_SUBTEST(test_full_reductions<ColMajor>());
   CALL_SUBTEST(test_full_reductions<RowMajor>());
   CALL_SUBTEST(test_user_defined_reductions<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index cad0c08e0..6858b43a7 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -12,11 +12,14 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
 #define EIGEN_USE_GPU
 
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 
-template<int DataLayout>
+template<typename Type, int DataLayout>
 static void test_full_reductions() {
 
   Eigen::CudaStreamDevice stream;
@@ -25,24 +28,24 @@ static void test_full_reductions() {
   const int num_rows = internal::random<int>(1024, 5*1024);
   const int num_cols = internal::random<int>(1024, 5*1024);
 
-  Tensor<float, 2, DataLayout> in(num_rows, num_cols);
+  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
   in.setRandom();
 
-  Tensor<float, 0, DataLayout> full_redux;
+  Tensor<Type, 0, DataLayout> full_redux;
   full_redux = in.sum();
 
-  std::size_t in_bytes = in.size() * sizeof(float);
-  std::size_t out_bytes = full_redux.size() * sizeof(float);
-  float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
-  float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
+  std::size_t in_bytes = in.size() * sizeof(Type);
+  std::size_t out_bytes = full_redux.size() * sizeof(Type);
+  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
   gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
 
-  TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
-  TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
+  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
 
   out_gpu.device(gpu_device) = in_gpu.sum();
 
-  Tensor<float, 0, DataLayout> full_redux_gpu;
+  Tensor<Type, 0, DataLayout> full_redux_gpu;
   gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
   gpu_device.synchronize();
 
@@ -53,7 +56,102 @@ static void test_full_reductions() {
   gpu_device.deallocate(gpu_out_ptr);
 }
 
-void test_cxx11_tensor_reduction_cuda() {
-  CALL_SUBTEST_1(test_full_reductions<ColMajor>());
-  CALL_SUBTEST_2(test_full_reductions<RowMajor>());
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {  // Checks a GPU sum over dimension 0 of a rank-3 tensor against the same sum evaluated without a device.
+  int dim_x = 33;
+  int dim_y = 1;
+  int dim_z = 128;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 0;  // reduce along the first dimension (dim_x)
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);  // reference result, evaluated without a device
+
+  // Create the CUDA stream and the GPU device that uses it
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Allocate device buffers for the input and the (dim_y, dim_z) output, and wrap them in TensorMaps
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+  
+  // Perform operation: upload the input, assign the reduction, then accumulate it a second time
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);  // exercises the += path; output now holds twice the reduction
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();  // NOTE(review): presumably the copy above is asynchronous on the stream; wait before reading redux_gpu
+
+  // Check that the GPU result equals twice the reference reduction (it was accumulated twice above).
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {  // Checks a GPU sum over dimension 2 of a rank-3 tensor against the same sum evaluated without a device.
+  int dim_x = 128;
+  int dim_y = 1;
+  int dim_z = 33;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 2;  // reduce along the last dimension (dim_z)
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);  // reference result, evaluated without a device
+
+  // Create the CUDA stream and the GPU device that uses it
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Allocate device buffers for the input and the (dim_x, dim_y) output, and wrap them in TensorMaps
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+  
+  // Perform operation: upload the input, assign the reduction, then accumulate it a second time
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);  // exercises the += path; output now holds twice the reduction
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();  // NOTE(review): presumably the copy above is asynchronous on the stream; wait before reading redux_gpu
+
+  // Check that the GPU result equals twice the reference reduction (it was accumulated twice above).
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+
+void test_cxx11_tensor_reduction_cuda() {  // Test entry point: runs the full, first-dimension and last-dimension reduction checks.
+  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
+  
+  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
+  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+//  CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()));
+
+  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+//  CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
 }
diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
new file mode 100644
index 000000000..af59aa3ef
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan.cpp
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <numeric>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout, typename Type=float, bool Exclusive = false>
+static void test_1d_scan()  // Checks cumsum and cumprod of a random 1-D tensor against a running accumulator.
+{
+  int size = 50;
+  Tensor<Type, 1, DataLayout> tensor(size);
+  tensor.setRandom();
+  Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive);
+
+  VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0));
+
+  Type accum = Type(0);  // accumulate in the tensor's scalar type so the exact comparison holds for any Type
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {  // exclusive scan: result(i) excludes tensor(i), so compare before accumulating
+      VERIFY_IS_EQUAL(result(i), accum);
+      accum += tensor(i);
+    } else {  // inclusive scan: result(i) includes tensor(i)
+      accum += tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);
+    }
+  }
+
+  accum = Type(1);  // restart from the multiplicative identity for the cumprod check
+  result = tensor.cumprod(0, Exclusive);
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {
+      VERIFY_IS_EQUAL(result(i), accum);
+      accum *= tensor(i);
+    } else {
+      accum *= tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);
+    }
+  }
+}
+
+template <int DataLayout, typename Type=float>
+static void test_4d_scan()  // Checks cumsum along each of the four dimensions of a 5x5x5x5 tensor on one fixed line per axis.
+{
+  int size = 5;
+  Tensor<Type, 4, DataLayout> tensor(size, size, size, size);
+  tensor.setRandom();
+
+  Tensor<Type, 4, DataLayout> result(size, size, size, size);
+
+  result = tensor.cumsum(0);
+  Type accum = Type(0);  // accumulate in the tensor's scalar type so the exact comparison holds for any Type
+  for (int i = 0; i < size; i++) {
+    accum += tensor(i, 1, 2, 3);
+    VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum);
+  }
+  result = tensor.cumsum(1);
+  accum = Type(0);  // restart the running sum for the next axis
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, i, 2, 3);
+    VERIFY_IS_EQUAL(result(1, i, 2, 3), accum);
+  }
+  result = tensor.cumsum(2);
+  accum = Type(0);
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, i, 3);
+    VERIFY_IS_EQUAL(result(1, 2, i, 3), accum);
+  }
+  result = tensor.cumsum(3);
+  accum = Type(0);
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, 3, i);
+    VERIFY_IS_EQUAL(result(1, 2, 3, i), accum);
+  }
+}
+
+template <int DataLayout>
+static void test_tensor_maps() {  // Checks that cumsum works when the source is a TensorMap over a plain int array.
+  int inputs[20];
+  TensorMap<Tensor<int, 1, DataLayout> > tensor_map(inputs, 20);  // view over the stack array; no copy is made here
+  tensor_map.setRandom();  // fills inputs[] through the map
+
+  Tensor<int, 1, DataLayout> result = tensor_map.cumsum(0);
+
+  int accum = 0;  // running sum used as the expected inclusive scan value
+  for (int i = 0; i < 20; ++i) {
+    accum += tensor_map(i);
+    VERIFY_IS_EQUAL(result(i), accum);
+  }
+}
+
+void test_cxx11_tensor_scan() {  // Test entry point: runs the 1-D, 4-D and TensorMap scan checks for both layouts.
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));  // third template argument selects the exclusive scan
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, false>()));
+  CALL_SUBTEST(test_4d_scan<ColMajor>());
+  CALL_SUBTEST(test_4d_scan<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
new file mode 100644
index 000000000..761d11fd1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_cuda_cumsum(int m_size, int k_size, int n_size)  // Compares cumsum along dimension 1 on the GPU with the same scan evaluated without a device.
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
+  Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);  // reference result
+  Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);  // GPU result copied back to the host
+
+  t_input.setRandom();
+
+  std::size_t t_input_bytes = t_input.size()  * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_input;  // device buffer for the input
+  float* d_t_result;  // device buffer for the scan output
+
+  cudaMalloc((void**)(&d_t_input), t_input_bytes);  // return codes of the CUDA calls are not checked in this test
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
+
+  gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);  // scan evaluated through the GPU device
+  t_result = t_input.cumsum(1);  // same scan evaluated without a device
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (size_t i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {  // accept small absolute differences
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {  // otherwise accept if isApprox holds at precision 1e-4f
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  t_result_gpu(i) << std::endl;
+    assert(false);  // report the mismatch as a failure (effective only while assertions are enabled)
+  }
+
+  cudaFree((void*)d_t_input);
+  cudaFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_scan_cuda()  // Test entry point: runs the GPU cumsum check on a 128x128x128 tensor for both layouts.
+{
+  CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+}
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
index a03f75cfe..2f56eb495 100644
--- a/unsupported/test/cxx11_tensor_sugar.cpp
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -33,7 +33,7 @@ static void test_comparison_sugar() {
 }
 
 
-static void test_scalar_sugar() {
+static void test_scalar_sugar_add_mul() {
   Tensor<float, 3> A(6, 7, 5);
   Tensor<float, 3> B(6, 7, 5);
   A.setRandom();
@@ -41,21 +41,41 @@ static void test_scalar_sugar() {
 
   const float alpha = 0.43f;
   const float beta = 0.21f;
+  const float gamma = 0.14f;
 
-  Tensor<float, 3> R = A * A.constant(alpha) + B * B.constant(beta);
-  Tensor<float, 3> S = A * alpha + B * beta;
+  Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta);
+  Tensor<float, 3> S = A * alpha + B * beta + gamma;
+  Tensor<float, 3> T = gamma + alpha * A + beta * B;
 
-  // TODO: add enough syntactic sugar to support this
-  // Tensor<float, 3> T = alpha * A + beta * B;
+  for (int i = 0; i < 6*7*5; ++i) {
+    VERIFY_IS_APPROX(R(i), S(i));
+    VERIFY_IS_APPROX(R(i), T(i));
+  }
+}
+
+static void test_scalar_sugar_sub_div() {  // Checks that scalar operands of - and / give the same result as explicit constant() tensors.
+  Tensor<float, 3> A(6, 7, 5);
+  Tensor<float, 3> B(6, 7, 5);
+  A.setRandom();
+  B.setRandom();
+
+  const float alpha = 0.43f;
+  const float beta = 0.21f;
+  const float gamma = 0.14f;
+  const float delta = 0.32f;
+
+  Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha)
+      - B.constant(beta) / B - A.constant(delta);  // reference: every scalar spelled as a constant() tensor
+  Tensor<float, 3> S = gamma - A / alpha - beta / B - delta;  // same expression with scalars on both sides of the operators
 
   for (int i = 0; i < 6*7*5; ++i) {
     VERIFY_IS_APPROX(R(i), S(i));
   }
 }
 
-
 void test_cxx11_tensor_sugar()
 {
   CALL_SUBTEST(test_comparison_sugar());
-  CALL_SUBTEST(test_scalar_sugar());
+  CALL_SUBTEST(test_scalar_sugar_add_mul());
+  CALL_SUBTEST(test_scalar_sugar_sub_div());
 }
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e46197464..2ef665f30 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -91,7 +91,7 @@ void test_multithread_contraction()
 
  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
     VERIFY(&t_result.data()[i] != &m_result.data()[i]);
-    if (fabs(t_result(i) - m_result(i)) < 1e-4) {
+    if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
       continue;
     }
     if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
@@ -132,7 +132,7 @@ void test_contraction_corner_cases()
 
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
     assert(!(numext::isnan)(t_result.data()[i]));
-    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
       std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
     }
@@ -147,7 +147,7 @@ void test_contraction_corner_cases()
   m_result = m_left.transpose() * m_right;
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
     assert(!(numext::isnan)(t_result.data()[i]));
-    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
     }
@@ -165,7 +165,7 @@ void test_contraction_corner_cases()
   m_result = m_left.transpose() * m_right;
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
     assert(!(numext::isnan)(t_result.data()[i]));
-    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
     }
@@ -183,7 +183,7 @@ void test_contraction_corner_cases()
   m_result = m_left.transpose() * m_right;
   for (ptrdiff_t i = 0; i < t_result.size(); i++) {
     assert(!(numext::isnan)(t_result.data()[i]));
-    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
       std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
       assert(false);
     }
@@ -226,13 +226,49 @@ void test_multithread_contraction_agrees_with_singlethread() {
   for (ptrdiff_t i = 0; i < st_result.size(); i++) {
     // if both of the values are very small, then do nothing (because the test will fail
     // due to numerical precision issues when values are small)
-    if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) {
+    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
       VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
     }
   }
 }
 
 
+template<int DataLayout>
+void test_full_contraction() {  // Checks that contracting both dimensions gives the same scalar with and without the thread-pool device.
+  int contract_size1 = internal::random<int>(1, 500);
+  int contract_size2 = internal::random<int>(1, 500);
+
+  Tensor<float, 2, DataLayout> left(contract_size1,
+                                    contract_size2);
+  Tensor<float, 2, DataLayout> right(contract_size1,
+                                    contract_size2);
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});  // pair up both dimensions, so the result has rank 0
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));  // second argument is drawn independently of the pool size
+
+  Tensor<float, 0, DataLayout> st_result;  // evaluated without a device
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 0, DataLayout> tp_result;  // evaluated through the thread-pool device
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  // Absolute differences below 1e-4 are accepted outright; the relative comparison runs only
+  // for larger differences, so that small numerical precision effects do not fail the test.
+  if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
+    VERIFY_IS_APPROX(st_result(), tp_result());
+  }
+}
+
 template<int DataLayout>
 void test_multithreaded_reductions() {
   const int num_threads = internal::random<int>(3, 11);
@@ -324,6 +360,9 @@ void test_cxx11_tensor_thread_pool()
   CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
   CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
 
+  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+
   CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
   CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
 
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index 02411a262..e770049e5 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,12 +9,12 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifdef EIGEN_TEST_PART_1
 
 #include "sparse.h"
 #include <Eigen/SparseExtra>
 #include <Eigen/KroneckerProduct>
 
-
 template<typename MatrixType>
 void check_dimension(const MatrixType& ab, const int rows,  const int cols)
 {
@@ -230,3 +230,23 @@ void test_kronecker_product()
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
   }
 }
+
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+
+// simply check that for a dense kronecker product, sparse module is not needed
+
+#include "main.h"
+#include <Eigen/KroneckerProduct>
+
+void test_kronecker_product()  // PART_2 variant: builds a dense Kronecker product with only main.h and KroneckerProduct included.
+{
+  MatrixXd a(2,2), b(3,3), c;
+  a.setRandom();
+  b.setRandom();
+  c = kroneckerProduct(a,b);
+  VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b);  // the 3x3 block at (3,3) must equal a(1,1) times b
+}
+
+#endif
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 9a995f941..7c9b68a3c 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)
 
   MatrixType scaledA;
   RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
-  if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
-    scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
+  if (maxImagPartOfSpectrum >= RealScalar(0.9L * EIGEN_PI))
+    scaledA = A * RealScalar(0.9L * EIGEN_PI) / maxImagPartOfSpectrum;
   else
     scaledA = A;
 
diff --git a/unsupported/test/matrix_functions.h b/unsupported/test/matrix_functions.h
index 150b4c0c5..4e2636404 100644
--- a/unsupported/test/matrix_functions.h
+++ b/unsupported/test/matrix_functions.h
@@ -61,7 +61,7 @@ struct generateTestMatrix<MatrixType,1>
 };
 
 template <typename Derived, typename OtherDerived>
-double relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B)
+typename Derived::RealScalar relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B)
 {
   return std::sqrt((A - B).cwiseAbs2().sum() / (std::min)(A.cwiseAbs2().sum(), B.cwiseAbs2().sum()));
 }
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index 8e104ed1e..7ccfacfdf 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -10,7 +10,7 @@
 #include "matrix_functions.h"
 
 template<typename T>
-void test2dRotation(double tol)
+void test2dRotation(const T& tol)
 {
   Matrix<T,2,2> A, B, C;
   T angle, c, s;
@@ -19,19 +19,19 @@ void test2dRotation(double tol)
   MatrixPower<Matrix<T,2,2> > Apow(A);
 
   for (int i=0; i<=20; ++i) {
-    angle = pow(10, (i-10) / 5.);
+    angle = std::pow(T(10), (i-10) / T(5.));
     c = std::cos(angle);
     s = std::sin(angle);
     B << c, s, -s, c;
 
-    C = Apow(std::ldexp(angle,1) / EIGEN_PI);
+    C = Apow(std::ldexp(angle,1) / T(EIGEN_PI));
     std::cout << "test2dRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
     VERIFY(C.isApprox(B, tol));
   }
 }
 
 template<typename T>
-void test2dHyperbolicRotation(double tol)
+void test2dHyperbolicRotation(const T& tol)
 {
   Matrix<std::complex<T>,2,2> A, B, C;
   T angle, ch = std::cosh((T)1);
@@ -53,7 +53,7 @@ void test2dHyperbolicRotation(double tol)
 }
 
 template<typename T>
-void test3dRotation(double tol)
+void test3dRotation(const T& tol)
 {
   Matrix<T,3,1> v;
   T angle;
@@ -61,13 +61,13 @@ void test3dRotation(double tol)
   for (int i=0; i<=20; ++i) {
     v = Matrix<T,3,1>::Random();
     v.normalize();
-    angle = pow(10, (i-10) / 5.);
+    angle = std::pow(T(10), (i-10) / T(5.));
     VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
   }
 }
 
 template<typename MatrixType>
-void testGeneral(const MatrixType& m, double tol)
+void testGeneral(const MatrixType& m, const typename MatrixType::RealScalar& tol)
 {
   typedef typename MatrixType::RealScalar RealScalar;
   MatrixType m1, m2, m3, m4, m5;
@@ -97,7 +97,7 @@ void testGeneral(const MatrixType& m, double tol)
 }
 
 template<typename MatrixType>
-void testSingular(const MatrixType& m_const, double tol)
+void testSingular(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)
 {
   // we need to pass by reference in order to prevent errors with
   // MSVC for aligned data types ...
@@ -119,18 +119,18 @@ void testSingular(const MatrixType& m_const, double tol)
     MatrixPower<MatrixType> mpow(m);
 
     T = T.sqrt();
-    VERIFY(mpow(0.5).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+    VERIFY(mpow(0.5L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
 
     T = T.sqrt();
-    VERIFY(mpow(0.25).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+    VERIFY(mpow(0.25L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
 
     T = T.sqrt();
-    VERIFY(mpow(0.125).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+    VERIFY(mpow(0.125L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
   }
 }
 
 template<typename MatrixType>
-void testLogThenExp(const MatrixType& m_const, double tol)
+void testLogThenExp(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)
 {
   // we need to pass by reference in order to prevent errors with
   // MSVC for aligned data types ...
@@ -154,14 +154,14 @@ void test_matrix_power()
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
   CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
-  CALL_SUBTEST_9(test2dRotation<long double>(1e-13)); 
+  CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
   CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
   CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
-  CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14));
+  CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
 
   CALL_SUBTEST_10(test3dRotation<double>(1e-13));
   CALL_SUBTEST_11(test3dRotation<float>(1e-5));
-  CALL_SUBTEST_12(test3dRotation<long double>(1e-13));
+  CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
 
   CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
@@ -171,10 +171,10 @@ void test_matrix_power()
   CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4));
   CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4));
   CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3)); // see bug 614
-  CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13));
+  CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
   CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4));
-  CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13));
+  CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
@@ -184,10 +184,10 @@ void test_matrix_power()
   CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4));
   CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4));
   CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3));
-  CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13));
+  CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
   CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4));
-  CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13));
+  CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
@@ -197,8 +197,8 @@ void test_matrix_power()
   CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4));
   CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4));
   CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3));
-  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13));
+  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
   CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4));
-  CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13));
+  CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
 }
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
index 9b0cf7268..8404f1ff8 100644
--- a/unsupported/test/mpreal/mpreal.h
+++ b/unsupported/test/mpreal/mpreal.h
@@ -99,7 +99,7 @@
 
 // Detect support for explicit converters.
 #if (__has_feature(cxx_explicit_conversions) || \
-       (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR >= 5) || __cplusplus >= 201103L || \
+       (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
        (defined(_MSC_VER) && _MSC_VER >= 1800))
 
     #define MPREAL_HAVE_EXPLICIT_CONVERTERS
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index 1aa9e786a..ffa5691eb 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -17,6 +17,7 @@ void test_mpreal_support()
   std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n";
   std::cerr << "highest =         " << NumTraits<mpreal>::highest() << "\n";
   std::cerr << "lowest =          " << NumTraits<mpreal>::lowest() << "\n";
+  std::cerr << "digits10 =        " << NumTraits<mpreal>::digits10() << "\n";
 
   for(int i = 0; i < g_repeat; i++) {
     int s = Eigen::internal::random<int>(1,100);
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
new file mode 100644
index 000000000..057fb3e92
--- /dev/null
+++ b/unsupported/test/special_functions.cpp
@@ -0,0 +1,345 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)  // checks every x(i) against the reference value y(i)
+{
+  for(Index i=0; i<x.size(); ++i)  // iterates over x.size() coefficients; y is read at the same index
+  {
+    if((numext::isfinite)(y(i)))  // finite reference: approximate comparison
+      VERIFY_IS_APPROX( x(i), y(i) );
+    else if((numext::isnan)(y(i)))  // NaN reference: x(i) only has to be NaN as well
+      VERIFY((numext::isnan)(x(i)));
+    else  // reference is neither finite nor NaN: require exact equality
+      VERIFY_IS_EQUAL( x(i), y(i) );
+  }
+}
+
+template<typename ArrayType> void array_special_functions()
+{
+  using std::abs;
+  using std::sqrt;
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Scalar plusinf = std::numeric_limits<Scalar>::infinity();
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+  Index rows = internal::random<Index>(1,30);
+  Index cols = 1;
+
+  // API
+  {
+    ArrayType m1 = ArrayType::Random(rows,cols);
+#if EIGEN_HAS_C99_MATH
+    VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));
+    VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
+    VERIFY_IS_APPROX(m1.erf(), erf(m1));
+    VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
+#endif  // EIGEN_HAS_C99_MATH
+  }
+
+
+#if EIGEN_HAS_C99_MATH
+  // check special functions (comparing against numpy implementation)
+  if (!NumTraits<Scalar>::IsComplex)
+  {
+
+    {
+      ArrayType m1 = ArrayType::Random(rows,cols);
+      ArrayType m2 = ArrayType::Random(rows,cols);
+
+      // Test various properties of igamma & igammac.  These are normalized
+      // gamma integrals where
+      //   igammac(a, x) = Gamma(a, x) / Gamma(a)
+      //   igamma(a, x) = gamma(a, x) / Gamma(a)
+      // where Gamma and gamma are considered the standard unnormalized
+      // upper and lower incomplete gamma functions, respectively.
+      ArrayType a = m1.abs() + 2;
+      ArrayType x = m2.abs() + 2;
+      ArrayType zero = ArrayType::Zero(rows, cols);
+      ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
+      ArrayType a_m1 = a - one;
+      ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp();
+      ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp();
+      ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
+      ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
+
+      // Gamma(a, 0) == Gamma(a)
+      VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
+
+      // Gamma(a, x) + gamma(a, x) == Gamma(a)
+      VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
+
+      // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+
+      // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+    }
+
+    {
+      // Check exact values of igamma and igammac against a third party calculation.
+      Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+      Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+      // location i*6+j corresponds to a_s[i], x_s[j].
+      Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                              {0.0, 0.6321205588285578, 0.7768698398515702,
+                              0.9816843611112658, 9.999500016666262e-05, 1.0},
+                              {0.0, 0.4275932955291202, 0.608374823728911,
+                              0.9539882943107686, 7.522076445089201e-07, 1.0},
+                              {0.0, 0.01898815687615381, 0.06564245437845008,
+                              0.5665298796332909, 4.166333347221828e-18, 1.0},
+                              {0.0, 0.9999780593618628, 0.9999899967080838,
+                              0.9999996219837988, 0.9991370418689945, 1.0},
+                              {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+      Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                              {1.0, 0.36787944117144233, 0.22313016014842982,
+                                0.018315638888734182, 0.9999000049998333, 0.0},
+                              {1.0, 0.5724067044708798, 0.3916251762710878,
+                                0.04601170568923136, 0.9999992477923555, 0.0},
+                              {1.0, 0.9810118431238462, 0.9343575456215499,
+                                0.4334701203667089, 1.0, 0.0},
+                              {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                                3.7801620118431334e-07, 0.0008629581310054535,
+                                0.0},
+                              {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+      for (int i = 0; i < 6; ++i) {  // i selects a_s[i] and the row of both reference tables
+        for (int j = 0; j < 6; ++j) {  // j selects x_s[j] and the column of both reference tables
+          if ((numext::isnan)(igamma_s[i][j])) {  // NaN reference: only require a NaN result
+            VERIFY((numext::isnan)(numext::igamma(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
+          }
+
+          if ((numext::isnan)(igammac_s[i][j])) {  // same rule for the igammac table
+            VERIFY((numext::isnan)(numext::igammac(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+          }
+        }
+      }
+    }
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+  // Check the zeta function against scipy.special.zeta
+  {
+    ArrayType x(7), q(7), res(7), ref(7);
+    x << 1.5,   4, 10.5, 10000.5,    3, 1,        0.9;
+    q << 2,   1.5,    3,  1.0001, -2.5, 1.2345, 1.2345;
+    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+    CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
+  }
+
+  // digamma
+  {
+    ArrayType x(7), res(7), ref(7);
+    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1;
+    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = digamma(x);  verify_component_wise(res, ref); );
+  }
+
+
+#if EIGEN_HAS_C99_MATH
+  {
+    ArrayType n(11), x(11), res(11), ref(11);
+    n << 1, 1,    1, 1.5,   17,   31,   28,    8, 42, 147, 170;
+    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64;
+    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    if(sizeof(RealScalar)>=8) {  // double
+      // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); );
+      CALL_SUBTEST( res = polygamma(n,x);  verify_component_wise(res, ref); );
+    }
+    else {
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); );
+      CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); );
+    }
+  }
+#endif
+
+#if EIGEN_HAS_C99_MATH
+  {
+    // Inputs and ground truth generated with scipy via:
+    //   a = np.logspace(-3, 3, 5) - 1e-3
+    //   b = np.logspace(-3, 3, 5) - 1e-3
+    //   x = np.linspace(-0.1, 1.1, 5)
+    //   (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: (a, b, x))(*np.ix_(a, b, x))
+    //   full_a = full_a.flatten().tolist()  # same for full_b, full_x
+    //   v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist()
+    //
+    // Note: the betainc calls below pass the arguments in the same (a, b, x) order as scipy.
+    ArrayType a(125);
+    ArrayType b(125);
+    ArrayType x(125);
+    ArrayType v(125);
+    ArrayType res(125);
+
+    a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999;
+
+    b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+        0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999,
+        999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999,
+        0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999;
+
+    x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+        -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+        1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1;
+
+    v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+        0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+        0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+        0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan,
+        nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+        0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+        0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+        0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+        0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+        1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06,
+        nan, nan, 7.864342668429763e-23, 3.015969667594166e-10,
+        0.0008598571564165444, nan, nan, 6.031987710123844e-08,
+        0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999,
+        0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan,
+        nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan,
+        0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0,
+        3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
+        2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
+
+    CALL_SUBTEST(res = betainc(a, b, x);
+                 verify_component_wise(res, v););
+  }
+
+  // Test various properties of betainc
+  {
+    ArrayType m1 = ArrayType::Random(32);
+    ArrayType m2 = ArrayType::Random(32);
+    ArrayType m3 = ArrayType::Random(32);
+    ArrayType one = ArrayType::Constant(32, Scalar(1.0));
+    const Scalar eps = std::numeric_limits<Scalar>::epsilon();
+    ArrayType a = (m1 * 4.0).exp();
+    ArrayType b = (m2 * 4.0).exp();
+    ArrayType x = m3.abs();
+
+    // betainc(a, 1, x) == x**a
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, one, x);
+        ArrayType expected = x.pow(a);
+        verify_component_wise(test, expected););
+
+    // betainc(1, b, x) == 1 - (1 - x)**b
+    CALL_SUBTEST(
+        ArrayType test = betainc(one, b, x);
+        ArrayType expected = one - (one - x).pow(b);
+        verify_component_wise(test, expected););
+
+    // betainc(a, b, x) == 1 - betainc(b, a, 1-x)
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, b, x) + betainc(b, a, one - x);
+        ArrayType expected = one;
+        verify_component_wise(test, expected););
+
+    // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b))
+    CALL_SUBTEST(
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType expected = betainc(a, b, x) - num / denom + eps;
+        ArrayType test = betainc(a + one, b, x) + eps;
+        if (sizeof(Scalar) >= 8) { // double
+          verify_component_wise(test, expected);
+        } else {
+          // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+          verify_component_wise(test.head(8), expected.head(8));
+        });
+
+    // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b))
+    CALL_SUBTEST(
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        ArrayType expected = betainc(a, b, x) + num / denom + eps;
+        ArrayType test = betainc(a, b + one, x) + eps;
+        verify_component_wise(test, expected););
+  }
+#endif
+}
+
+void test_special_functions()  // runs array_special_functions once per array type listed below
+{
+  CALL_SUBTEST_1(array_special_functions<ArrayXf>());  // subtest 1: ArrayXf instantiation
+  CALL_SUBTEST_2(array_special_functions<ArrayXd>());  // subtest 2: ArrayXd instantiation
+}