mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-13 20:26:03 +08:00
Rebase to latest.
This commit is contained in:
commit
f0fdefa96f
@ -19,7 +19,7 @@ extern "C" {
|
||||
/** \ingroup Support_modules
|
||||
* \defgroup CholmodSupport_Module CholmodSupport module
|
||||
*
|
||||
* This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
|
||||
* This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
|
||||
* It provides the two following main factorization classes:
|
||||
* - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
|
||||
* - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
|
||||
|
@ -17,7 +17,7 @@
|
||||
/** \ingroup Support_modules
|
||||
* \defgroup SPQRSupport_Module SuiteSparseQR module
|
||||
*
|
||||
* This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
|
||||
* This module provides an interface to the SPQR library, which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/SPQRSupport>
|
||||
|
@ -19,7 +19,7 @@ extern "C" {
|
||||
/** \ingroup Support_modules
|
||||
* \defgroup UmfPackSupport_Module UmfPackSupport module
|
||||
*
|
||||
* This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
|
||||
* This module provides an interface to the UmfPack library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
|
||||
* It provides the following factorization class:
|
||||
* - class UmfPackLU: a multifrontal sequential LU factorization.
|
||||
*
|
||||
|
@ -273,9 +273,10 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
const Index size = m_cholmodFactor->n;
|
||||
EIGEN_UNUSED_VARIABLE(size);
|
||||
eigen_assert(size==b.rows());
|
||||
|
||||
// Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref.
|
||||
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
|
||||
|
||||
// note: cd stands for Cholmod Dense
|
||||
Rhs& b_ref(b.const_cast_derived());
|
||||
cholmod_dense b_cd = viewAsCholmod(b_ref);
|
||||
cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
|
||||
if(!x_cd)
|
||||
|
@ -103,7 +103,7 @@ template<typename Derived> class ArrayBase
|
||||
/** Special case of the template operator=, in order to prevent the compiler
|
||||
* from generating a default operator= (issue hit with g++ 4.1)
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator=(const ArrayBase& other)
|
||||
{
|
||||
internal::call_assignment(derived(), other.derived());
|
||||
@ -112,28 +112,28 @@ template<typename Derived> class ArrayBase
|
||||
|
||||
/** Set all the entries to \a value.
|
||||
* \sa DenseBase::setConstant(), DenseBase::fill() */
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator=(const Scalar &value)
|
||||
{ Base::setConstant(value); return derived(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator+=(const Scalar& scalar);
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator-=(const Scalar& scalar);
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator+=(const ArrayBase<OtherDerived>& other);
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator-=(const ArrayBase<OtherDerived>& other);
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator*=(const ArrayBase<OtherDerived>& other);
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator/=(const ArrayBase<OtherDerived>& other);
|
||||
|
||||
public:
|
||||
|
@ -52,7 +52,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
const Scalar
|
||||
>::type ScalarWithConstIfNotLvalue;
|
||||
|
||||
typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
|
||||
typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
|
||||
@ -67,7 +67,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
inline Index innerStride() const { return m_expression.innerStride(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
|
||||
inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar* data() const { return m_expression.data(); }
|
||||
|
||||
@ -80,13 +80,13 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Scalar& coeffRef(Index rowId, Index colId)
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(rowId, colId);
|
||||
return m_expression.coeffRef(rowId, colId);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index rowId, Index colId) const
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(rowId, colId);
|
||||
return m_expression.coeffRef(rowId, colId);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -98,13 +98,13 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(index);
|
||||
return m_expression.coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index index) const
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(index);
|
||||
return m_expression.coeffRef(index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
@ -116,7 +116,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
|
||||
m_expression.template writePacket<LoadMode>(rowId, colId, val);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
@ -128,7 +128,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index index, const PacketScalar& val)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
|
||||
m_expression.template writePacket<LoadMode>(index, val);
|
||||
}
|
||||
|
||||
template<typename Dest>
|
||||
@ -145,11 +145,11 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
/** Forwards the resizing request to the nested expression
|
||||
* \sa DenseBase::resize(Index) */
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
|
||||
void resize(Index newSize) { m_expression.resize(newSize); }
|
||||
/** Forwards the resizing request to the nested expression
|
||||
* \sa DenseBase::resize(Index,Index)*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
|
||||
void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
|
||||
|
||||
protected:
|
||||
NestedExpressionType m_expression;
|
||||
@ -195,7 +195,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
|
||||
const Scalar
|
||||
>::type ScalarWithConstIfNotLvalue;
|
||||
|
||||
typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
|
||||
typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
|
||||
@ -210,7 +210,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
|
||||
inline Index innerStride() const { return m_expression.innerStride(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
|
||||
inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar* data() const { return m_expression.data(); }
|
||||
|
||||
@ -223,7 +223,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Scalar& coeffRef(Index rowId, Index colId)
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(rowId, colId);
|
||||
return m_expression.coeffRef(rowId, colId);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -241,13 +241,13 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(index);
|
||||
return m_expression.coeffRef(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index index) const
|
||||
{
|
||||
return m_expression.const_cast_derived().coeffRef(index);
|
||||
return m_expression.coeffRef(index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
@ -259,7 +259,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
|
||||
m_expression.template writePacket<LoadMode>(rowId, colId, val);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
@ -271,7 +271,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index index, const PacketScalar& val)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
|
||||
m_expression.template writePacket<LoadMode>(index, val);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -284,11 +284,11 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
|
||||
/** Forwards the resizing request to the nested expression
|
||||
* \sa DenseBase::resize(Index) */
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
|
||||
void resize(Index newSize) { m_expression.resize(newSize); }
|
||||
/** Forwards the resizing request to the nested expression
|
||||
* \sa DenseBase::resize(Index,Index)*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
|
||||
void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
|
||||
|
||||
protected:
|
||||
NestedExpressionType m_expression;
|
||||
|
@ -637,7 +637,7 @@ protected:
|
||||
***************************************************************************/
|
||||
|
||||
template<typename DstXprType, typename SrcXprType, typename Functor>
|
||||
EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
|
||||
@ -654,7 +654,7 @@ EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const S
|
||||
}
|
||||
|
||||
template<typename DstXprType, typename SrcXprType>
|
||||
EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
|
||||
{
|
||||
call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
|
||||
}
|
||||
@ -688,26 +688,30 @@ struct Assignment;
|
||||
// does not has to bother about these annoying details.
|
||||
|
||||
template<typename Dst, typename Src>
|
||||
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment(Dst& dst, const Src& src)
|
||||
{
|
||||
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
|
||||
}
|
||||
template<typename Dst, typename Src>
|
||||
EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment(const Dst& dst, const Src& src)
|
||||
{
|
||||
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
|
||||
}
|
||||
|
||||
// Deal with "assume-aliasing"
|
||||
template<typename Dst, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
|
||||
{
|
||||
typename plain_matrix_type<Src>::type tmp(src);
|
||||
call_assignment_no_alias(dst, tmp, func);
|
||||
}
|
||||
|
||||
template<typename Dst, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
|
||||
{
|
||||
call_assignment_no_alias(dst, src, func);
|
||||
}
|
||||
@ -715,14 +719,16 @@ EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& fun
|
||||
// by-pass "assume-aliasing"
|
||||
// When there is no aliasing, we require that 'dst' has been properly resized
|
||||
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
|
||||
{
|
||||
call_assignment_no_alias(dst.expression(), src, func);
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
|
||||
{
|
||||
enum {
|
||||
NeedToTranspose = ( (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
|
||||
@ -747,13 +753,15 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const
|
||||
Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
|
||||
}
|
||||
template<typename Dst, typename Src>
|
||||
EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment_no_alias(Dst& dst, const Src& src)
|
||||
{
|
||||
call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
|
||||
}
|
||||
|
||||
template<typename Dst, typename Src, typename Func>
|
||||
EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
Index dstCols = src.cols();
|
||||
@ -767,7 +775,8 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src
|
||||
Assignment<Dst,Src,Func>::run(dst, src, func);
|
||||
}
|
||||
template<typename Dst, typename Src>
|
||||
EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
|
||||
{
|
||||
call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
|
||||
}
|
||||
@ -779,7 +788,8 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con
|
||||
template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
|
||||
struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
|
||||
|
@ -129,8 +129,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
|
||||
: Impl(xpr, startRow, startCol)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
|
||||
eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows()
|
||||
&& startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols());
|
||||
eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows()
|
||||
&& startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols());
|
||||
}
|
||||
|
||||
/** Dynamic-size constructor
|
||||
@ -221,15 +221,13 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
|
||||
inline Scalar& coeffRef(Index rowId, Index colId)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(XprType)
|
||||
return m_xpr.const_cast_derived()
|
||||
.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
|
||||
return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index rowId, Index colId) const
|
||||
{
|
||||
return m_xpr.derived()
|
||||
.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
|
||||
return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -242,39 +240,34 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
|
||||
inline Scalar& coeffRef(Index index)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(XprType)
|
||||
return m_xpr.const_cast_derived()
|
||||
.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index index) const
|
||||
{
|
||||
return m_xpr.const_cast_derived()
|
||||
.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_xpr
|
||||
.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline PacketScalar packet(Index rowId, Index colId) const
|
||||
{
|
||||
return m_xpr.template packet<Unaligned>
|
||||
(rowId + m_startRow.value(), colId + m_startCol.value());
|
||||
return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
|
||||
{
|
||||
m_xpr.const_cast_derived().template writePacket<Unaligned>
|
||||
(rowId + m_startRow.value(), colId + m_startCol.value(), val);
|
||||
m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
@ -288,7 +281,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index index, const PacketScalar& val)
|
||||
{
|
||||
m_xpr.const_cast_derived().template writePacket<Unaligned>
|
||||
m_xpr.template writePacket<Unaligned>
|
||||
(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
|
||||
}
|
||||
@ -320,7 +313,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
|
||||
|
||||
protected:
|
||||
|
||||
const typename XprType::Nested m_xpr;
|
||||
typename XprType::Nested m_xpr;
|
||||
const internal::variable_if_dynamic<StorageIndex, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
|
||||
const internal::variable_if_dynamic<StorageIndex, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
|
||||
const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
|
||||
|
@ -148,7 +148,8 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
if (IsRowMajor)
|
||||
return m_data[row * m_outerStride.value() + col];
|
||||
@ -156,12 +157,14 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
return m_data[row + col * m_outerStride.value()];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_data[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
if (IsRowMajor)
|
||||
return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
|
||||
@ -169,12 +172,14 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return const_cast<Scalar*>(m_data)[index];
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
if (IsRowMajor)
|
||||
@ -184,12 +189,14 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return ploadt<PacketType, LoadMode>(m_data + index);
|
||||
}
|
||||
|
||||
template<int StoreMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index row, Index col, const PacketType& x)
|
||||
{
|
||||
if (IsRowMajor)
|
||||
@ -201,6 +208,7 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
|
||||
@ -260,45 +268,53 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.coeff(col, row);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_argImpl.coeff(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_argImpl.coeffRef(col, row);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
typename XprType::Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_argImpl.coeffRef(index);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.template packet<LoadMode,PacketType>(col, row);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_argImpl.template packet<LoadMode,PacketType>(index);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index row, Index col, const PacketType& x)
|
||||
{
|
||||
m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
m_argImpl.template writePacket<StoreMode,PacketType>(index, x);
|
||||
@ -338,23 +354,27 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_functor(row, col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_functor(index);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_functor.template packetOp<Index,PacketType>(row, col);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_functor.template packetOp<Index,PacketType>(index);
|
||||
@ -380,7 +400,8 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
|
||||
Alignment = evaluator<ArgType>::Alignment
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit unary_evaluator(const XprType& op)
|
||||
: m_functor(op.functor()),
|
||||
m_argImpl(op.nestedExpression())
|
||||
{
|
||||
@ -390,23 +411,27 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_functor(m_argImpl.coeff(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_functor(m_argImpl.coeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
|
||||
@ -466,17 +491,20 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
|
||||
@ -484,6 +512,7 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
|
||||
@ -523,22 +552,26 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_unaryOp(m_argImpl.coeff(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_unaryOp(m_argImpl.coeff(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_unaryOp(m_argImpl.coeffRef(row, col));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_unaryOp(m_argImpl.coeffRef(index));
|
||||
}
|
||||
@ -578,47 +611,55 @@ struct mapbase_evaluator : evaluator_base<Derived>
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_data[index * m_xpr.innerStride()];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_data[index * m_xpr.innerStride()];
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
|
||||
return internal::ploadt<PacketType, LoadMode>(ptr);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index row, Index col, const PacketType& x)
|
||||
{
|
||||
PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
|
||||
return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
|
||||
@ -767,46 +808,54 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
RowsAtCompileTime = XprType::RowsAtCompileTime
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index row, Index col, const PacketType& x)
|
||||
{
|
||||
return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
|
||||
@ -859,7 +908,7 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
|
||||
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
|
||||
};
|
||||
|
||||
inline EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
|
||||
: m_conditionImpl(select.conditionMatrix()),
|
||||
m_thenImpl(select.thenMatrix()),
|
||||
m_elseImpl(select.elseMatrix())
|
||||
@ -869,7 +918,8 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
if (m_conditionImpl.coeff(row, col))
|
||||
return m_thenImpl.coeff(row, col);
|
||||
@ -877,7 +927,8 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
|
||||
return m_elseImpl.coeff(row, col);
|
||||
}
|
||||
|
||||
inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
if (m_conditionImpl.coeff(index))
|
||||
return m_thenImpl.coeff(index);
|
||||
@ -921,7 +972,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
|
||||
m_cols(replicate.nestedExpression().cols())
|
||||
{}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
// try to avoid using modulo; this is a pure optimization strategy
|
||||
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
|
||||
@ -934,7 +986,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
|
||||
return m_argImpl.coeff(actual_row, actual_col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
// try to avoid using modulo; this is a pure optimization strategy
|
||||
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
|
||||
@ -945,6 +998,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
|
||||
@ -958,6 +1012,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
|
||||
@ -994,7 +1049,7 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
CoeffReadCost = TraversalSize==Dynamic ? HugeCost
|
||||
: TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
|
||||
|
||||
Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))),
|
||||
Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit,
|
||||
|
||||
Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
|
||||
};
|
||||
@ -1008,7 +1063,8 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar coeff(Index i, Index j) const
|
||||
{
|
||||
if (Direction==Vertical)
|
||||
return m_functor(m_arg.col(j));
|
||||
@ -1016,7 +1072,8 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
return m_functor(m_arg.row(i));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar coeff(Index index) const
|
||||
{
|
||||
if (Direction==Vertical)
|
||||
return m_functor(m_arg.col(index));
|
||||
@ -1051,45 +1108,53 @@ struct evaluator_wrapper_base
|
||||
typedef typename ArgType::Scalar Scalar;
|
||||
typedef typename ArgType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.coeff(row, col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_argImpl.coeff(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_argImpl.coeffRef(row, col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_argImpl.coeffRef(index);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.template packet<LoadMode,PacketType>(row, col);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return m_argImpl.template packet<LoadMode,PacketType>(index);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index row, Index col, const PacketType& x)
|
||||
{
|
||||
m_argImpl.template writePacket<StoreMode>(row, col, x);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
template<int StoreMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
m_argImpl.template writePacket<StoreMode>(index, x);
|
||||
@ -1164,29 +1229,34 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
|
||||
m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
|
||||
{ }
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{
|
||||
return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
|
||||
ReverseCol ? m_cols.value() - col - 1 : col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
|
||||
ReverseCol ? m_cols.value() - col - 1 : col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index row, Index col) const
|
||||
{
|
||||
enum {
|
||||
@ -1201,6 +1271,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
enum { PacketSize = unpacket_traits<PacketType>::size };
|
||||
@ -1208,6 +1279,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index row, Index col, const PacketType& x)
|
||||
{
|
||||
// FIXME we could factorize some code with packet(i,j)
|
||||
@ -1224,6 +1296,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
enum { PacketSize = unpacket_traits<PacketType>::size };
|
||||
@ -1267,22 +1340,26 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
|
||||
typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
|
||||
typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index row, Index) const
|
||||
{
|
||||
return m_argImpl.coeff(row + rowOffset(), row + colOffset());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return m_argImpl.coeff(index + rowOffset(), index + colOffset());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index row, Index)
|
||||
{
|
||||
return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
|
||||
}
|
||||
|
@ -32,8 +32,8 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
|
||||
// we still want to handle the case when the result type is different.
|
||||
typedef typename result_of<
|
||||
BinaryOp(
|
||||
typename Lhs::Scalar,
|
||||
typename Rhs::Scalar
|
||||
const typename Lhs::Scalar&,
|
||||
const typename Rhs::Scalar&
|
||||
)
|
||||
>::type Scalar;
|
||||
typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
|
||||
|
@ -19,7 +19,7 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
|
||||
: traits<XprType>
|
||||
{
|
||||
typedef typename result_of<
|
||||
UnaryOp(typename XprType::Scalar)
|
||||
UnaryOp(const typename XprType::Scalar&)
|
||||
>::type Scalar;
|
||||
typedef typename XprType::Nested XprTypeNested;
|
||||
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
|
||||
@ -58,33 +58,34 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal
|
||||
|
||||
typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
|
||||
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
|
||||
typedef typename internal::ref_selector<XprType>::type XprTypeNested;
|
||||
typedef typename internal::remove_all<XprType>::type NestedExpression;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
|
||||
: m_xpr(xpr), m_functor(func) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index rows() const { return m_xpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Index cols() const { return m_xpr.cols(); }
|
||||
|
||||
/** \returns the functor representing the unary operation */
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const UnaryOp& functor() const { return m_functor; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const typename internal::remove_all<XprTypeNested>::type&
|
||||
nestedExpression() const { return m_xpr; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename internal::remove_all<typename XprType::Nested>::type&
|
||||
nestedExpression() { return m_xpr.const_cast_derived(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
typename internal::remove_all<XprTypeNested>::type&
|
||||
nestedExpression() { return m_xpr; }
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
XprTypeNested m_xpr;
|
||||
const UnaryOp m_functor;
|
||||
};
|
||||
|
||||
|
@ -18,7 +18,7 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
|
||||
: traits<MatrixType>
|
||||
{
|
||||
typedef typename result_of<
|
||||
ViewOp(typename traits<MatrixType>::Scalar)
|
||||
ViewOp(const typename traits<MatrixType>::Scalar&)
|
||||
>::type Scalar;
|
||||
typedef typename MatrixType::Nested MatrixTypeNested;
|
||||
typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
|
||||
@ -61,6 +61,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
|
||||
|
||||
typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
|
||||
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
|
||||
typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
|
||||
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
|
||||
|
||||
explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
|
||||
@ -75,15 +76,15 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
|
||||
const ViewOp& functor() const { return m_functor; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
const typename internal::remove_all<typename MatrixType::Nested>::type&
|
||||
const typename internal::remove_all<MatrixTypeNested>::type&
|
||||
nestedExpression() const { return m_matrix; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
typename internal::remove_all<typename MatrixType::Nested>::type&
|
||||
typename internal::remove_reference<MatrixTypeNested>::type&
|
||||
nestedExpression() { return m_matrix.const_cast_derived(); }
|
||||
|
||||
protected:
|
||||
typename internal::ref_selector<MatrixType>::type m_matrix;
|
||||
MatrixTypeNested m_matrix;
|
||||
ViewOp m_functor;
|
||||
};
|
||||
|
||||
|
@ -275,13 +275,13 @@ template<typename Derived> class DenseBase
|
||||
|
||||
/** Copies \a other into *this. \returns a reference to *this. */
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator=(const DenseBase<OtherDerived>& other);
|
||||
|
||||
/** Special case of the template operator=, in order to prevent the compiler
|
||||
* from generating a default operator= (issue hit with g++ 4.1)
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator=(const DenseBase& other);
|
||||
|
||||
template<typename OtherDerived>
|
||||
@ -388,10 +388,10 @@ template<typename Derived> class DenseBase
|
||||
inline bool hasNaN() const;
|
||||
inline bool allFinite() const;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Derived& operator*=(const Scalar& other);
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Derived& operator/=(const Scalar& other);
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator*=(const Scalar& other);
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator/=(const Scalar& other);
|
||||
|
||||
typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
|
||||
/** \returns the matrix or vector obtained by evaluating this expression.
|
||||
|
@ -103,21 +103,21 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
>::type ScalarWithConstIfNotLvalue;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
|
||||
inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
|
||||
inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Scalar& coeffRef(Index row, Index)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
|
||||
return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
|
||||
return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index row, Index) const
|
||||
{
|
||||
return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
|
||||
return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -130,13 +130,13 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
inline Scalar& coeffRef(Index idx)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
|
||||
return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
|
||||
return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Scalar& coeffRef(Index idx) const
|
||||
{
|
||||
return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
|
||||
return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -159,7 +159,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
}
|
||||
|
||||
protected:
|
||||
typename MatrixType::Nested m_matrix;
|
||||
typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
|
||||
const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
|
||||
|
||||
private:
|
||||
|
@ -142,6 +142,52 @@ inline void MatrixBase<Derived>::normalize()
|
||||
derived() /= numext::sqrt(z);
|
||||
}
|
||||
|
||||
/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow.
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* This method is analogue to the normalized() method, but it reduces the risk of
|
||||
* underflow and overflow when computing the norm.
|
||||
*
|
||||
* \warning If the input vector is too small (i.e., this->norm()==0),
|
||||
* then this function returns a copy of the input.
|
||||
*
|
||||
* \sa stableNorm(), stableNormalize(), normalized()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline const typename MatrixBase<Derived>::PlainObject
|
||||
MatrixBase<Derived>::stableNormalized() const
|
||||
{
|
||||
typedef typename internal::nested_eval<Derived,3>::type _Nested;
|
||||
_Nested n(derived());
|
||||
RealScalar w = n.cwiseAbs().maxCoeff();
|
||||
RealScalar z = (n/w).squaredNorm();
|
||||
if(z>RealScalar(0))
|
||||
return n / (numext::sqrt(z)*w);
|
||||
else
|
||||
return n;
|
||||
}
|
||||
|
||||
/** Normalizes the vector while avoiding underflow and overflow
*
* \only_for_vectors
*
* This method is analogous to the normalize() method, but it reduces the risk of
* underflow and overflow when computing the norm.
*
* \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
*
* \sa stableNorm(), stableNormalized(), normalize()
*/
template<typename Derived>
inline void MatrixBase<Derived>::stableNormalize()
{
  RealScalar w = cwiseAbs().maxCoeff();
  RealScalar z = (derived()/w).squaredNorm();
  if(z>RealScalar(0))
    derived() /= numext::sqrt(z)*w;
}
|
||||
|
||||
//---------- implementation of other norms ----------
|
||||
|
||||
namespace internal {
|
||||
|
@ -75,6 +75,7 @@ struct default_packet_traits
|
||||
HasCosh = 0,
|
||||
HasTanh = 0,
|
||||
HasLGamma = 0,
|
||||
HasDiGamma = 0,
|
||||
HasErf = 0,
|
||||
HasErfc = 0,
|
||||
|
||||
@ -284,7 +285,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
|
||||
{ pstore(to, from); }
|
||||
|
||||
/** \internal tries to do cache prefetching of \a addr */
|
||||
template<typename Scalar> inline void prefetch(const Scalar* addr)
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(__LP64__)
|
||||
@ -439,6 +440,10 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
|
||||
|
||||
/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
|
||||
|
||||
/** \internal \returns the erf(\a a) (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet perf(const Packet& a) { using numext::erf; return erf(a); }
|
||||
|
@ -50,6 +50,7 @@ namespace Eigen
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
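With the digamma entry added above, these special functions are exposed as coefficient-wise free functions on array expressions. A hedged sketch (it assumes the C99 math backends, i.e. EIGEN_HAS_C99_MATH, are available):

#include <Eigen/Dense>

void special_functions_demo()
{
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 1.0, 4.0);
  Eigen::ArrayXd l = Eigen::lgamma(a);   // ln|Gamma(a_i)|, coefficient-wise
  Eigen::ArrayXd d = Eigen::digamma(a);  // psi(a_i), the new entry declared above
  Eigen::ArrayXd e = Eigen::erf(a);      // Gauss error function, coefficient-wise
}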
@ -748,9 +748,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
|
||||
}
|
||||
|
||||
//MSVC defines a _isnan builtin function, but for double only
|
||||
EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x); }
|
||||
EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x); }
|
||||
EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x); }
|
||||
EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
|
||||
EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; }
|
||||
EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; }
|
||||
|
||||
EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
|
||||
EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); }
|
||||
@ -1080,21 +1080,21 @@ struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::
|
||||
|
||||
template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
|
||||
inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
|
||||
typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
|
||||
const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
|
||||
{
|
||||
return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
|
||||
}
|
||||
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC
|
||||
inline bool isApprox(const Scalar& x, const Scalar& y,
|
||||
typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
|
||||
const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
|
||||
{
|
||||
return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
|
||||
}
|
||||
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC
|
||||
inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
|
||||
typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
|
||||
const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
|
||||
{
|
||||
return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
|
||||
}
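For reference, a small illustrative sketch of these helpers; the precision argument is now taken by const reference and still defaults to NumTraits<Scalar>::dummy_precision().

#include <Eigen/Core>
#include <cassert>

void fuzzy_compare_demo()
{
  double a = 1.0, b = 1.0 + 1e-14;
  assert(Eigen::internal::isApprox(a, b));           // within the default fuzz for double
  assert(!Eigen::internal::isApprox(a, 1.1, 1e-3));  // explicit, tighter precision
}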
@ -135,14 +135,14 @@ template<typename Derived> class MatrixBase
|
||||
/** Special case of the template operator=, in order to prevent the compiler
|
||||
* from generating a default operator= (issue hit with g++ 4.1)
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator=(const MatrixBase& other);
|
||||
|
||||
// We cannot inherit here via Base::operator= since it is causing
|
||||
// trouble with MSVC.
|
||||
|
||||
template <typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator=(const DenseBase<OtherDerived>& other);
|
||||
|
||||
template <typename OtherDerived>
|
||||
@ -154,10 +154,10 @@ template<typename Derived> class MatrixBase
|
||||
Derived& operator=(const ReturnByValue<OtherDerived>& other);
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator+=(const MatrixBase<OtherDerived>& other);
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator-=(const MatrixBase<OtherDerived>& other);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
@ -204,7 +204,9 @@ template<typename Derived> class MatrixBase
|
||||
RealScalar blueNorm() const;
|
||||
RealScalar hypotNorm() const;
|
||||
EIGEN_DEVICE_FUNC const PlainObject normalized() const;
|
||||
EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const;
|
||||
EIGEN_DEVICE_FUNC void normalize();
|
||||
EIGEN_DEVICE_FUNC void stableNormalize();
|
||||
|
||||
EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
|
||||
EIGEN_DEVICE_FUNC void adjointInPlace();
|
||||
|
@ -32,7 +32,7 @@ namespace internal {
|
||||
template<typename MatrixType, unsigned int UpLo>
|
||||
struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
|
||||
{
|
||||
typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
|
||||
typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
|
||||
typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
|
||||
typedef MatrixType ExpressionType;
|
||||
typedef typename MatrixType::PlainObject FullMatrixType;
|
||||
@ -97,7 +97,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
|
||||
Base::check_coordinates_internal(row, col);
|
||||
return m_matrix.const_cast_derived().coeffRef(row, col);
|
||||
return m_matrix.coeffRef(row, col);
|
||||
}
|
||||
|
||||
/** \internal */
|
||||
@ -107,7 +107,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
EIGEN_DEVICE_FUNC
|
||||
const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
|
||||
MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }
|
||||
|
||||
/** Efficient triangular matrix times vector/matrix product */
|
||||
template<typename OtherDerived>
|
||||
|
@ -13,79 +13,349 @@
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Parts of this code are based on the Cephes Math Library.
|
||||
//
|
||||
// Cephes Math Library Release 2.8: June, 2000
|
||||
// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
|
||||
//
|
||||
// Permission has been kindly provided by the original author
|
||||
// to incorporate the Cephes software into the Eigen codebase:
|
||||
//
|
||||
// From: Stephen Moshier
|
||||
// To: Eugene Brevdo
|
||||
// Subject: Re: Permission to wrap several cephes functions in Eigen
|
||||
//
|
||||
// Hello Eugene,
|
||||
//
|
||||
// Thank you for writing.
|
||||
//
|
||||
// If your licensing is similar to BSD, the formal way that has been
|
||||
// handled is simply to add a statement to the effect that you are incorporating
|
||||
// the Cephes software by permission of the author.
|
||||
//
|
||||
// Good luck with your project,
|
||||
// Steve
|
||||
|
||||
namespace cephes {
|
||||
|
||||
/* polevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate polynomial
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N+1];
|
||||
*
|
||||
* y = polevl<decltype(x), N>( x, coef);
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates polynomial of degree N:
|
||||
*
|
||||
*    y = C_0 + C_1*x + C_2*x^2 + ... + C_N*x^N
*
* Coefficients are stored in reverse order:
*
*    coef[0] = C_N, ..., coef[N] = C_0.
|
||||
*
|
||||
* The function p1evl() assumes that coef[N] = 1.0 and is
|
||||
* omitted from the array. Its calling arguments are
|
||||
* otherwise the same as polevl().
|
||||
*
|
||||
*
|
||||
* The Eigen implementation is templatized. For best speed, store
|
||||
* coef as a const array (constexpr), e.g.
|
||||
*
|
||||
* const double coef[] = {1.0, 2.0, 3.0, ...};
|
||||
*
|
||||
*/
|
||||
template <typename Scalar, int N>
|
||||
struct polevl {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static Scalar run(const Scalar x, const Scalar coef[]) {
|
||||
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
struct polevl<Scalar, 0> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static Scalar run(const Scalar, const Scalar coef[]) {
|
||||
return coef[0];
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace cephes
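An illustrative sketch of what the recursion above unrolls at compile time: Horner's rule, with the coefficients stored highest degree first (coef[0] = C_N).

#include <cassert>

// Runtime reference for polevl<Scalar,N>::run(x, coef):
// y = (...((coef[0]*x + coef[1])*x + coef[2])...)*x + coef[N]
double horner_reference(double x, const double* coef, int degree)
{
  double y = coef[0];
  for (int i = 1; i <= degree; ++i)
    y = y * x + coef[i];
  return y;
}

void polevl_demo()
{
  const double coef[] = {2.0, 3.0, 4.0};          // 2*x^2 + 3*x + 4
  assert(horner_reference(5.0, coef, 2) == 69.0);
  // Inside Eigen: Eigen::internal::cephes::polevl<double, 2>::run(5.0, coef) gives the same value.
}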
/****************************************************************************
|
||||
* Implementation of lgamma *
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar>
|
||||
struct lgamma_impl
|
||||
{
|
||||
template <typename Scalar>
|
||||
struct lgamma_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
|
||||
{
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
|
||||
THIS_TYPE_IS_NOT_SUPPORTED);
|
||||
return Scalar(0);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct lgamma_retval
|
||||
{
|
||||
template <typename Scalar>
|
||||
struct lgamma_retval {
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_C99_MATH
|
||||
template<>
|
||||
struct lgamma_impl<float>
|
||||
{
|
||||
template <>
|
||||
struct lgamma_impl<float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(const float& x) { return ::lgammaf(x); }
|
||||
static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); }
|
||||
};
|
||||
|
||||
template<>
|
||||
struct lgamma_impl<double>
|
||||
{
|
||||
template <>
|
||||
struct lgamma_impl<double> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); }
|
||||
static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); }
|
||||
};
|
||||
#endif
|
||||
|
||||
/****************************************************************************
|
||||
* Implementation of digamma (psi) *
|
||||
****************************************************************************/
|
||||
|
||||
#ifdef EIGEN_HAS_C99_MATH
|
||||
|
||||
/*
|
||||
*
|
||||
* Polynomial evaluation helper for the Psi (digamma) function.
|
||||
*
|
||||
* digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for
|
||||
* input Scalar s, assuming s is above 10.0.
|
||||
*
|
||||
* If s is above a certain threshold for the given Scalar type, zero
|
||||
* is returned. Otherwise the polynomial is evaluated with enough
|
||||
* coefficients for results matching Scalar machine precision.
|
||||
*
|
||||
*
|
||||
*/
|
||||
template <typename Scalar>
|
||||
struct digamma_impl_maybe_poly {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
|
||||
THIS_TYPE_IS_NOT_SUPPORTED);
|
||||
return Scalar(0);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <>
|
||||
struct digamma_impl_maybe_poly<float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE float run(const float s) {
|
||||
const float A[] = {
|
||||
-4.16666666666666666667E-3,
|
||||
3.96825396825396825397E-3,
|
||||
-8.33333333333333333333E-3,
|
||||
8.33333333333333333333E-2
|
||||
};
|
||||
|
||||
float z;
|
||||
if (s < 1.0e8f) {
|
||||
z = 1.0f / (s * s);
|
||||
return z * cephes::polevl<float, 3>::run(z, A);
|
||||
} else return 0.0f;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct digamma_impl_maybe_poly<double> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(const double s) {
|
||||
const double A[] = {
|
||||
8.33333333333333333333E-2,
|
||||
-2.10927960927960927961E-2,
|
||||
7.57575757575757575758E-3,
|
||||
-4.16666666666666666667E-3,
|
||||
3.96825396825396825397E-3,
|
||||
-8.33333333333333333333E-3,
|
||||
8.33333333333333333333E-2
|
||||
};
|
||||
|
||||
double z;
|
||||
if (s < 1.0e17) {
|
||||
z = 1.0 / (s * s);
|
||||
return z * cephes::polevl<double, 6>::run(z, A);
|
||||
}
|
||||
else return 0.0;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // EIGEN_HAS_C99_MATH
|
||||
|
||||
template <typename Scalar>
|
||||
struct digamma_retval {
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_C99_MATH
|
||||
template <typename Scalar>
|
||||
struct digamma_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static Scalar run(Scalar x) {
|
||||
/*
|
||||
*
|
||||
* Psi (digamma) function (modified for Eigen)
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* double x, y, psi();
|
||||
*
|
||||
* y = psi( x );
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
*    psi(x) = d/dx ln Gamma(x)
*
* is the logarithmic derivative of the gamma function.
* For integer x,
*
*    psi(n) = -EUL + sum_{k=1}^{n-1} 1/k.
|
||||
*
|
||||
* If x is negative, it is transformed to a positive argument by the
|
||||
* reflection formula psi(1-x) = psi(x) + pi cot(pi x).
|
||||
* For general positive x, the argument is made greater than 10
|
||||
* using the recurrence psi(x+1) = psi(x) + 1/x.
|
||||
* Then the following asymptotic expansion is applied:
|
||||
*
|
||||
*    psi(x) = log(x) - 1/(2x) - sum_{k=1}^{inf} B_{2k} / (2k * x^{2k})
*
* where the B_{2k} are Bernoulli numbers.
|
||||
*
|
||||
* ACCURACY (float):
*    Relative error (except absolute when |psi| < 1):
*    arithmetic   domain     # trials      peak        rms
*       IEEE       0,30       30000       1.3e-15    1.4e-16
*       IEEE      -30,0       40000       1.5e-15    2.2e-16
*
* ACCURACY (double):
*    Absolute error, relative when |psi| > 1:
*    arithmetic   domain     # trials      peak        rms
*       IEEE      -33,0       30000       8.2e-7     1.2e-7
*       IEEE       0,33      100000       7.3e-7     7.7e-8
*
|
||||
* ERROR MESSAGES:
|
||||
* message condition value returned
|
||||
* psi singularity x integer <=0 INFINITY
|
||||
*/
|
||||
|
||||
Scalar p, q, nz, s, w, y;
|
||||
bool negative;
|
||||
|
||||
const Scalar maxnum = std::numeric_limits<Scalar>::infinity();
|
||||
const Scalar m_pi = 3.14159265358979323846;
|
||||
|
||||
negative = 0;
|
||||
nz = 0.0;
|
||||
|
||||
const Scalar zero = 0.0;
|
||||
const Scalar one = 1.0;
|
||||
const Scalar half = 0.5;
|
||||
|
||||
if (x <= zero) {
|
||||
negative = one;
|
||||
q = x;
|
||||
p = ::floor(q);
|
||||
if (p == q) {
|
||||
return maxnum;
|
||||
}
|
||||
/* Remove the zeros of tan(m_pi x)
|
||||
* by subtracting the nearest integer from x
|
||||
*/
|
||||
nz = q - p;
|
||||
if (nz != half) {
|
||||
if (nz > half) {
|
||||
p += one;
|
||||
nz = q - p;
|
||||
}
|
||||
nz = m_pi / ::tan(m_pi * nz);
|
||||
}
|
||||
else {
|
||||
nz = zero;
|
||||
}
|
||||
x = one - x;
|
||||
}
|
||||
|
||||
/* use the recurrence psi(x+1) = psi(x) + 1/x. */
|
||||
s = x;
|
||||
w = zero;
|
||||
while (s < Scalar(10)) {
|
||||
w += one / s;
|
||||
s += one;
|
||||
}
|
||||
|
||||
y = digamma_impl_maybe_poly<Scalar>::run(s);
|
||||
|
||||
y = ::log(s) - (half / s) - y - w;
|
||||
|
||||
return (negative) ? y - nz : y;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // EIGEN_HAS_C99_MATH
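An illustrative check of the scalar entry point (assumes EIGEN_HAS_C99_MATH): psi(1) = -EulerGamma, and the recurrence psi(x+1) = psi(x) + 1/x used by run() gives psi(2) = 1 - EulerGamma.

#include <Eigen/Core>
#include <cassert>
#include <cmath>

void digamma_demo()
{
  const double euler_gamma = 0.5772156649015329;
  assert(std::abs(Eigen::numext::digamma(1.0) + euler_gamma) < 1e-12);
  assert(std::abs(Eigen::numext::digamma(2.0) - (1.0 - euler_gamma)) < 1e-12);
}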
/****************************************************************************
|
||||
* Implementation of erf *
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar>
|
||||
struct erf_impl
|
||||
{
|
||||
template <typename Scalar>
|
||||
struct erf_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
|
||||
{
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
|
||||
THIS_TYPE_IS_NOT_SUPPORTED);
|
||||
return Scalar(0);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct erf_retval
|
||||
{
|
||||
template <typename Scalar>
|
||||
struct erf_retval {
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_C99_MATH
|
||||
template<>
|
||||
struct erf_impl<float>
|
||||
{
|
||||
template <>
|
||||
struct erf_impl<float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE float run(const float& x) { return ::erff(x); }
|
||||
static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
|
||||
};
|
||||
|
||||
template<>
|
||||
struct erf_impl<double>
|
||||
{
|
||||
template <>
|
||||
struct erf_impl<double> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); }
|
||||
static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
|
||||
};
|
||||
#endif // EIGEN_HAS_C99_MATH
|
||||
|
||||
@ -93,35 +363,30 @@ struct erf_impl<double>
|
||||
* Implementation of erfc *
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar>
|
||||
struct erfc_impl
|
||||
{
|
||||
template <typename Scalar>
|
||||
struct erfc_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
|
||||
{
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
|
||||
THIS_TYPE_IS_NOT_SUPPORTED);
|
||||
return Scalar(0);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct erfc_retval
|
||||
{
|
||||
template <typename Scalar>
|
||||
struct erfc_retval {
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_C99_MATH
|
||||
template<>
|
||||
struct erfc_impl<float>
|
||||
{
|
||||
template <>
|
||||
struct erfc_impl<float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
|
||||
};
|
||||
|
||||
template<>
|
||||
struct erfc_impl<double>
|
||||
{
|
||||
template <>
|
||||
struct erfc_impl<double> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
|
||||
};
|
||||
@ -129,27 +394,29 @@ struct erfc_impl<double>
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
namespace numext {
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) lgamma(const Scalar& x)
|
||||
{
|
||||
template <typename Scalar>
|
||||
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar)
|
||||
lgamma(const Scalar& x) {
|
||||
return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x)
|
||||
{
|
||||
template <typename Scalar>
|
||||
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar)
|
||||
digamma(const Scalar& x) {
|
||||
return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar)
|
||||
erf(const Scalar& x) {
|
||||
return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x)
|
||||
{
|
||||
template <typename Scalar>
|
||||
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
|
||||
erfc(const Scalar& x) {
|
||||
return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
|
||||
}
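An illustrative check (assumes EIGEN_HAS_C99_MATH): erf and erfc are wired to the C99 functions above and satisfy erf(x) + erfc(x) = 1.

#include <Eigen/Core>
#include <cassert>
#include <cmath>

void erf_demo()
{
  const double x = 0.7;
  assert(std::abs(Eigen::numext::erf(x) + Eigen::numext::erfc(x) - 1.0) < 1e-15);
}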
@ -54,6 +54,8 @@ template<typename MatrixType> class Transpose
|
||||
{
|
||||
public:
|
||||
|
||||
typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
|
||||
|
||||
typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
|
||||
EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
|
||||
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
|
||||
@ -68,16 +70,16 @@ template<typename MatrixType> class Transpose
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<typename MatrixType::Nested>::type&
|
||||
const typename internal::remove_all<MatrixTypeNested>::type&
|
||||
nestedExpression() const { return m_matrix; }
|
||||
|
||||
/** \returns the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename internal::remove_all<typename MatrixType::Nested>::type&
|
||||
nestedExpression() { return m_matrix.const_cast_derived(); }
|
||||
typename internal::remove_reference<MatrixTypeNested>::type&
|
||||
nestedExpression() { return m_matrix; }
|
||||
|
||||
protected:
|
||||
typename MatrixType::Nested m_matrix;
|
||||
typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
@ -325,7 +325,7 @@ class TranspositionsWrapper
|
||||
|
||||
protected:
|
||||
|
||||
const typename IndicesType::Nested m_indices;
|
||||
typename IndicesType::Nested m_indices;
|
||||
};
|
||||
|
||||
|
||||
|
@ -168,7 +168,7 @@ namespace internal {
|
||||
template<typename MatrixType, unsigned int _Mode>
|
||||
struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
|
||||
{
|
||||
typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
|
||||
typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
|
||||
typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
|
||||
typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
|
||||
typedef typename MatrixType::PlainObject FullMatrixType;
|
||||
@ -213,7 +213,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
IsVectorAtCompileTime = false
|
||||
};
|
||||
|
||||
// FIXME This, combined with const_cast_derived in transpose() leads to a const-correctness loophole
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
|
||||
{}
|
||||
@ -235,7 +234,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
|
||||
/** \returns a reference to the nested expression */
|
||||
EIGEN_DEVICE_FUNC
|
||||
NestedExpression& nestedExpression() { return *const_cast<NestedExpression*>(&m_matrix); }
|
||||
NestedExpression& nestedExpression() { return m_matrix; }
|
||||
|
||||
typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
|
||||
/** \sa MatrixBase::conjugate() const */
|
||||
@ -255,7 +254,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
|
||||
inline TransposeReturnType transpose()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
|
||||
typename MatrixType::TransposeReturnType tmp(m_matrix.const_cast_derived());
|
||||
typename MatrixType::TransposeReturnType tmp(m_matrix);
|
||||
return TransposeReturnType(tmp);
|
||||
}
|
||||
|
||||
@ -418,7 +417,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
|
||||
Base::check_coordinates_internal(row, col);
|
||||
return derived().nestedExpression().const_cast_derived().coeffRef(row, col);
|
||||
return derived().nestedExpression().coeffRef(row, col);
|
||||
}
|
||||
|
||||
/** Assigns a triangular matrix to a triangular part of a dense matrix */
|
||||
|
@ -124,7 +124,7 @@ struct member_lpnorm {
|
||||
template <typename BinaryOp, typename Scalar>
|
||||
struct member_redux {
|
||||
typedef typename result_of<
|
||||
BinaryOp(Scalar,Scalar)
|
||||
BinaryOp(const Scalar&,const Scalar&)
|
||||
>::type result_type;
|
||||
template<typename _Scalar, int Size> struct Cost
|
||||
{ enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
|
||||
|
@ -197,7 +197,7 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
|
||||
/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff()
|
||||
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename IndexType>
|
||||
@ -215,7 +215,7 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
|
||||
/** \returns the minimum of all coefficients of *this and puts in *index its location.
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff()
|
||||
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename IndexType>
|
||||
@ -233,7 +233,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
|
||||
/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*
|
||||
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
|
||||
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename IndexType>
|
||||
|
@ -78,6 +78,20 @@ double2 plgamma<double2>(const double2& a)
|
||||
return make_double2(lgamma(a.x), lgamma(a.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 pdigamma<float4>(const float4& a)
|
||||
{
|
||||
using numext::digamma;
|
||||
return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
double2 pdigamma<double2>(const double2& a)
|
||||
{
|
||||
using numext::digamma;
|
||||
return make_double2(digamma(a.x), digamma(a.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 perf<float4>(const float4& a)
|
||||
{
|
||||
|
@ -40,6 +40,7 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasLGamma = 1,
|
||||
HasDiGamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
|
||||
@ -63,6 +64,7 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasLGamma = 1,
|
||||
HasDiGamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
|
||||
|
@ -37,7 +37,7 @@ template<typename Scalar>
|
||||
struct functor_traits<scalar_identity_op<Scalar> >
|
||||
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
|
||||
|
||||
template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_op_impl;
|
||||
template <typename Scalar, typename Packet, bool RandomAccess, bool IsInteger> struct linspaced_op_impl;
|
||||
|
||||
// linear access for packet ops:
|
||||
// 1) initialization
|
||||
@ -48,12 +48,12 @@ template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_
|
||||
// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
|
||||
// in order to avoid the padd() in operator() ?
|
||||
template <typename Scalar, typename Packet>
|
||||
struct linspaced_op_impl<Scalar,Packet,false>
|
||||
struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/false,/*IsInteger*/false>
|
||||
{
|
||||
linspaced_op_impl(const Scalar& low, const Scalar& step) :
|
||||
m_low(low), m_step(step),
|
||||
m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*step)),
|
||||
m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}
|
||||
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
|
||||
m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
|
||||
m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*m_step)),
|
||||
m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(m_step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}
|
||||
|
||||
template<typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const
|
||||
@ -75,11 +75,11 @@ struct linspaced_op_impl<Scalar,Packet,false>
|
||||
// 1) each step
|
||||
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
|
||||
template <typename Scalar, typename Packet>
|
||||
struct linspaced_op_impl<Scalar,Packet,true>
|
||||
struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/false>
|
||||
{
|
||||
linspaced_op_impl(const Scalar& low, const Scalar& step) :
|
||||
m_low(low), m_step(step),
|
||||
m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}
|
||||
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
|
||||
m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
|
||||
m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}
|
||||
|
||||
template<typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
|
||||
@ -95,6 +95,31 @@ struct linspaced_op_impl<Scalar,Packet,true>
|
||||
const Packet m_interPacket;
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Packet>
|
||||
struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/true>
|
||||
{
|
||||
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
|
||||
m_low(low), m_length(high-low), m_divisor(num_steps==1?1:num_steps-1), m_interPacket(plset<Packet>(0))
|
||||
{}
|
||||
|
||||
template<typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Scalar operator() (Index i) const {
|
||||
return m_low + (m_length*Scalar(i))/m_divisor;
|
||||
}
|
||||
|
||||
template<typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Packet packetOp(Index i) const {
|
||||
return internal::padd(pset1<Packet>(m_low), pdiv(pmul(pset1<Packet>(m_length), padd(pset1<Packet>(Scalar(i)),m_interPacket)),
|
||||
pset1<Packet>(m_divisor))); }
|
||||
|
||||
const Scalar m_low;
|
||||
const Scalar m_length;
|
||||
const Index m_divisor;
|
||||
const Packet m_interPacket;
|
||||
};
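A small sketch of the integer formula used above: low + (i*(high-low))/(num_steps-1) lands exactly on high at the last step, which a pre-rounded integer increment cannot guarantee.

#include <cassert>

void integer_linspace_demo()
{
  const int low = 0, high = 10, num_steps = 4;
  const int length = high - low, divisor = num_steps - 1;
  const int expected[] = {0, 3, 6, 10};            // a fixed step of (10-0)/3 = 3 would end at 9
  for (int i = 0; i < num_steps; ++i)
    assert(low + (length * i) / divisor == expected[i]);
  // With this change, VectorXi::LinSpaced(4, 0, 10) should follow the same sequence.
}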
// ----- Linspace functor ----------------------------------------------------------------
|
||||
|
||||
// Forward declaration (we default to random access which does not really give
|
||||
@ -102,10 +127,20 @@ struct linspaced_op_impl<Scalar,Packet,true>
|
||||
// nested expressions).
|
||||
template <typename Scalar, typename PacketType, bool RandomAccess = true> struct linspaced_op;
|
||||
template <typename Scalar, typename PacketType, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,PacketType,RandomAccess> >
|
||||
{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
|
||||
{
|
||||
enum
|
||||
{
|
||||
Cost = 1,
|
||||
PacketAccess = packet_traits<Scalar>::HasSetLinear
|
||||
&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),
|
||||
IsRepeatable = true
|
||||
};
|
||||
};
|
||||
template <typename Scalar, typename PacketType, bool RandomAccess> struct linspaced_op
|
||||
{
|
||||
linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}
|
||||
linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
|
||||
: impl((num_steps==1 ? high : low),high,num_steps)
|
||||
{}
|
||||
|
||||
template<typename Index>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
|
||||
@ -134,7 +169,9 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa
|
||||
// This proxy object handles the actual required temporaries, the different
|
||||
// implementations (random vs. sequential access) as well as the
|
||||
// correct piping to size 2/4 packet operations.
|
||||
const linspaced_op_impl<Scalar,PacketType,RandomAccess> impl;
|
||||
// As long as we don't have a Bresenham-like implementation for linear-access and integer types,
|
||||
// we have to by-pass RandomAccess for integer types. See bug 698.
|
||||
const linspaced_op_impl<Scalar,PacketType,(NumTraits<Scalar>::IsInteger?true:RandomAccess),NumTraits<Scalar>::IsInteger> impl;
|
||||
};
|
||||
|
||||
// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
|
||||
|
@ -427,6 +427,28 @@ struct functor_traits<scalar_lgamma_op<Scalar> >
|
||||
};
|
||||
};
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute psi, the derivative of lgamma of a scalar.
|
||||
* \sa class CwiseUnaryOp, Cwise::digamma()
|
||||
*/
|
||||
template<typename Scalar> struct scalar_digamma_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
|
||||
using numext::digamma; return digamma(a);
|
||||
}
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
|
||||
};
|
||||
template<typename Scalar>
|
||||
struct functor_traits<scalar_digamma_op<Scalar> >
|
||||
{
|
||||
enum {
|
||||
// Guesstimate
|
||||
Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
|
||||
PacketAccess = packet_traits<Scalar>::HasDiGamma
|
||||
};
|
||||
};
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute the Gauss error function of a
|
||||
* scalar
|
||||
@ -644,7 +666,7 @@ struct functor_traits<scalar_floor_op<Scalar> >
|
||||
template<typename Scalar> struct scalar_ceil_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); }
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); }
|
||||
};
|
||||
template<typename Scalar>
|
||||
|
@ -252,7 +252,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// we have both L2 and L3, and problem is small enough to be kept in L2
|
||||
// Let's choose m such that lhs's block fit in 1/3 of L2
|
||||
actual_lm = l2;
|
||||
max_mc = 576;
|
||||
max_mc = (std::min<Index>)(576,max_mc);
|
||||
}
|
||||
Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
|
||||
if (mc > Traits::mr) mc -= mc % Traits::mr;
|
||||
|
@ -352,9 +352,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
||||
}
|
||||
else // no l3 blocking
|
||||
{
|
||||
Index m = this->m_mc;
|
||||
Index n = this->m_nc;
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads);
|
||||
}
|
||||
|
||||
m_sizeA = this->m_mc * this->m_kc;
|
||||
|
@ -42,13 +42,14 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
{
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
|
||||
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
|
||||
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
|
||||
{
|
||||
general_matrix_matrix_triangular_product<Index,
|
||||
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
|
||||
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
|
||||
ColMajor, UpLo==Lower?Upper:Lower>
|
||||
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
|
||||
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
|
||||
}
|
||||
};
|
||||
|
||||
@ -58,7 +59,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
{
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
|
||||
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
|
||||
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
|
||||
{
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
@ -69,16 +71,18 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = depth; // cache block size along the K direction
|
||||
Index mc = size; // cache block size along the M direction
|
||||
Index nc = size; // cache block size along the N direction
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
|
||||
Index kc = blocking.kc();
|
||||
Index mc = (std::min)(size,blocking.mc());
|
||||
|
||||
// !!! mc must be a multiple of nr:
|
||||
if(mc > Traits::nr)
|
||||
mc = (mc/Traits::nr)*Traits::nr;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*size;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
@ -136,7 +140,7 @@ struct tribb_kernel
|
||||
typedef typename Traits::ResScalar ResScalar;
|
||||
|
||||
enum {
|
||||
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
|
||||
BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
|
||||
};
|
||||
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
{
|
||||
@ -256,13 +260,27 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
|
||||
|
||||
typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
|
||||
|
||||
enum {
|
||||
IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
|
||||
LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
|
||||
RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0
|
||||
};
|
||||
|
||||
Index size = mat.cols();
|
||||
Index depth = actualLhs.cols();
|
||||
|
||||
typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
|
||||
MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType;
|
||||
|
||||
BlockingType blocking(size, size, depth, 1, false);
|
||||
|
||||
internal::general_matrix_matrix_triangular_product<Index,
|
||||
typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
|
||||
typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
|
||||
MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
|
||||
::run(mat.cols(), actualLhs.cols(),
|
||||
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
|
||||
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
|
||||
IsRowMajor ? RowMajor : ColMajor, UpLo>
|
||||
::run(size, depth,
|
||||
&actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
|
||||
mat.data(), mat.outerStride(), actualAlpha);
|
||||
mat.data(), mat.outerStride(), actualAlpha, blocking);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -291,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
|
||||
const Scalar* lhs, Index lhsStride,
|
||||
const Scalar* rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
const Scalar& alpha)
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
product_selfadjoint_matrix<Scalar, Index,
|
||||
EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
|
||||
@ -299,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
|
||||
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
|
||||
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
|
||||
ColMajor>
|
||||
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha);
|
||||
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
|
||||
}
|
||||
};
|
||||
|
||||
@ -314,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
const Scalar& alpha);
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index,
|
||||
@ -325,7 +325,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha)
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index size = rows;
|
||||
|
||||
@ -340,17 +340,14 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
Index nc = cols; // cache block size along the N direction
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
|
||||
// kc must smaller than mc
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
// kc must be smaller than mc
|
||||
kc = (std::min)(kc,mc);
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
|
||||
Scalar* blockB = allocatedBlockB;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
@ -410,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
const Scalar& alpha);
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
|
||||
};
|
||||
|
||||
template <typename Scalar, typename Index,
|
||||
@ -421,7 +418,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha)
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index size = cols;
|
||||
|
||||
@ -432,14 +429,12 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
ResMapper res(_res,resStride);
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
Index nc = cols; // cache block size along the N direction
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
|
||||
Scalar* blockB = allocatedBlockB;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
@ -498,6 +493,11 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
|
||||
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
|
||||
* RhsBlasTraits::extractScalarFactor(a_rhs);
|
||||
|
||||
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
|
||||
Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;
|
||||
|
||||
BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
|
||||
|
||||
internal::product_selfadjoint_matrix<Scalar, Index,
|
||||
EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
|
||||
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
|
||||
@ -509,7 +509,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
|
||||
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
|
||||
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
|
||||
&dst.coeffRef(0,0), dst.outerStride(), // result info
|
||||
actualAlpha // alpha
|
||||
actualAlpha, blocking // alpha
|
||||
);
|
||||
}
|
||||
};
|
||||
|
@ -92,15 +92,27 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
|
||||
|
||||
Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
|
||||
|
||||
enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
|
||||
enum {
|
||||
IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
|
||||
OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0
|
||||
};
|
||||
|
||||
Index size = mat.cols();
|
||||
Index depth = actualOther.cols();
|
||||
|
||||
typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar,
|
||||
MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType;
|
||||
|
||||
BlockingType blocking(size, size, depth, 1, false);
|
||||
|
||||
|
||||
internal::general_matrix_matrix_triangular_product<Index,
|
||||
Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
|
||||
Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
|
||||
MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
|
||||
::run(mat.cols(), actualOther.cols(),
|
||||
Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
|
||||
Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
|
||||
IsRowMajor ? RowMajor : ColMajor, UpLo>
|
||||
::run(size, depth,
|
||||
&actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
|
||||
mat.data(), mat.outerStride(), actualAlpha);
|
||||
mat.data(), mat.outerStride(), actualAlpha, blocking);
|
||||
}
|
||||
};
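Context sketch (illustrative): this selector sits behind the public rank-update API, which with this change reuses the same cache-blocking machinery as the general matrix product.

#include <Eigen/Dense>

void rank_update_demo()
{
  Eigen::MatrixXd c = Eigen::MatrixXd::Zero(4, 4);
  Eigen::MatrixXd u = Eigen::MatrixXd::Random(4, 2);
  // Lower triangle of c += 2 * u * u^T; only the Lower part is referenced and updated.
  c.selfadjointView<Eigen::Lower>().rankUpdate(u, 2.0);
}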
@ -126,6 +126,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
// The small panel size must not be larger than blocking size.
|
||||
// Usually this should never be the case because SmallPanelWidth^2 is very small
|
||||
// compared to L2 cache size, but let's be safe:
|
||||
Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc));
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
@ -169,9 +173,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
if(IsLower || actual_k2<rows)
|
||||
{
|
||||
// for each small vertical panels of lhs
|
||||
for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
|
||||
for (Index k1=0; k1<actual_kc; k1+=panelWidth)
|
||||
{
|
||||
Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
|
||||
Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth);
|
||||
Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
|
||||
Index startBlock = actual_k2+k1;
|
||||
Index blockBOffset = k1;
|
||||
|
@ -15,10 +15,11 @@
|
||||
// 4522 - 'class' : multiple assignment operators specified
|
||||
// 4700 - uninitialized local variable 'xyz' used
|
||||
// 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
|
||||
// 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
|
||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||
#pragma warning( push )
|
||||
#endif
|
||||
#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 )
|
||||
#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
|
||||
#elif defined __INTEL_COMPILER
|
||||
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
|
||||
// ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
|
||||
|
@ -336,7 +336,6 @@
|
||||
// Do we support r-value references?
|
||||
#if (__has_feature(cxx_rvalue_references) || \
|
||||
(defined(__cplusplus) && __cplusplus >= 201103L) || \
|
||||
defined(__GXX_EXPERIMENTAL_CXX0X__) || \
|
||||
(EIGEN_COMP_MSVC >= 1600))
|
||||
#define EIGEN_HAVE_RVALUE_REFERENCES
|
||||
#endif
|
||||
|
@ -526,9 +526,9 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align
|
||||
template<int Alignment, typename Scalar, typename Index>
|
||||
EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
|
||||
{
|
||||
static const Index ScalarSize = sizeof(Scalar);
|
||||
static const Index AlignmentSize = Alignment / ScalarSize;
|
||||
static const Index AlignmentMask = AlignmentSize-1;
|
||||
const Index ScalarSize = sizeof(Scalar);
|
||||
const Index AlignmentSize = Alignment / ScalarSize;
|
||||
const Index AlignmentMask = AlignmentSize-1;
|
||||
|
||||
if(AlignmentSize<=1)
|
||||
{
|
||||
|
@ -257,7 +257,7 @@ struct has_std_result_type {int a[2];};
|
||||
struct has_tr1_result {int a[3];};
|
||||
|
||||
template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
|
||||
struct unary_result_of_select {typedef ArgType type;};
|
||||
struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
|
||||
|
||||
template<typename Func, typename ArgType>
|
||||
struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
|
||||
@ -279,7 +279,7 @@ struct result_of<Func(ArgType)> {
|
||||
};
|
||||
|
||||
template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
|
||||
struct binary_result_of_select {typedef ArgType0 type;};
|
||||
struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
|
||||
|
||||
template<typename Func, typename ArgType0, typename ArgType1>
|
||||
struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
|
||||
@ -326,6 +326,22 @@ class meta_sqrt
|
||||
template<int Y, int InfX, int SupX>
|
||||
class meta_sqrt<Y, InfX, SupX, true> { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
|
||||
|
||||
|
||||
/** \internal Computes the least common multiple of two positive integers A and B
|
||||
* at compile-time. It implements a naive algorithm testing all multiples of A.
|
||||
* It thus works better if A>=B.
|
||||
*/
|
||||
template<int A, int B, int K=1, bool Done = ((A*K)%B)==0>
|
||||
struct meta_least_common_multiple
|
||||
{
|
||||
enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
|
||||
};
|
||||
template<int A, int B, int K>
|
||||
struct meta_least_common_multiple<A,B,K,true>
|
||||
{
|
||||
enum { ret = A*K };
|
||||
};
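Compile-time sanity check (illustrative): the recursion walks A*1, A*2, ... until a multiple of B is found, so it terminates fastest when A >= B; tribb_kernel above uses it to pick a BlockSize divisible by both mr and nr.

#include <Eigen/Core>

void meta_lcm_demo()
{
  EIGEN_STATIC_ASSERT((Eigen::internal::meta_least_common_multiple<8, 4>::ret == 8),
                      YOU_MADE_A_PROGRAMMING_MISTAKE);
  EIGEN_STATIC_ASSERT((Eigen::internal::meta_least_common_multiple<12, 8>::ret == 24),
                      YOU_MADE_A_PROGRAMMING_MISTAKE);
}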
/** \internal determines whether the product of two numeric types is allowed and what the return type is */
|
||||
template<typename T, typename U> struct scalar_product_traits
|
||||
{
|
||||
|
@ -26,7 +26,7 @@
|
||||
|
||||
#ifndef EIGEN_NO_STATIC_ASSERT
|
||||
|
||||
#if defined(__GXX_EXPERIMENTAL_CXX0X__) || (EIGEN_COMP_MSVC >= 1600)
|
||||
#if __has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)
|
||||
|
||||
// if native static_assert is enabled, let's use it
|
||||
#define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
|
||||
|
@ -466,17 +466,17 @@ struct special_scalar_op_base : public BaseType
|
||||
template<typename Derived,typename Scalar,typename OtherScalar, typename BaseType>
|
||||
struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true> : public BaseType
|
||||
{
|
||||
const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
|
||||
const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
|
||||
operator*(const OtherScalar& scalar) const
|
||||
{
|
||||
#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
|
||||
EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
|
||||
#endif
|
||||
return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
|
||||
return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
|
||||
(*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
|
||||
}
|
||||
|
||||
inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
|
||||
inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
|
||||
operator*(const OtherScalar& scalar, const Derived& matrix)
|
||||
{
|
||||
#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
|
||||
@ -485,13 +485,13 @@ struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true> : publi
|
||||
return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar);
|
||||
}
|
||||
|
||||
const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
|
||||
const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived>
|
||||
operator/(const OtherScalar& scalar) const
|
||||
{
|
||||
#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
|
||||
EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
|
||||
#endif
|
||||
return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
|
||||
return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived>
|
||||
(*static_cast<const Derived*>(this), scalar_quotient2_op<Scalar,OtherScalar>(scalar));
|
||||
}
|
||||
};
|
||||
@ -526,22 +526,21 @@ template <typename A> struct promote_storage_type<const A, A>
|
||||
* the functor.
|
||||
* The default rules are as follows:
|
||||
* \code
|
||||
* A op A -> A
|
||||
* A op dense -> dense
|
||||
* dense op B -> dense
|
||||
* A * dense -> A
|
||||
* dense * B -> B
|
||||
* A op A -> A
|
||||
* A op dense -> dense
|
||||
* dense op B -> dense
|
||||
* sparse op dense -> sparse
|
||||
* dense op sparse -> sparse
|
||||
* \endcode
|
||||
*/
|
||||
template <typename A, typename B, typename Functor> struct cwise_promote_storage_type;
|
||||
|
||||
template <typename A, typename Functor> struct cwise_promote_storage_type<A,A,Functor> { typedef A ret; };
|
||||
template <typename Functor> struct cwise_promote_storage_type<Dense,Dense,Functor> { typedef Dense ret; };
|
||||
template <typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<Dense,Dense,scalar_product_op<ScalarA,ScalarB> > { typedef Dense ret; };
|
||||
template <typename A, typename Functor> struct cwise_promote_storage_type<A,Dense,Functor> { typedef Dense ret; };
|
||||
template <typename B, typename Functor> struct cwise_promote_storage_type<Dense,B,Functor> { typedef Dense ret; };
|
||||
template <typename A, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<A,Dense,scalar_product_op<ScalarA,ScalarB> > { typedef A ret; };
|
||||
template <typename B, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<Dense,B,scalar_product_op<ScalarA,ScalarB> > { typedef B ret; };
|
||||
template <typename A, typename Functor> struct cwise_promote_storage_type<A,A,Functor> { typedef A ret; };
|
||||
template <typename Functor> struct cwise_promote_storage_type<Dense,Dense,Functor> { typedef Dense ret; };
|
||||
template <typename A, typename Functor> struct cwise_promote_storage_type<A,Dense,Functor> { typedef Dense ret; };
|
||||
template <typename B, typename Functor> struct cwise_promote_storage_type<Dense,B,Functor> { typedef Dense ret; };
|
||||
template <typename Functor> struct cwise_promote_storage_type<Sparse,Dense,Functor> { typedef Sparse ret; };
|
||||
template <typename Functor> struct cwise_promote_storage_type<Dense,Sparse,Functor> { typedef Sparse ret; };
|
||||
|
||||
/** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
|
||||
 * The template parameter ProductTag allows specializing the resulting storage kind with respect to
|
||||
|
@ -129,7 +129,7 @@ public:
|
||||
* determined by \a prec.
|
||||
*
|
||||
* \sa MatrixBase::isApprox() */
|
||||
bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
|
||||
bool isApprox(const ParametrizedLine& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
|
||||
{ return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
|
||||
|
||||
protected:
|
||||
|
@ -162,7 +162,7 @@ public:
|
||||
* determined by \a prec.
|
||||
*
|
||||
* \sa MatrixBase::isApprox() */
|
||||
bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
|
||||
bool isApprox(const Translation& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
|
||||
{ return m_coeffs.isApprox(other.m_coeffs, prec); }
|
||||
|
||||
};
|
||||
|
@ -37,6 +37,8 @@ namespace Eigen {
|
||||
 * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$, the factorization is directly performed
 * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta|) I \f$ where
 * \f$ \sigma \f$ is the initial shift value as returned and set by the setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$.
 * If the factorization fails, then the shift is doubled until it succeeds or a maximum of ten attempts is reached. If it still fails, as reported by
 * the info() method, then you can either increase the initial shift or, better, use another preconditioning technique.
 *
 */
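For context, a minimal sketch of how the class declared just below is typically used as a preconditioner (the matrix A and right-hand side b are placeholders, and the solver/header combination is assumed from Eigen's iterative-solver module):

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    // A: sparse symmetric positive (semi-)definite matrix, b: matching right-hand side
    Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower,
                             Eigen::IncompleteCholesky<double> > cg;
    cg.preconditioner().setInitialShift(1e-3);   // sigma; doubled internally on failure
    cg.compute(A);
    Eigen::VectorXd x = cg.solve(b);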
|
||||
template <typename Scalar, int _UpLo = Lower, typename _OrderingType =
|
||||
@ -185,6 +187,10 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
|
||||
inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol);
|
||||
};
|
||||
|
||||
// Based on the following paper:
|
||||
// C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
|
||||
// Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999
|
||||
// http://ftp.mcs.anl.gov/pub/tech_reports/reports/P682.pdf
|
||||
template<typename Scalar, int _UpLo, typename OrderingType>
|
||||
template<typename _MatrixType>
|
||||
void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
|
||||
@ -240,7 +246,7 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
|
||||
else
|
||||
m_scale(j) = 1;
|
||||
|
||||
// FIXME disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
|
||||
// TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
|
||||
|
||||
// Scale and compute the shift for the matrix
|
||||
RealScalar mindiag = NumTraits<RealScalar>::highest();
|
||||
@ -251,96 +257,122 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
|
||||
eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored");
|
||||
mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag);
|
||||
}
|
||||
|
||||
FactorType L_save = m_L;
|
||||
|
||||
RealScalar shift = 0;
|
||||
if(mindiag <= RealScalar(0.))
|
||||
shift = m_initialShift - mindiag;
|
||||
|
||||
// Apply the shift to the diagonal elements of the matrix
|
||||
for (Index j = 0; j < n; j++)
|
||||
vals[colPtr[j]] += shift;
|
||||
|
||||
// jki version of the Cholesky factorization
|
||||
for (Index j=0; j < n; ++j)
|
||||
{
|
||||
// Left-looking factorization of the j-th column
|
||||
// First, load the j-th column into col_vals
|
||||
Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored
|
||||
col_nnz = 0;
|
||||
for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++)
|
||||
m_info = NumericalIssue;
|
||||
|
||||
// Try to perform the incomplete factorization using the current shift
|
||||
int iter = 0;
|
||||
do
|
||||
{
|
||||
// Apply the shift to the diagonal elements of the matrix
|
||||
for (Index j = 0; j < n; j++)
|
||||
vals[colPtr[j]] += shift;
|
||||
|
||||
// jki version of the Cholesky factorization
|
||||
Index j=0;
|
||||
for (; j < n; ++j)
|
||||
{
|
||||
StorageIndex l = rowIdx[i];
|
||||
col_vals(col_nnz) = vals[i];
|
||||
col_irow(col_nnz) = l;
|
||||
col_pattern(l) = col_nnz;
|
||||
col_nnz++;
|
||||
}
|
||||
{
|
||||
typename std::list<StorageIndex>::iterator k;
|
||||
// Browse all previous columns that will update column j
|
||||
for(k = listCol[j].begin(); k != listCol[j].end(); k++)
|
||||
// Left-looking factorization of the j-th column
|
||||
// First, load the j-th column into col_vals
|
||||
Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored
|
||||
col_nnz = 0;
|
||||
for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++)
|
||||
{
|
||||
Index jk = firstElt(*k); // First element to use in the column
|
||||
eigen_internal_assert(rowIdx[jk]==j);
|
||||
Scalar v_j_jk = numext::conj(vals[jk]);
|
||||
|
||||
jk += 1;
|
||||
for (Index i = jk; i < colPtr[*k+1]; i++)
|
||||
{
|
||||
StorageIndex l = rowIdx[i];
|
||||
if(col_pattern[l]<0)
|
||||
{
|
||||
col_vals(col_nnz) = vals[i] * v_j_jk;
|
||||
col_irow[col_nnz] = l;
|
||||
col_pattern(l) = col_nnz;
|
||||
col_nnz++;
|
||||
}
|
||||
else
|
||||
col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
|
||||
}
|
||||
updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
|
||||
StorageIndex l = rowIdx[i];
|
||||
col_vals(col_nnz) = vals[i];
|
||||
col_irow(col_nnz) = l;
|
||||
col_pattern(l) = col_nnz;
|
||||
col_nnz++;
|
||||
}
|
||||
{
|
||||
typename std::list<StorageIndex>::iterator k;
|
||||
// Browse all previous columns that will update column j
|
||||
for(k = listCol[j].begin(); k != listCol[j].end(); k++)
|
||||
{
|
||||
Index jk = firstElt(*k); // First element to use in the column
|
||||
eigen_internal_assert(rowIdx[jk]==j);
|
||||
Scalar v_j_jk = numext::conj(vals[jk]);
|
||||
|
||||
jk += 1;
|
||||
for (Index i = jk; i < colPtr[*k+1]; i++)
|
||||
{
|
||||
StorageIndex l = rowIdx[i];
|
||||
if(col_pattern[l]<0)
|
||||
{
|
||||
col_vals(col_nnz) = vals[i] * v_j_jk;
|
||||
col_irow[col_nnz] = l;
|
||||
col_pattern(l) = col_nnz;
|
||||
col_nnz++;
|
||||
}
|
||||
else
|
||||
col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
|
||||
}
|
||||
updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
|
||||
}
|
||||
}
|
||||
|
||||
// Scale the current column
|
||||
if(numext::real(diag) <= 0)
|
||||
{
|
||||
if(++iter>=10)
|
||||
return;
|
||||
|
||||
// increase shift
|
||||
shift = numext::maxi(m_initialShift,RealScalar(2)*shift);
|
||||
// restore m_L, col_pattern, and listCol
|
||||
vals = Map<const VectorSx>(L_save.valuePtr(), nnz);
|
||||
rowIdx = Map<const VectorIx>(L_save.innerIndexPtr(), nnz);
|
||||
colPtr = Map<const VectorIx>(L_save.outerIndexPtr(), n+1);
|
||||
col_pattern.fill(-1);
|
||||
for(Index i=0; i<n; ++i)
|
||||
listCol[i].clear();
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
RealScalar rdiag = sqrt(numext::real(diag));
|
||||
vals[colPtr[j]] = rdiag;
|
||||
for (Index k = 0; k<col_nnz; ++k)
|
||||
{
|
||||
Index i = col_irow[k];
|
||||
//Scale
|
||||
col_vals(k) /= rdiag;
|
||||
//Update the remaining diagonals with col_vals
|
||||
vals[colPtr[i]] -= numext::abs2(col_vals(k));
|
||||
}
|
||||
// Select the largest p elements
|
||||
// p is the original number of elements in the column (without the diagonal)
|
||||
Index p = colPtr[j+1] - colPtr[j] - 1 ;
|
||||
Ref<VectorSx> cvals = col_vals.head(col_nnz);
|
||||
Ref<VectorIx> cirow = col_irow.head(col_nnz);
|
||||
internal::QuickSplit(cvals,cirow, p);
|
||||
// Insert the largest p elements in the matrix
|
||||
Index cpt = 0;
|
||||
for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++)
|
||||
{
|
||||
vals[i] = col_vals(cpt);
|
||||
rowIdx[i] = col_irow(cpt);
|
||||
// restore col_pattern:
|
||||
col_pattern(col_irow(cpt)) = -1;
|
||||
cpt++;
|
||||
}
|
||||
// Get the first smallest row index and put it after the diagonal element
|
||||
Index jk = colPtr(j)+1;
|
||||
updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol);
|
||||
}
|
||||
|
||||
// Scale the current column
|
||||
if(numext::real(diag) <= 0)
|
||||
|
||||
if(j==n)
|
||||
{
|
||||
m_info = NumericalIssue;
|
||||
return;
|
||||
m_factorizationIsOk = true;
|
||||
m_info = Success;
|
||||
}
|
||||
|
||||
RealScalar rdiag = sqrt(numext::real(diag));
|
||||
vals[colPtr[j]] = rdiag;
|
||||
for (Index k = 0; k<col_nnz; ++k)
|
||||
{
|
||||
Index i = col_irow[k];
|
||||
//Scale
|
||||
col_vals(k) /= rdiag;
|
||||
//Update the remaining diagonals with col_vals
|
||||
vals[colPtr[i]] -= numext::abs2(col_vals(k));
|
||||
}
|
||||
// Select the largest p elements
|
||||
// p is the original number of elements in the column (without the diagonal)
|
||||
Index p = colPtr[j+1] - colPtr[j] - 1 ;
|
||||
Ref<VectorSx> cvals = col_vals.head(col_nnz);
|
||||
Ref<VectorIx> cirow = col_irow.head(col_nnz);
|
||||
internal::QuickSplit(cvals,cirow, p);
|
||||
// Insert the largest p elements in the matrix
|
||||
Index cpt = 0;
|
||||
for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++)
|
||||
{
|
||||
vals[i] = col_vals(cpt);
|
||||
rowIdx[i] = col_irow(cpt);
|
||||
// restore col_pattern:
|
||||
col_pattern(col_irow(cpt)) = -1;
|
||||
cpt++;
|
||||
}
|
||||
// Get the first smallest row index and put it after the diagonal element
|
||||
Index jk = colPtr(j)+1;
|
||||
updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol);
|
||||
}
|
||||
m_factorizationIsOk = true;
|
||||
m_info = Success;
|
||||
} while(m_info!=Success);
|
||||
}
|
||||
|
||||
template<typename Scalar, int _UpLo, typename OrderingType>
|
||||
|
@ -8,7 +8,7 @@
|
||||
NOTE: this routine has been adapted from the CSparse library:
|
||||
|
||||
Copyright (c) 2006, Timothy A. Davis.
|
||||
http://www.cise.ufl.edu/research/sparse/CSparse
|
||||
http://www.suitesparse.com
|
||||
|
||||
CSparse is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
|
@ -41,12 +41,8 @@
|
||||
//
|
||||
// The colamd/symamd library is available at
|
||||
//
|
||||
// http://www.cise.ufl.edu/research/sparse/colamd/
|
||||
// http://www.suitesparse.com
|
||||
|
||||
// This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
|
||||
// file. It is required by the colamd.c, colamdmex.c, and symamdmex.c
|
||||
// files, and by any C code that calls the routines whose prototypes are
|
||||
// listed below, or that uses the colamd/symamd definitions listed below.
|
||||
|
||||
#ifndef EIGEN_COLAMD_H
|
||||
#define EIGEN_COLAMD_H
|
||||
@ -102,9 +98,6 @@ namespace internal {
|
||||
/* === Definitions ========================================================== */
|
||||
/* ========================================================================== */
|
||||
|
||||
#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b))
|
||||
#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
#define ONES_COMPLEMENT(r) (-(r)-1)
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
@ -739,8 +732,8 @@ static void init_scoring
|
||||
|
||||
/* === Extract knobs ==================================================== */
|
||||
|
||||
dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
|
||||
dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
|
||||
dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ;
|
||||
dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ;
|
||||
COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
|
||||
max_deg = 0 ;
|
||||
n_col2 = n_col ;
|
||||
@ -804,7 +797,7 @@ static void init_scoring
|
||||
else
|
||||
{
|
||||
/* keep track of max degree of remaining rows */
|
||||
max_deg = COLAMD_MAX (max_deg, deg) ;
|
||||
max_deg = numext::maxi(max_deg, deg) ;
|
||||
}
|
||||
}
|
||||
COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
|
||||
@ -842,7 +835,7 @@ static void init_scoring
|
||||
/* add row's external degree */
|
||||
score += Row [row].shared1.degree - 1 ;
|
||||
/* guard against integer overflow */
|
||||
score = COLAMD_MIN (score, n_col) ;
|
||||
score = numext::mini(score, n_col) ;
|
||||
}
|
||||
/* determine pruned column length */
|
||||
col_length = (IndexType) (new_cp - &A [Col [c].start]) ;
|
||||
@ -914,7 +907,7 @@ static void init_scoring
|
||||
head [score] = c ;
|
||||
|
||||
/* see if this score is less than current min */
|
||||
min_score = COLAMD_MIN (min_score, score) ;
|
||||
min_score = numext::mini(min_score, score) ;
|
||||
|
||||
|
||||
}
|
||||
@ -1040,7 +1033,7 @@ static IndexType find_ordering /* return the number of garbage collections */
|
||||
|
||||
/* === Garbage_collection, if necessary ============================= */
|
||||
|
||||
needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ;
|
||||
needed_memory = numext::mini(pivot_col_score, n_col - k) ;
|
||||
if (pfree + needed_memory >= Alen)
|
||||
{
|
||||
pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
|
||||
@ -1099,7 +1092,7 @@ static IndexType find_ordering /* return the number of garbage collections */
|
||||
|
||||
/* clear tag on pivot column */
|
||||
Col [pivot_col].shared1.thickness = pivot_col_thickness ;
|
||||
max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ;
|
||||
max_deg = numext::maxi(max_deg, pivot_row_degree) ;
|
||||
|
||||
|
||||
/* === Kill all rows used to construct pivot row ==================== */
|
||||
@ -1273,7 +1266,7 @@ static IndexType find_ordering /* return the number of garbage collections */
|
||||
/* add set difference */
|
||||
cur_score += row_mark - tag_mark ;
|
||||
/* integer overflow... */
|
||||
cur_score = COLAMD_MIN (cur_score, n_col) ;
|
||||
cur_score = numext::mini(cur_score, n_col) ;
|
||||
}
|
||||
|
||||
/* recompute the column's length */
|
||||
@ -1386,7 +1379,7 @@ static IndexType find_ordering /* return the number of garbage collections */
|
||||
cur_score -= Col [col].shared1.thickness ;
|
||||
|
||||
/* make sure score is less or equal than the max score */
|
||||
cur_score = COLAMD_MIN (cur_score, max_score) ;
|
||||
cur_score = numext::mini(cur_score, max_score) ;
|
||||
COLAMD_ASSERT (cur_score >= 0) ;
|
||||
|
||||
/* store updated score */
|
||||
@ -1409,7 +1402,7 @@ static IndexType find_ordering /* return the number of garbage collections */
|
||||
head [cur_score] = col ;
|
||||
|
||||
/* see if this score is less than current min */
|
||||
min_score = COLAMD_MIN (min_score, cur_score) ;
|
||||
min_score = numext::mini(min_score, cur_score) ;
|
||||
|
||||
}
|
||||
|
||||
|
@ -100,11 +100,11 @@ protected:
|
||||
enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
|
||||
public:
|
||||
|
||||
inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index i)
|
||||
inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index i)
|
||||
: m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
|
||||
{}
|
||||
|
||||
inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
|
||||
inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
|
||||
: m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
|
||||
{}
|
||||
|
||||
@ -112,7 +112,7 @@ public:
|
||||
inline BlockType& operator=(const SparseMatrixBase<OtherDerived>& other)
|
||||
{
|
||||
typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _NestedMatrixType;
|
||||
_NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);;
|
||||
_NestedMatrixType& matrix = m_matrix;
|
||||
// This assignment is slow if this vector set is not empty
|
||||
// and/or it is not at the end of the nonzeros of the underlying matrix.
|
||||
|
||||
@ -209,28 +209,28 @@ public:
|
||||
inline const Scalar* valuePtr() const
|
||||
{ return m_matrix.valuePtr(); }
|
||||
inline Scalar* valuePtr()
|
||||
{ return m_matrix.const_cast_derived().valuePtr(); }
|
||||
{ return m_matrix.valuePtr(); }
|
||||
|
||||
inline const StorageIndex* innerIndexPtr() const
|
||||
{ return m_matrix.innerIndexPtr(); }
|
||||
inline StorageIndex* innerIndexPtr()
|
||||
{ return m_matrix.const_cast_derived().innerIndexPtr(); }
|
||||
{ return m_matrix.innerIndexPtr(); }
|
||||
|
||||
inline const StorageIndex* outerIndexPtr() const
|
||||
{ return m_matrix.outerIndexPtr() + m_outerStart; }
|
||||
inline StorageIndex* outerIndexPtr()
|
||||
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
|
||||
{ return m_matrix.outerIndexPtr() + m_outerStart; }
|
||||
|
||||
inline const StorageIndex* innerNonZeroPtr() const
|
||||
{ return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
|
||||
inline StorageIndex* innerNonZeroPtr()
|
||||
{ return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); }
|
||||
{ return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
|
||||
|
||||
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
|
||||
|
||||
inline Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart));
|
||||
return m_matrix.coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart));
|
||||
}
|
||||
|
||||
inline const Scalar coeff(Index row, Index col) const
|
||||
@ -264,7 +264,7 @@ public:
|
||||
|
||||
protected:
|
||||
|
||||
typename SparseMatrixType::Nested m_matrix;
|
||||
typename internal::ref_selector<SparseMatrixType>::non_const_type m_matrix;
|
||||
Index m_outerStart;
|
||||
const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
|
||||
|
||||
@ -373,7 +373,7 @@ public:
|
||||
|
||||
/** Column or Row constructor
|
||||
*/
|
||||
inline BlockImpl(const XprType& xpr, Index i)
|
||||
inline BlockImpl(XprType& xpr, Index i)
|
||||
: m_matrix(xpr),
|
||||
m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? convert_index(i) : 0),
|
||||
m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? convert_index(i) : 0),
|
||||
@ -383,7 +383,7 @@ public:
|
||||
|
||||
/** Dynamic-size constructor
|
||||
*/
|
||||
inline BlockImpl(const XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
|
||||
inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
|
||||
: m_matrix(xpr), m_startRow(convert_index(startRow)), m_startCol(convert_index(startCol)), m_blockRows(convert_index(blockRows)), m_blockCols(convert_index(blockCols))
|
||||
{}
|
||||
|
||||
@ -392,8 +392,7 @@ public:
|
||||
|
||||
inline Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
return m_matrix.const_cast_derived()
|
||||
.coeffRef(row + m_startRow.value(), col + m_startCol.value());
|
||||
return m_matrix.coeffRef(row + m_startRow.value(), col + m_startCol.value());
|
||||
}
|
||||
|
||||
inline const Scalar coeff(Index row, Index col) const
|
||||
@ -403,16 +402,14 @@ public:
|
||||
|
||||
inline Scalar& coeffRef(Index index)
|
||||
{
|
||||
return m_matrix.const_cast_derived()
|
||||
.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
return m_matrix.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
}
|
||||
|
||||
inline const Scalar coeff(Index index) const
|
||||
{
|
||||
return m_matrix
|
||||
.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
|
||||
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
|
||||
}
|
||||
|
||||
inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
|
||||
@ -430,7 +427,7 @@ public:
|
||||
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
|
||||
|
||||
typename XprType::Nested m_matrix;
|
||||
typename internal::ref_selector<XprType>::non_const_type m_matrix;
|
||||
const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
|
||||
const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
|
||||
const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
|
||||
|
@ -117,6 +117,24 @@ template<typename Derived>
|
||||
class SparseCompressedBase<Derived>::InnerIterator
|
||||
{
|
||||
public:
|
||||
InnerIterator()
|
||||
: m_values(0), m_indices(0), m_outer(0), m_id(0), m_end(0)
|
||||
{}
|
||||
|
||||
InnerIterator(const InnerIterator& other)
|
||||
: m_values(other.m_values), m_indices(other.m_indices), m_outer(other.m_outer), m_id(other.m_id), m_end(other.m_end)
|
||||
{}
|
||||
|
||||
InnerIterator& operator=(const InnerIterator& other)
|
||||
{
|
||||
m_values = other.m_values;
|
||||
m_indices = other.m_indices;
|
||||
const_cast<OuterType&>(m_outer).setValue(other.m_outer.value());
|
||||
m_id = other.m_id;
|
||||
m_end = other.m_end;
|
||||
return *this;
|
||||
}
|
||||
|
||||
InnerIterator(const SparseCompressedBase& mat, Index outer)
|
||||
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
|
||||
{
|
||||
@ -162,7 +180,8 @@ class SparseCompressedBase<Derived>::InnerIterator
|
||||
protected:
|
||||
const Scalar* m_values;
|
||||
const StorageIndex* m_indices;
|
||||
const internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> m_outer;
|
||||
typedef internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> OuterType;
|
||||
const OuterType m_outer;
|
||||
Index m_id;
|
||||
Index m_end;
|
||||
private:
|
||||
|
@ -49,17 +49,10 @@ class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
|
||||
typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
|
||||
typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
|
||||
class sparse_cwise_binary_op_inner_iterator_selector;
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
// Generic "sparse OP sparse"
|
||||
template<typename XprType> struct binary_sparse_evaluator;
|
||||
|
||||
template<typename BinaryOp, typename Lhs, typename Rhs>
|
||||
struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IteratorBased>
|
||||
: evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
|
||||
@ -153,6 +146,182 @@ protected:
|
||||
evaluator<Rhs> m_rhsImpl;
|
||||
};
|
||||
|
||||
// dense op sparse
|
||||
template<typename BinaryOp, typename Lhs, typename Rhs>
|
||||
struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IteratorBased>
|
||||
: evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
|
||||
{
|
||||
protected:
|
||||
typedef typename evaluator<Rhs>::InnerIterator RhsIterator;
|
||||
typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
|
||||
typedef typename traits<XprType>::Scalar Scalar;
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
public:
|
||||
|
||||
class ReverseInnerIterator;
|
||||
class InnerIterator
|
||||
{
|
||||
enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
|
||||
public:
|
||||
|
||||
EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
|
||||
: m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize())
|
||||
{
|
||||
this->operator++();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE InnerIterator& operator++()
|
||||
{
|
||||
++m_id;
|
||||
if(m_id<m_innerSize)
|
||||
{
|
||||
Scalar lhsVal = m_lhsEval.coeff(IsRowMajor?m_rhsIter.outer():m_id,
|
||||
IsRowMajor?m_id:m_rhsIter.outer());
|
||||
if(m_rhsIter && m_rhsIter.index()==m_id)
|
||||
{
|
||||
m_value = m_functor(lhsVal, m_rhsIter.value());
|
||||
++m_rhsIter;
|
||||
}
|
||||
else
|
||||
m_value = m_functor(lhsVal, Scalar(0));
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
|
||||
|
||||
EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
|
||||
EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
|
||||
EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_rhsIter.outer(); }
|
||||
|
||||
EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
|
||||
|
||||
protected:
|
||||
const evaluator<Lhs> &m_lhsEval;
|
||||
RhsIterator m_rhsIter;
|
||||
const BinaryOp& m_functor;
|
||||
Scalar m_value;
|
||||
StorageIndex m_id;
|
||||
StorageIndex m_innerSize;
|
||||
};
|
||||
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
|
||||
// Expose storage order of the sparse expression
|
||||
Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
|
||||
};
|
||||
|
||||
explicit binary_evaluator(const XprType& xpr)
|
||||
: m_functor(xpr.functor()),
|
||||
m_lhsImpl(xpr.lhs()),
|
||||
m_rhsImpl(xpr.rhs()),
|
||||
m_expr(xpr)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
inline Index nonZerosEstimate() const {
|
||||
return m_expr.size();
|
||||
}
|
||||
|
||||
protected:
|
||||
const BinaryOp m_functor;
|
||||
evaluator<Lhs> m_lhsImpl;
|
||||
evaluator<Rhs> m_rhsImpl;
|
||||
const XprType &m_expr;
|
||||
};
|
||||
|
||||
// sparse op dense
|
||||
template<typename BinaryOp, typename Lhs, typename Rhs>
|
||||
struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IndexBased>
|
||||
: evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
|
||||
{
|
||||
protected:
|
||||
typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
|
||||
typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
|
||||
typedef typename traits<XprType>::Scalar Scalar;
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
public:
|
||||
|
||||
class ReverseInnerIterator;
|
||||
class InnerIterator
|
||||
{
|
||||
enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
|
||||
public:
|
||||
|
||||
EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
|
||||
: m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize())
|
||||
{
|
||||
this->operator++();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE InnerIterator& operator++()
|
||||
{
|
||||
++m_id;
|
||||
if(m_id<m_innerSize)
|
||||
{
|
||||
Scalar rhsVal = m_rhsEval.coeff(IsRowMajor?m_lhsIter.outer():m_id,
|
||||
IsRowMajor?m_id:m_lhsIter.outer());
|
||||
if(m_lhsIter && m_lhsIter.index()==m_id)
|
||||
{
|
||||
m_value = m_functor(m_lhsIter.value(), rhsVal);
|
||||
++m_lhsIter;
|
||||
}
|
||||
else
|
||||
m_value = m_functor(Scalar(0),rhsVal);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
|
||||
|
||||
EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
|
||||
EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
|
||||
EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_lhsIter.outer(); }
|
||||
|
||||
EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
|
||||
|
||||
protected:
|
||||
LhsIterator m_lhsIter;
|
||||
const evaluator<Rhs> &m_rhsEval;
|
||||
const BinaryOp& m_functor;
|
||||
Scalar m_value;
|
||||
StorageIndex m_id;
|
||||
StorageIndex m_innerSize;
|
||||
};
|
||||
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
|
||||
// Expose storage order of the sparse expression
|
||||
Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
|
||||
};
|
||||
|
||||
explicit binary_evaluator(const XprType& xpr)
|
||||
: m_functor(xpr.functor()),
|
||||
m_lhsImpl(xpr.lhs()),
|
||||
m_rhsImpl(xpr.rhs()),
|
||||
m_expr(xpr)
|
||||
{
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
|
||||
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
|
||||
}
|
||||
|
||||
inline Index nonZerosEstimate() const {
|
||||
return m_expr.size();
|
||||
}
|
||||
|
||||
protected:
|
||||
const BinaryOp m_functor;
|
||||
evaluator<Lhs> m_lhsImpl;
|
||||
evaluator<Rhs> m_rhsImpl;
|
||||
const XprType &m_expr;
|
||||
};
|
||||
|
||||
// "sparse .* sparse"
|
||||
template<typename T, typename Lhs, typename Rhs>
|
||||
struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IteratorBased, IteratorBased>
|
||||
@ -287,7 +456,8 @@ public:
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
|
||||
Flags = XprType::Flags
|
||||
// Expose storage order of the sparse expression
|
||||
Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
|
||||
};
|
||||
|
||||
explicit binary_evaluator(const XprType& xpr)
|
||||
@ -360,7 +530,8 @@ public:
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
|
||||
Flags = XprType::Flags
|
||||
// Expose storage order of the sparse expression
|
||||
Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
|
||||
};
|
||||
|
||||
explicit binary_evaluator(const XprType& xpr)
|
||||
@ -428,6 +599,34 @@ SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) c
|
||||
return typename CwiseProductDenseReturnType<OtherDerived>::Type(derived(), other.derived());
|
||||
}
|
||||
|
||||
template<typename DenseDerived, typename SparseDerived>
|
||||
EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>
|
||||
operator+(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
|
||||
{
|
||||
return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
|
||||
}
|
||||
|
||||
template<typename SparseDerived, typename DenseDerived>
|
||||
EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
|
||||
operator+(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
|
||||
{
|
||||
return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
|
||||
}
|
||||
|
||||
template<typename DenseDerived, typename SparseDerived>
|
||||
EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>
|
||||
operator-(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
|
||||
{
|
||||
return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
|
||||
}
|
||||
|
||||
template<typename SparseDerived, typename DenseDerived>
|
||||
EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
|
||||
operator-(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
|
||||
{
|
||||
return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
|
||||
}
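These overloads make mixed dense/sparse sums and differences available directly; a minimal usage sketch (matrix names, sizes, and contents are illustrative):

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
    Eigen::SparseMatrix<double> B(100, 100);
    B.setIdentity();
    Eigen::MatrixXd C = A + B;   // dense + sparse
    Eigen::MatrixXd D = B - A;   // sparse - dense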
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_SPARSE_CWISE_BINARY_OP_H
|
||||
|
@ -48,7 +48,7 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
|
||||
// It represents the minimal amount of work needed for the parallelization to pay off.
|
||||
if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
|
||||
{
|
||||
#pragma omp parallel for schedule(static) num_threads(threads)
|
||||
#pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
|
||||
for(Index i=0; i<n; ++i)
|
||||
processRow(lhsEval,rhs,res,alpha,i,c);
|
||||
}
|
||||
|
@ -538,7 +538,12 @@ class SparseMatrix
|
||||
}
|
||||
|
||||
/** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
|
||||
* \sa reserve(), setZero()
|
||||
*
|
||||
 * If the sizes of the matrix are decreased, then the matrix is turned to \b uncompressed-mode
 * and the storage of the out-of-bounds coefficients is kept and reserved.
 * Call makeCompressed() to pack the entries and squeeze extra memory.
|
||||
*
|
||||
* \sa reserve(), setZero(), makeCompressed()
|
||||
*/
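A minimal sketch of the behavior documented above for the method declared just below (sizes and entries are illustrative):

    Eigen::SparseMatrix<double> M(10, 10);
    M.insert(2, 3) = 1.0;
    M.insert(8, 9) = 2.0;          // falls out of bounds after shrinking
    M.conservativeResize(5, 5);    // in-bounds values kept; matrix becomes uncompressed
    M.makeCompressed();            // packs entries and releases the extra reserved memory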
|
||||
void conservativeResize(Index rows, Index cols)
|
||||
{
|
||||
|
@ -55,10 +55,10 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
typedef Matrix<StorageIndex,Dynamic,1> VectorI;
|
||||
typedef typename MatrixType::Nested MatrixTypeNested;
|
||||
typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
|
||||
typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
|
||||
|
||||
explicit inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
|
||||
explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
|
||||
{
|
||||
eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
|
||||
}
|
||||
@ -68,7 +68,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
|
||||
|
||||
/** \internal \returns a reference to the nested matrix */
|
||||
const _MatrixTypeNested& matrix() const { return m_matrix; }
|
||||
_MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); }
|
||||
typename internal::remove_reference<MatrixTypeNested>::type& matrix() { return m_matrix; }
|
||||
|
||||
/** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs.
|
||||
*
|
||||
@ -158,7 +158,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
|
||||
|
||||
protected:
|
||||
|
||||
typename MatrixType::Nested m_matrix;
|
||||
MatrixTypeNested m_matrix;
|
||||
//mutable VectorI m_countPerRow;
|
||||
//mutable VectorI m_countPerCol;
|
||||
private:
|
||||
@ -194,9 +194,9 @@ SparseSelfAdjointView<MatrixType,Mode>::rankUpdate(const SparseMatrixBase<Derive
|
||||
{
|
||||
SparseMatrix<Scalar,(MatrixType::Flags&RowMajorBit)?RowMajor:ColMajor> tmp = u * u.adjoint();
|
||||
if(alpha==Scalar(0))
|
||||
m_matrix.const_cast_derived() = tmp.template triangularView<Mode>();
|
||||
m_matrix = tmp.template triangularView<Mode>();
|
||||
else
|
||||
m_matrix.const_cast_derived() += alpha * tmp.template triangularView<Mode>();
|
||||
m_matrix += alpha * tmp.template triangularView<Mode>();
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
@ -205,23 +205,54 @@ class SparseVector
|
||||
|
||||
inline void finalize() {}
|
||||
|
||||
/** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */
|
||||
void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
|
||||
{
|
||||
m_data.prune(reference,epsilon);
|
||||
}
|
||||
|
||||
/** Resizes the sparse vector to \a rows x \a cols
|
||||
*
|
||||
* This method is provided for compatibility with matrices.
|
||||
* For a column vector, \a cols must be equal to 1.
|
||||
* For a row vector, \a rows must be equal to 1.
|
||||
*
|
||||
* \sa resize(Index)
|
||||
*/
|
||||
void resize(Index rows, Index cols)
|
||||
{
|
||||
eigen_assert((IsColVector ? cols : rows)==1 && "Outer dimension must equal 1");
|
||||
resize(IsColVector ? rows : cols);
|
||||
}
|
||||
|
||||
/** Resizes the sparse vector to \a newSize.
  * This method deletes all entries, thus leaving an empty sparse vector.
|
||||
*
|
||||
* \sa conservativeResize(), setZero() */
|
||||
void resize(Index newSize)
|
||||
{
|
||||
m_size = newSize;
|
||||
m_data.clear();
|
||||
}
|
||||
|
||||
/** Resizes the sparse vector to \a newSize, while leaving old values untouched.
|
||||
*
|
||||
 * If the size of the vector is decreased, then the storage of the out-of-bounds coefficients is kept and reserved.
|
||||
* Call .data().squeeze() to free extra memory.
|
||||
*
|
||||
* \sa reserve(), setZero()
|
||||
*/
|
||||
void conservativeResize(Index newSize)
|
||||
{
|
||||
if (newSize < m_size)
|
||||
{
|
||||
Index i = 0;
|
||||
while (i<m_data.size() && m_data.index(i)<newSize) ++i;
|
||||
m_data.resize(i);
|
||||
}
|
||||
m_size = newSize;
|
||||
}
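A corresponding sketch for the vector case (values are illustrative):

    Eigen::SparseVector<double> v(10);
    v.insert(1) = 1.0;
    v.insert(7) = 2.0;             // trimmed below, since 7 >= 5
    v.conservativeResize(5);       // keeps entries with index < 5, memory stays reserved
    v.data().squeeze();            // optionally free the extra memory, as noted above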
|
||||
|
||||
void resizeNonZeros(Index size) { m_data.resize(size); }
|
||||
|
||||
inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); }
|
||||
|
@ -38,7 +38,7 @@ public:
|
||||
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
|
||||
|
||||
explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0),
|
||||
RealScalar epsilon = NumTraits<Scalar>::dummy_precision())
|
||||
const RealScalar &epsilon = NumTraits<Scalar>::dummy_precision())
|
||||
: m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {}
|
||||
|
||||
inline Index rows() const { return m_matrix.rows(); }
|
||||
|
@ -128,6 +128,17 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
|
||||
inline Index cols() const { return m_pmat.cols();}
|
||||
|
||||
/** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
|
||||
 * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms
 * expecting sorted entries will fail. This includes random coefficient accesses (SparseMatrix::coeff())
 * and coefficient-wise operations. Matrix products and triangular solves are fine, though.
|
||||
*
|
||||
* To sort the entries, you can assign it to a row-major matrix, and if a column-major matrix
|
||||
* is required, you can copy it again:
|
||||
* \code
|
||||
* SparseMatrix<double> R = qr.matrixR(); // column-major, not sorted!
|
||||
* SparseMatrix<double,RowMajor> Rr = qr.matrixR(); // row-major, sorted
|
||||
* SparseMatrix<double> Rc = Rr; // column-major, sorted
|
||||
* \endcode
|
||||
*/
|
||||
const QRMatrixType& matrixR() const { return m_R; }
|
||||
|
||||
|
@ -22,6 +22,7 @@ typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturn
|
||||
typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
|
||||
typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
|
||||
typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
|
||||
typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
|
||||
typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
|
||||
typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
|
||||
typedef CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> PowReturnType;
|
||||
@ -318,6 +319,16 @@ lgamma() const
|
||||
return LgammaReturnType(derived());
|
||||
}
|
||||
|
||||
/** \returns an expression of the coefficient-wise digamma function (psi, the derivative of lgamma) of *this.
|
||||
*
|
||||
* \sa cos(), sin(), tan()
|
||||
*/
|
||||
inline const DigammaReturnType
|
||||
digamma() const
|
||||
{
|
||||
return DigammaReturnType(derived());
|
||||
}
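A small usage sketch of the new entry point (array contents are illustrative, and the double specialization of the underlying functor is assumed to be available):

    Eigen::ArrayXd v = Eigen::ArrayXd::LinSpaced(4, 1.0, 4.0);
    Eigen::ArrayXd d = v.digamma();   // psi evaluated coefficient-wise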
|
||||
|
||||
/** \returns an expression of the coefficient-wise Gauss error
|
||||
* function of *this.
|
||||
*
|
||||
|
8
bench/tensors/README
Normal file
@ -0,0 +1,8 @@
|
||||
Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU.
|
||||
|
||||
To compile the CPU benchmarks, simply call:
|
||||
g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
|
||||
|
||||
To compile the GPU benchmarks, simply call:
|
||||
nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu
|
||||
|
49
bench/tensors/benchmark.h
Normal file
@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (C) 2012 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
namespace testing {
|
||||
class Benchmark {
|
||||
public:
|
||||
Benchmark(const char* name, void (*fn)(int)) {
|
||||
Register(name, fn, NULL);
|
||||
}
|
||||
Benchmark(const char* name, void (*fn_range)(int, int)) {
|
||||
Register(name, NULL, fn_range);
|
||||
}
|
||||
Benchmark* Arg(int x);
|
||||
Benchmark* Range(int lo, int hi);
|
||||
const char* Name();
|
||||
bool ShouldRun(int argc, char* argv[]);
|
||||
void Run();
|
||||
private:
|
||||
const char* name_;
|
||||
void (*fn_)(int);
|
||||
void (*fn_range_)(int, int);
|
||||
std::vector<int> args_;
|
||||
void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
|
||||
void RunRepeatedlyWithArg(int iterations, int arg);
|
||||
void RunWithArg(int arg);
|
||||
};
|
||||
} // namespace testing
|
||||
void SetBenchmarkFlopsProcessed(int64_t);
|
||||
void StopBenchmarkTiming();
|
||||
void StartBenchmarkTiming();
|
||||
#define BENCHMARK(f) \
|
||||
static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
|
||||
(new ::testing::Benchmark(#f, f))
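To illustrate the harness declared above, a hypothetical micro-benchmark (the function name and sizes are made up; the real users are the tensor_benchmarks_* files below):

    #include <vector>
    static void BM_vectorAdd(int iters, int n) {
      StopBenchmarkTiming();
      std::vector<float> a(n, 1.0f), b(n, 2.0f);
      StartBenchmarkTiming();
      for (int i = 0; i < iters; ++i)
        for (int j = 0; j < n; ++j) a[j] += b[j];      // n adds per iteration
      SetBenchmarkFlopsProcessed(static_cast<int64_t>(iters) * n);
    }
    BENCHMARK(BM_vectorAdd)->Range(8, 1 << 20);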
|
237
bench/tensors/benchmark_main.cc
Normal file
@ -0,0 +1,237 @@
|
||||
/*
|
||||
* Copyright (C) 2012 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "benchmark.h"
|
||||
#include <regex.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <inttypes.h>
|
||||
#include <time.h>
|
||||
#include <map>
|
||||
|
||||
static int64_t g_flops_processed;
|
||||
static int64_t g_benchmark_total_time_ns;
|
||||
static int64_t g_benchmark_start_time_ns;
|
||||
typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
|
||||
typedef BenchmarkMap::iterator BenchmarkMapIt;
|
||||
|
||||
BenchmarkMap& gBenchmarks() {
|
||||
static BenchmarkMap g_benchmarks;
|
||||
return g_benchmarks;
|
||||
}
|
||||
|
||||
static int g_name_column_width = 20;
|
||||
|
||||
static int Round(int n) {
|
||||
int base = 1;
|
||||
while (base*10 < n) {
|
||||
base *= 10;
|
||||
}
|
||||
if (n < 2*base) {
|
||||
return 2*base;
|
||||
}
|
||||
if (n < 5*base) {
|
||||
return 5*base;
|
||||
}
|
||||
return 10*base;
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <mach/mach_time.h>
|
||||
static mach_timebase_info_data_t g_time_info;
|
||||
static void __attribute__((constructor)) init_info() {
|
||||
mach_timebase_info(&g_time_info);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int64_t NanoTime() {
|
||||
#if defined(__APPLE__)
|
||||
uint64_t t = mach_absolute_time();
|
||||
return t * g_time_info.numer / g_time_info.denom;
|
||||
#else
|
||||
struct timespec t;
|
||||
t.tv_sec = t.tv_nsec = 0;
|
||||
clock_gettime(CLOCK_MONOTONIC, &t);
|
||||
return static_cast<int64_t>(t.tv_sec) * 1000000000LL + t.tv_nsec;
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace testing {
|
||||
Benchmark* Benchmark::Arg(int arg) {
|
||||
args_.push_back(arg);
|
||||
return this;
|
||||
}
|
||||
|
||||
Benchmark* Benchmark::Range(int lo, int hi) {
|
||||
const int kRangeMultiplier = 8;
|
||||
if (hi < lo) {
|
||||
int temp = hi;
|
||||
hi = lo;
|
||||
lo = temp;
|
||||
}
|
||||
while (lo < hi) {
|
||||
args_.push_back(lo);
|
||||
lo *= kRangeMultiplier;
|
||||
}
|
||||
// We always run the hi number.
|
||||
args_.push_back(hi);
|
||||
return this;
|
||||
}
|
||||
|
||||
const char* Benchmark::Name() {
|
||||
return name_;
|
||||
}
|
||||
bool Benchmark::ShouldRun(int argc, char* argv[]) {
|
||||
if (argc == 1) {
|
||||
return true; // With no arguments, we run all benchmarks.
|
||||
}
|
||||
// Otherwise, we interpret each argument as a regular expression and
|
||||
// see if any of our benchmarks match.
|
||||
for (int i = 1; i < argc; i++) {
|
||||
regex_t re;
|
||||
if (regcomp(&re, argv[i], 0) != 0) {
|
||||
fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
int match = regexec(&re, name_, 0, NULL, 0);
|
||||
regfree(&re);
|
||||
if (match != REG_NOMATCH) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
|
||||
name_ = name;
|
||||
fn_ = fn;
|
||||
fn_range_ = fn_range;
|
||||
if (fn_ == NULL && fn_range_ == NULL) {
|
||||
fprintf(stderr, "%s: missing function\n", name_);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
gBenchmarks().insert(std::make_pair(name, this));
|
||||
}
|
||||
void Benchmark::Run() {
|
||||
if (fn_ != NULL) {
|
||||
RunWithArg(0);
|
||||
} else {
|
||||
if (args_.empty()) {
|
||||
fprintf(stderr, "%s: no args!\n", name_);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
for (size_t i = 0; i < args_.size(); ++i) {
|
||||
RunWithArg(args_[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
|
||||
g_flops_processed = 0;
|
||||
g_benchmark_total_time_ns = 0;
|
||||
g_benchmark_start_time_ns = NanoTime();
|
||||
if (fn_ != NULL) {
|
||||
fn_(iterations);
|
||||
} else {
|
||||
fn_range_(iterations, arg);
|
||||
}
|
||||
if (g_benchmark_start_time_ns != 0) {
|
||||
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
|
||||
}
|
||||
}
|
||||
void Benchmark::RunWithArg(int arg) {
|
||||
// run once in case it's expensive
|
||||
int iterations = 1;
|
||||
RunRepeatedlyWithArg(iterations, arg);
|
||||
while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
|
||||
int last = iterations;
|
||||
if (g_benchmark_total_time_ns/iterations == 0) {
|
||||
iterations = 1e9;
|
||||
} else {
|
||||
iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
|
||||
}
|
||||
iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
|
||||
iterations = Round(iterations);
|
||||
RunRepeatedlyWithArg(iterations, arg);
|
||||
}
|
||||
char throughput[100];
|
||||
throughput[0] = '\0';
|
||||
if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) {
|
||||
double mflops_processed = static_cast<double>(g_flops_processed)/1e6;
|
||||
double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
|
||||
snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds);
|
||||
}
|
||||
char full_name[100];
|
||||
if (fn_range_ != NULL) {
|
||||
if (arg >= (1<<20)) {
|
||||
snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
|
||||
} else if (arg >= (1<<10)) {
|
||||
snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
|
||||
} else {
|
||||
snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
|
||||
}
|
||||
} else {
|
||||
snprintf(full_name, sizeof(full_name), "%s", name_);
|
||||
}
|
||||
printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
|
||||
iterations, g_benchmark_total_time_ns/iterations, throughput);
|
||||
fflush(stdout);
|
||||
}
|
||||
} // namespace testing
|
||||
void SetBenchmarkFlopsProcessed(int64_t x) {
|
||||
g_flops_processed = x;
|
||||
}
|
||||
void StopBenchmarkTiming() {
|
||||
if (g_benchmark_start_time_ns != 0) {
|
||||
g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
|
||||
}
|
||||
g_benchmark_start_time_ns = 0;
|
||||
}
|
||||
void StartBenchmarkTiming() {
|
||||
if (g_benchmark_start_time_ns == 0) {
|
||||
g_benchmark_start_time_ns = NanoTime();
|
||||
}
|
||||
}
|
||||
int main(int argc, char* argv[]) {
|
||||
if (gBenchmarks().empty()) {
|
||||
fprintf(stderr, "No benchmarks registered!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||
int name_width = static_cast<int>(strlen(it->second->Name()));
|
||||
g_name_column_width = std::max(g_name_column_width, name_width);
|
||||
}
|
||||
bool need_header = true;
|
||||
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||
::testing::Benchmark* b = it->second;
|
||||
if (b->ShouldRun(argc, argv)) {
|
||||
if (need_header) {
|
||||
printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
|
||||
fflush(stdout);
|
||||
need_header = false;
|
||||
}
|
||||
b->Run();
|
||||
}
|
||||
}
|
||||
if (need_header) {
|
||||
fprintf(stderr, "No matching benchmarks!\n");
|
||||
fprintf(stderr, "Available benchmarks:\n");
|
||||
for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
|
||||
fprintf(stderr, " %s\n", it->second->Name());
|
||||
}
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
return 0;
|
||||
}
|
@ -4,13 +4,15 @@
|
||||
typedef int TensorIndex;
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "testing/base/public/benchmark.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "benchmark.h"
|
||||
|
||||
#define BENCHMARK_RANGE(bench, lo, hi) \
|
||||
BENCHMARK(bench)->Range(lo, hi)
|
||||
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
|
||||
|
||||
// TODO(bsteiner): also templatize on the input type since we have users
|
||||
// for int8 as well as floats.
|
||||
template <typename Device> class BenchmarkSuite {
|
||||
@ -38,12 +40,26 @@ template <typename Device> class BenchmarkSuite {
|
||||
device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
|
||||
}
|
||||
// Record the number of values copied per second
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void typeCasting(int num_iters) {
|
||||
eigen_assert(m_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
|
||||
TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.cast<int>();
|
||||
}
|
||||
// Record the number of values copied per second
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
void random(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
@ -51,21 +67,21 @@ template <typename Device> class BenchmarkSuite {
|
||||
C.device(device_) = C.random();
|
||||
}
|
||||
// Record the number of random numbers generated per second
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void slicing(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
|
||||
const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
|
||||
const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
|
||||
const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
|
||||
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
|
||||
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
|
||||
const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
|
||||
const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
|
||||
const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
|
||||
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
@ -80,31 +96,59 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of values copied from the rhs slice to the lhs slice
|
||||
// each second
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void rowChip(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.chip(iter % k_, 0);
|
||||
}
|
||||
// Record the number of values copied from the rhs chip to the lhs.
|
||||
finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
|
||||
}
|
||||
|
||||
void colChip(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.chip(iter % n_, 1);
|
||||
}
|
||||
// Record the number of values copied from the rhs chip to the lhs.
|
||||
finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
|
||||
}
|
||||
|
||||
void shuffling(int num_iters) {
|
||||
eigen_assert(m_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
|
||||
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
|
||||
const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
const Eigen::array<int, 2> shuffle(1, 0);
|
||||
const Eigen::array<int, 2> shuffle = {{1, 0}};
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.shuffle(shuffle);
|
||||
}
|
||||
// Record the number of values shuffled from A and copied to B each second
|
||||
finalizeBenchmark(m_ * k_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
void padding(int num_iters) {
|
||||
eigen_assert(m_ == k_);
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
|
||||
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
|
||||
const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
|
||||
@ -116,35 +160,34 @@ template <typename Device> class BenchmarkSuite {
|
||||
B.device(device_) = A.pad(paddings);
|
||||
}
|
||||
// Record the number of values copied from the padded tensor A each second
|
||||
finalizeBenchmark(m_ * k_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
void striding(int num_iters) {
|
||||
eigen_assert(m_ == k_);
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
|
||||
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
|
||||
const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
const Eigen::array<TensorIndex, 2> strides(1, 2);
|
||||
const Eigen::array<TensorIndex, 2> strides = {{1, 2}};
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.stride(strides);
|
||||
}
|
||||
// Record the number of values copied from the strided tensor A each second
|
||||
finalizeBenchmark(m_ * k_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
|
||||
}
|
||||
|
||||
void broadcasting(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, 1);
|
||||
const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_c(m_, n_);
|
||||
const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
// nvcc doesn't support cxx11
|
||||
const Eigen::array<int, 2> broadcast(1, n_);
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
const Eigen::array<int, 2> broadcast = {{1, n_}};
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
@ -157,12 +200,12 @@ template <typename Device> class BenchmarkSuite {
|
||||
C.device(device_) = A.broadcast(broadcast);
|
||||
}
|
||||
// Record the number of values broadcasted from A and copied to C each second
|
||||
finalizeBenchmark(m_ * n_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
|
||||
}
|
||||
|
||||
void coeffWiseOp(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
@ -173,12 +216,12 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of FLOP executed per second (2 multiplications and
|
||||
// 1 addition per value)
|
||||
finalizeBenchmark(3 * m_ * m_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void algebraicFunc(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
@ -189,12 +232,12 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
void transcendentalFunc(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
@ -205,17 +248,23 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
|
||||
}
|
||||
|
||||
// Simple reduction
|
||||
void reduction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size(k_, n_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size(n_);
|
||||
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
|
||||
// Row reduction
|
||||
void rowReduction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
const Eigen::array<TensorIndex, 1> sum_along_dim(0);
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
|
||||
#endif
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
@ -223,21 +272,47 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
|
||||
}
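The EIGEN_HAS_INDEX_LIST branch above replaces a runtime Eigen::array of reduction dimensions with an Eigen::IndexList whose entry is encoded in the type, so the evaluator sees the reduced dimension as a compile-time constant. A small self-contained sketch of the same idea, with sizes picked only for illustration:

#include "unsupported/Eigen/CXX11/Tensor"

void row_reduction_sketch() {
  Eigen::Tensor<float, 2> b(4, 3);   // 4 rows, 3 columns
  b.setConstant(1.0f);
  Eigen::Tensor<float, 1> c(3);

#ifndef EIGEN_HAS_INDEX_LIST
  // Fallback: the reduced dimension is only known at run time.
  const Eigen::array<Eigen::Tensor<float, 2>::Index, 1> sum_along_dim = {{0}};
#else
  // The reduced dimension is part of the type: no storage, visible to the optimizer.
  Eigen::IndexList<Eigen::type2index<0> > sum_along_dim;
#endif

  c = b.sum(sum_along_dim);   // every entry of c ends up equal to 4
}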
|
||||
|
||||
// Column reduction
|
||||
void colReduction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
|
||||
b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{k_}};
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
|
||||
c_, output_size);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
|
||||
#endif
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.sum(sum_along_dim);
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
|
||||
}
|
||||
|
||||
// do a contraction which is equivalent to a matrix multiplication
|
||||
void contraction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
|
||||
const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
|
||||
const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
|
||||
const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}};
|
||||
const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}};
|
||||
const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}};
|
||||
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<float, 2>::DimensionPair DimPair;
|
||||
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
@ -245,18 +320,18 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of FLOP executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
|
||||
}
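Contracting dimension 1 of A against dimension 0 of B is, for rank-2 tensors, exactly the matrix product A*B: each of the m_*n_ output coefficients needs k_ multiplications and k_ additions, which is where the 2*m_*n_*k_ FLOP figure comes from. A standalone version of the same operation with illustrative sizes:

#include "unsupported/Eigen/CXX11/Tensor"

void contraction_sketch() {
  Eigen::Tensor<float, 2> a(8, 4);   // m = 8, k = 4
  Eigen::Tensor<float, 2> b(4, 6);   // k = 4, n = 6
  a.setRandom();
  b.setRandom();

  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};   // pair a's dim 1 with b's dim 0

  Eigen::Tensor<float, 2> c = a.contract(b, dims);   // 8 x 6, same values as the matrix product
  // FLOP per evaluation: 2 * 8 * 6 * 4 = 384.
}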
|
||||
|
||||
void convolution(int num_iters, int kernel_x, int kernel_y) {
|
||||
const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
|
||||
const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
|
||||
const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
|
||||
const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
|
||||
const Eigen::array<TensorIndex, 2> result_sizes(
|
||||
m_ - kernel_x + 1, n_ - kernel_y + 1);
|
||||
const Eigen::array<TensorIndex, 2> result_sizes =
|
||||
{{m_ - kernel_x + 1, n_ - kernel_y + 1}};
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
|
||||
Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
|
||||
Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}};
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
@ -264,8 +339,8 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of FLOP executed per second (kernel_size
|
||||
// multiplications and additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(
|
||||
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
|
||||
finalizeBenchmark(static_cast<int64_t>(2) *
|
||||
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
|
||||
}
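For this valid-mode 2-D convolution the output has (m_ - kernel_x + 1) * (n_ - kernel_y + 1) coefficients and each one costs kernel_x * kernel_y multiply-add pairs, which is what the 64-bit FLOP expression above counts. Worked through for one representative configuration (numbers chosen only as an example):

// Example: m_ = n_ = 1000, kernel_x = kernel_y = 3
//   outputs per iteration : (1000 - 3 + 1) * (1000 - 3 + 1) = 998 * 998 = 996004
//   FLOP per iteration    : 2 * 996004 * 3 * 3 = 17928072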
|
||||
|
||||
private:
|
||||
@ -280,23 +355,23 @@ template <typename Device> class BenchmarkSuite {
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(float));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(float));
|
||||
|
||||
BenchmarkUseRealTime();
|
||||
//BenchmarkUseRealTime();
|
||||
}
|
||||
|
||||
inline void finalizeBenchmark(int64 num_items) {
|
||||
inline void finalizeBenchmark(int64_t num_items) {
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
|
||||
device_.synchronize();
|
||||
}
|
||||
#endif
|
||||
StopBenchmarkTiming();
|
||||
SetBenchmarkItemsProcessed(num_items);
|
||||
SetBenchmarkFlopsProcessed(num_items);
|
||||
}
|
||||
|
||||
|
||||
size_t m_;
|
||||
size_t k_;
|
||||
size_t n_;
|
||||
TensorIndex m_;
|
||||
TensorIndex k_;
|
||||
TensorIndex n_;
|
||||
float* a_;
|
||||
float* b_;
|
||||
float* c_;

@@ -1,19 +1,12 @@
#define EIGEN_USE_THREADS

#include "base/sysinfo.h"
#include "strings/strcat.h"
#include "third_party/eigen3/tensor_benchmarks.h"
#include "thread/threadpool.h"
#include <string>

#include "tensor_benchmarks.h"

#ifdef __ANDROID__
#define CREATE_THREAD_POOL(threads) \
Eigen::ThreadPoolDevice device(threads);
#else
#define CREATE_THREAD_POOL(threads) \
ThreadPool tp(threads); \
tp.StartWorkers(); \
Eigen::ThreadPoolDevice device(&tp, threads);
#endif
Eigen::ThreadPool pool(threads); \
Eigen::ThreadPoolDevice device(&pool, threads);

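The new CREATE_THREAD_POOL body drops the Google-internal ThreadPool in favour of Eigen's own pool, so the benchmark builds against a plain Eigen checkout. A hedged sketch of how the resulting device is used; recent trees ship Eigen::ThreadPool behind EIGEN_USE_THREADS, older ones expose an equivalent type through the Tensor header:

#define EIGEN_USE_THREADS
#include "unsupported/Eigen/CXX11/Tensor"

void threadpool_device_sketch() {
  const int threads = 4;
  Eigen::ThreadPool pool(threads);                  // worker threads owned by Eigen
  Eigen::ThreadPoolDevice device(&pool, threads);   // handle handed to tensor expressions

  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();
  c.device(device) = a + b;   // the sum is evaluated on the pool, not the calling thread
}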
// Simple functions
|
||||
#define BM_FuncCPU(FUNC, THREADS) \
|
||||
@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads);
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
||||
suite.FUNC(iters); \
|
||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
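Each BM_FuncCPU(FUNC, THREADS) line below stamps out a separate benchmark symbol via token pasting; BM_FuncCPU(memcpy, 4), for example, expands to roughly the following (the function header is reconstructed here by analogy with the GPU macro further down, so treat it as a sketch rather than the literal expansion):

static void BM_memcpy_4T(int iters, int N) {
  StopBenchmarkTiming();
  CREATE_THREAD_POOL(4);
  BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);
  suite.memcpy(iters);
  SetBenchmarkLabel(StrCat("using ", 4, " threads"));
}
BENCHMARK_RANGE(BM_memcpy_4T, 10, 5000);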
|
||||
|
||||
@ -30,6 +22,10 @@ BM_FuncCPU(memcpy, 4);
|
||||
BM_FuncCPU(memcpy, 8);
|
||||
BM_FuncCPU(memcpy, 12);
|
||||
|
||||
BM_FuncCPU(typeCasting, 4);
|
||||
BM_FuncCPU(typeCasting, 8);
|
||||
BM_FuncCPU(typeCasting, 12);
|
||||
|
||||
BM_FuncCPU(random, 4);
|
||||
BM_FuncCPU(random, 8);
|
||||
BM_FuncCPU(random, 12);
|
||||
@ -38,6 +34,14 @@ BM_FuncCPU(slicing, 4);
|
||||
BM_FuncCPU(slicing, 8);
|
||||
BM_FuncCPU(slicing, 12);
|
||||
|
||||
BM_FuncCPU(rowChip, 4);
|
||||
BM_FuncCPU(rowChip, 8);
|
||||
BM_FuncCPU(rowChip, 12);
|
||||
|
||||
BM_FuncCPU(colChip, 4);
|
||||
BM_FuncCPU(colChip, 8);
|
||||
BM_FuncCPU(colChip, 12);
|
||||
|
||||
BM_FuncCPU(shuffling, 4);
|
||||
BM_FuncCPU(shuffling, 8);
|
||||
BM_FuncCPU(shuffling, 12);
|
||||
@ -66,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4);
|
||||
BM_FuncCPU(transcendentalFunc, 8);
|
||||
BM_FuncCPU(transcendentalFunc, 12);
|
||||
|
||||
BM_FuncCPU(reduction, 4);
|
||||
BM_FuncCPU(reduction, 8);
|
||||
BM_FuncCPU(reduction, 12);
|
||||
BM_FuncCPU(rowReduction, 4);
|
||||
BM_FuncCPU(rowReduction, 8);
|
||||
BM_FuncCPU(rowReduction, 12);
|
||||
|
||||
BM_FuncCPU(colReduction, 4);
|
||||
BM_FuncCPU(colReduction, 8);
|
||||
BM_FuncCPU(colReduction, 12);
|
||||
|
||||
|
||||
// Contractions
|
||||
@ -84,7 +92,6 @@ BM_FuncCPU(reduction, 12);
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
|
||||
|
||||
@ -107,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
|
||||
@ -127,7 +140,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
|
||||
|
||||
|
@ -3,47 +3,47 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
#include "strings/strcat.h"
|
||||
#include "third_party/eigen3/tensor_benchmarks.h"
|
||||
|
||||
|
||||
#include "tensor_benchmarks.h"
|
||||
|
||||
// Simple functions
|
||||
#define BM_FuncGPU(FUNC) \
|
||||
static void BM_##FUNC(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
cudaStream_t stream; \
|
||||
cudaStreamCreate(&stream); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
cudaStreamDestroy(stream); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
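The rewritten GPU macro swaps the raw cudaStreamCreate/cudaStreamDestroy pair for Eigen::CudaStreamDevice, and keeps the explicit cudaDeviceSynchronize() so pending device work is drained before the timed loop starts. A hedged sketch of the same setup outside the macro, assuming a CUDA build compiled with nvcc and EIGEN_USE_GPU defined:

#define EIGEN_USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>
#include "unsupported/Eigen/CXX11/Tensor"

void gpu_device_sketch(int n) {
  Eigen::CudaStreamDevice stream;   // default-constructed: wraps a stream on the current device
  Eigen::GpuDevice device(&stream);

  float* a = static_cast<float*>(device.allocate(n * sizeof(float)));
  float* b = static_cast<float*>(device.allocate(n * sizeof(float)));
  device.memset(a, 0, n * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1> > A(a, n), B(b, n);
  B.device(device) = A * A;   // kernel launched on the device's stream
  device.synchronize();       // wait for completion before timing or reading results

  device.deallocate(a);
  device.deallocate(b);
}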
|
||||
|
||||
BM_FuncGPU(memcpy);
|
||||
BM_FuncGPU(typeCasting);
|
||||
BM_FuncGPU(random);
|
||||
BM_FuncGPU(slicing);
|
||||
BM_FuncGPU(rowChip);
|
||||
BM_FuncGPU(colChip);
|
||||
BM_FuncGPU(shuffling);
|
||||
BM_FuncGPU(padding);
|
||||
BM_FuncGPU(striding);
|
||||
BM_FuncGPU(broadcasting);
|
||||
BM_FuncGPU(coeffWiseOp);
|
||||
BM_FuncGPU(reduction);
|
||||
BM_FuncGPU(algebraicFunc);
|
||||
BM_FuncGPU(transcendentalFunc);
|
||||
BM_FuncGPU(rowReduction);
|
||||
BM_FuncGPU(colReduction);
|
||||
|
||||
|
||||
// Contractions
|
||||
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
|
||||
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
cudaStream_t stream; \
|
||||
cudaStreamCreate(&stream); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
cudaStreamDestroy(stream); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
||||
|
||||
@ -51,19 +51,18 @@ BM_FuncGPU(reduction);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
|
||||
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
||||
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
cudaStream_t stream; \
|
||||
cudaStreamCreate(&stream); \
|
||||
Eigen::CudaStreamDevice stream; \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
cudaStreamDestroy(stream); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
|
||||
|
@ -19,19 +19,12 @@
|
||||
int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
|
||||
{
|
||||
typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run);
|
||||
func[LO] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
|
||||
// array index: LO
|
||||
(internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
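The same refactoring is applied to every wrapper in this file: the mutable func[] array that used to be filled lazily behind an init flag (a data race if two threads hit the wrapper first) becomes a static const table, fully populated at static initialization time and indexed by the UP/LO selector. A generic, self-contained sketch of the pattern with made-up handler names:

#include <cstdio>

enum { UP = 0, LO = 1 };   // same role as the UP/LO indices used by the wrappers

static void upper_kernel(int n) { std::printf("upper, n=%d\n", n); }
static void lower_kernel(int n) { std::printf("lower, n=%d\n", n); }

typedef void (*functype)(int);

// Filled once at static initialization; immutable, so concurrent reads are safe.
static const functype func[2] = {
  // array index: UP
  upper_kernel,
  // array index: LO
  lower_kernel,
};

void dispatch(char uplo, int n) {
  const int code = (uplo == 'U' || uplo == 'u') ? UP : LO;
  func[code](n);
}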
|
||||
@ -111,19 +104,12 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
|
||||
int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pap)
|
||||
{
|
||||
typedef void (*functype)(int, Scalar*, const Scalar*, RealScalar);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
|
||||
func[LO] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
|
||||
// array index: LO
|
||||
(internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
|
||||
};
|
||||
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
Scalar* ap = reinterpret_cast<Scalar*>(pap);
|
||||
@ -162,19 +148,12 @@ int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
|
||||
int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
|
||||
{
|
||||
typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (internal::packed_rank2_update_selector<Scalar,int,Upper>::run);
|
||||
func[LO] = (internal::packed_rank2_update_selector<Scalar,int,Lower>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
|
||||
// array index: LO
|
||||
(internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
|
||||
};
|
||||
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
Scalar* y = reinterpret_cast<Scalar*>(py);
|
||||
@ -217,19 +196,12 @@ int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
|
||||
int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pa, int *lda)
|
||||
{
|
||||
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
|
||||
func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
|
||||
// array index: LO
|
||||
(selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
|
||||
};
|
||||
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
@ -271,19 +243,12 @@ int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
|
||||
int EIGEN_BLAS_FUNC(her2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda)
|
||||
{
|
||||
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
|
||||
func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(internal::rank2_update_selector<Scalar,int,Upper>::run),
|
||||
// array index: LO
|
||||
(internal::rank2_update_selector<Scalar,int,Lower>::run),
|
||||
};
|
||||
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
Scalar* y = reinterpret_cast<Scalar*>(py);
|
||||
|
@ -26,20 +26,15 @@ struct general_matrix_vector_product_wrapper
|
||||
int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc)
|
||||
{
|
||||
typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
|
||||
static functype func[4];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<4; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[NOTR] = (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run);
|
||||
func[TR ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run);
|
||||
func[ADJ ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[4] = {
|
||||
// array index: NOTR
|
||||
(general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run),
|
||||
// array index: TR
|
||||
(general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run),
|
||||
// array index: ADJ
|
||||
(general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* b = reinterpret_cast<Scalar*>(pb);
|
||||
@ -90,32 +85,36 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
|
||||
int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
|
||||
{
|
||||
typedef void (*functype)(int, const Scalar *, int, Scalar *);
|
||||
static functype func[16];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<16; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run);
|
||||
|
||||
func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[16] = {
|
||||
// array index: NOTR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (NUNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (NUNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (UP << 2) | (UNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (UNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (UNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (UNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (UNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (UNIT << 3)
|
||||
(internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* b = reinterpret_cast<Scalar*>(pb);
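The 16-entry tables are indexed by a small bit-packed code: two bits for the op (NOTR/TR/ADJ), one bit shifted by 2 for the stored triangle (UP/LO) and one bit shifted by 3 for the diagonal kind (NUNIT/UNIT), which is why entries 3, 7, 11 and 15 stay null. A sketch of how such a code could be computed from the BLAS character arguments; the enum values below are assumptions for illustration, mirroring the comments in the table above:

enum { NOTR = 0, TR = 1, ADJ = 2 };   // op(A): none, transpose, conjugate-transpose
enum { UP = 0, LO = 1 };              // which triangle of A is stored
enum { NUNIT = 0, UNIT = 1 };         // non-unit vs unit diagonal

// Build the table index used by trsv/trmv-style dispatch.
inline int triangular_code(char op, char uplo, char diag) {
  const int o = (op == 'N' || op == 'n') ? NOTR
              : (op == 'T' || op == 't') ? TR : ADJ;
  const int u = (uplo == 'U' || uplo == 'u') ? UP : LO;
  const int d = (diag == 'N' || diag == 'n') ? NUNIT : UNIT;
  return o | (u << 2) | (d << 3);     // e.g. 'T','L','U' -> 1 | 4 | 8 = 13
}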
|
||||
@ -145,32 +144,36 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
|
||||
int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
|
||||
{
|
||||
typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&);
|
||||
static functype func[16];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<16; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[16] = {
|
||||
// array index: NOTR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (NUNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (NUNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (UP << 2) | (UNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (UNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (UNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (UNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (UNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (UNIT << 3)
|
||||
(internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* b = reinterpret_cast<Scalar*>(pb);
|
||||
@ -346,32 +349,36 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
|
||||
int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
|
||||
{
|
||||
typedef void (*functype)(int, int, const Scalar *, int, Scalar *);
|
||||
static functype func[16];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int i=0; i<16; ++i)
|
||||
func[i] = 0;
|
||||
|
||||
func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0, Scalar,Conj, Scalar,RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0, Scalar,Conj, Scalar,RowMajor>::run);
|
||||
|
||||
func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[16] = {
|
||||
// array index: NOTR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (NUNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Lower|0, Scalar,Conj, Scalar,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (NUNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Upper|0, Scalar,Conj, Scalar,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (UP << 2) | (UNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (UNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (UNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (UNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (UNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (UNIT << 3)
|
||||
(internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
|
||||
0,
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
@ -416,32 +423,36 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real
|
||||
int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
|
||||
{
|
||||
typedef void (*functype)(int, const Scalar*, const Scalar*, Scalar*, Scalar);
|
||||
static functype func[16];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<16; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[16] = {
|
||||
// array index: NOTR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (UP << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* ap = reinterpret_cast<Scalar*>(pap);
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
@ -487,32 +498,36 @@ int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
|
||||
int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
|
||||
{
|
||||
typedef void (*functype)(int, const Scalar*, Scalar*);
|
||||
static functype func[16];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<16; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run);
|
||||
|
||||
func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run);
|
||||
func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run);
|
||||
func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run);
|
||||
|
||||
func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run);
|
||||
func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run);
|
||||
func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[16] = {
|
||||
// array index: NOTR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (NUNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (UP << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
|
||||
// array index: TR | (UP << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
|
||||
// array index: ADJ | (UP << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
|
||||
// array index: TR | (LO << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
|
||||
// array index: ADJ | (LO << 2) | (UNIT << 3)
|
||||
(internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* ap = reinterpret_cast<Scalar*>(pap);
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
|
@ -13,19 +13,12 @@
|
||||
int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
|
||||
{
|
||||
typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run);
|
||||
func[LO] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
|
||||
// array index: LO
|
||||
(internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
@ -71,34 +64,13 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
|
||||
int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pc, int *ldc)
|
||||
{
|
||||
|
||||
// typedef void (*functype)(int, const Scalar *, int, Scalar *, int, Scalar);
|
||||
// static functype func[2];
|
||||
|
||||
// static bool init = false;
|
||||
// if(!init)
|
||||
// {
|
||||
// for(int k=0; k<2; ++k)
|
||||
// func[k] = 0;
|
||||
//
|
||||
// func[UP] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,UpperTriangular>::run);
|
||||
// func[LO] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,LowerTriangular>::run);
|
||||
|
||||
// init = true;
|
||||
// }
|
||||
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
|
||||
func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
|
||||
// array index: LO
|
||||
(selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
|
||||
};
|
||||
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
Scalar* c = reinterpret_cast<Scalar*>(pc);
|
||||
@ -131,34 +103,13 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
|
||||
// C := alpha*x*y' + alpha*y*x' + C
|
||||
int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, int *ldc)
|
||||
{
|
||||
// typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar);
|
||||
// static functype func[2];
|
||||
//
|
||||
// static bool init = false;
|
||||
// if(!init)
|
||||
// {
|
||||
// for(int k=0; k<2; ++k)
|
||||
// func[k] = 0;
|
||||
//
|
||||
// func[UP] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,UpperTriangular>::run);
|
||||
// func[LO] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,LowerTriangular>::run);
|
||||
//
|
||||
// init = true;
|
||||
// }
|
||||
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
|
||||
static functype func[2];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<2; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
|
||||
func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[2] = {
|
||||
// array index: UP
|
||||
(internal::rank2_update_selector<Scalar,int,Upper>::run),
|
||||
// array index: LO
|
||||
(internal::rank2_update_selector<Scalar,int,Lower>::run),
|
||||
};
|
||||
|
||||
Scalar* x = reinterpret_cast<Scalar*>(px);
|
||||
Scalar* y = reinterpret_cast<Scalar*>(py);
|
||||
@ -234,19 +185,12 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *pap)
{
typedef void (*functype)(int, Scalar*, const Scalar*, Scalar);
static functype func[2];

static bool init = false;
if(!init)
{
for(int k=0; k<2; ++k)
func[k] = 0;

func[UP] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,false>::run);
func[LO] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,false>::run);

init = true;
}
static const functype func[2] = {
// array index: UP
(internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,false>::run),
// array index: LO
(internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,false>::run),
};

Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* ap = reinterpret_cast<Scalar*>(pap);
@ -285,19 +229,12 @@ int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *in
int EIGEN_BLAS_FUNC(spr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
{
typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
static functype func[2];

static bool init = false;
if(!init)
{
for(int k=0; k<2; ++k)
func[k] = 0;

func[UP] = (internal::packed_rank2_update_selector<Scalar,int,Upper>::run);
func[LO] = (internal::packed_rank2_update_selector<Scalar,int,Lower>::run);

init = true;
}
static const functype func[2] = {
// array index: UP
(internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
// array index: LO
(internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
};

Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
@ -13,24 +13,29 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
{
// std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n";
typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking<Scalar,Scalar>&, Eigen::internal::GemmParallelInfo<DenseIndex>*);
static functype func[12];

static bool init = false;
if(!init)
{
for(int i=0; i<12; ++i)
func[i] = 0;
func[NOTR | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor>::run);
func[TR | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor>::run);
func[ADJ | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor>::run);
func[NOTR | (TR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,false,ColMajor>::run);
func[TR | (TR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,false,ColMajor>::run);
func[ADJ | (TR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,false,ColMajor>::run);
func[NOTR | (ADJ << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor>::run);
func[TR | (ADJ << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,Conj, ColMajor>::run);
func[ADJ | (ADJ << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,Conj, ColMajor>::run);
init = true;
}
static const functype func[12] = {
// array index: NOTR | (NOTR << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor>::run),
// array index: TR | (NOTR << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor>::run),
// array index: ADJ | (NOTR << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor>::run),
0,
// array index: NOTR | (TR << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,false,ColMajor>::run),
// array index: TR | (TR << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,false,ColMajor>::run),
// array index: ADJ | (TR << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,false,ColMajor>::run),
0,
// array index: NOTR | (ADJ << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor>::run),
// array index: TR | (ADJ << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,Conj, ColMajor>::run),
// array index: ADJ | (ADJ << 2)
(internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,Conj, ColMajor>::run),
0
};
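// note: as in the assignments this table replaces, entries are indexed by the two
// transposition codes combined as "code(opa) | (code(opb) << 2)" (NOTR/TR/ADJ);
// index values that do not correspond to a valid pair of ops are left as null pointers.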

Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
@ -73,49 +78,64 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
|
||||
{
|
||||
// std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n";
|
||||
typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, internal::level3_blocking<Scalar,Scalar>&);
|
||||
static functype func[32];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int i=0; i<32; ++i)
|
||||
func[i] = 0;
|
||||
|
||||
func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,ColMajor,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,ColMajor,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,ColMajor,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,ColMajor,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
|
||||
func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[32] = {
|
||||
// array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, Conj, RowMajor,ColMajor>::run),\
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, Conj, RowMajor,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, Conj, RowMajor,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, Conj, RowMajor,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* b = reinterpret_cast<Scalar*>(pb);
|
||||
@ -162,47 +182,64 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
|
||||
{
|
||||
// std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n";
|
||||
typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
|
||||
static functype func[32];
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int k=0; k<32; ++k)
|
||||
func[k] = 0;
|
||||
|
||||
func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
|
||||
|
||||
func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
|
||||
|
||||
func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
|
||||
|
||||
func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
|
||||
|
||||
func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
|
||||
func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
|
||||
func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
static const functype func[32] = {
|
||||
// array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
|
||||
0,
|
||||
// array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run),
|
||||
// array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run),
|
||||
// array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
|
||||
(internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* b = reinterpret_cast<Scalar*>(pb);
|
||||
@ -275,9 +312,9 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
|
||||
return 1;
|
||||
}
|
||||
|
||||
int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
|
||||
#if ISCOMPLEX
|
||||
// FIXME add support for symmetric complex matrix
|
||||
int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
|
||||
Matrix<Scalar,Dynamic,Dynamic,ColMajor> matA(size,size);
|
||||
if(UPLO(*uplo)==UP)
|
||||
{
|
||||
@ -294,13 +331,15 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
|
||||
else if(SIDE(*side)==RIGHT)
|
||||
matrix(c, *m, *n, *ldc) += alpha * matrix(b, *m, *n, *ldb) * matA;
|
||||
#else
|
||||
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
|
||||
|
||||
if(SIDE(*side)==LEFT)
|
||||
if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
|
||||
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
|
||||
if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
|
||||
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
|
||||
else return 0;
|
||||
else if(SIDE(*side)==RIGHT)
|
||||
if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
|
||||
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
|
||||
if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
|
||||
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
|
||||
else return 0;
|
||||
else
|
||||
return 0;
|
||||
@ -315,25 +354,23 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
|
||||
{
|
||||
// std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
|
||||
#if !ISCOMPLEX
|
||||
typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&);
|
||||
static functype func[8];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int i=0; i<8; ++i)
|
||||
func[i] = 0;
|
||||
|
||||
func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Upper>::run);
|
||||
func[TR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Upper>::run);
|
||||
func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Upper>::run);
|
||||
|
||||
func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Lower>::run);
|
||||
func[TR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Lower>::run);
|
||||
func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Lower>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
|
||||
static const functype func[8] = {
|
||||
// array index: NOTR | (UP << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Upper>::run),
|
||||
// array index: TR | (UP << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Upper>::run),
|
||||
// array index: ADJ | (UP << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Upper>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Lower>::run),
|
||||
// array index: TR | (LO << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Lower>::run),
|
||||
// array index: ADJ | (LO << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Lower>::run),
|
||||
0
|
||||
};
|
||||
#endif
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
@ -381,8 +418,10 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
|
||||
matrix(c, *n, *n, *ldc).triangularView<Lower>() += alpha * matrix(a,*k,*n,*lda).transpose() * matrix(a,*k,*n,*lda);
|
||||
}
|
||||
#else
|
||||
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
|
||||
|
||||
int code = OP(*op) | (UPLO(*uplo) << 2);
|
||||
func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha);
|
||||
func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
@ -486,20 +525,23 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
|
||||
return 1;
|
||||
}
|
||||
|
||||
int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
|
||||
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
|
||||
|
||||
if(SIDE(*side)==LEFT)
|
||||
{
|
||||
if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar,DenseIndex,RowMajor,true,Conj, ColMajor,false,false, ColMajor>
|
||||
::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
|
||||
::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
|
||||
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,true,false, ColMajor,false,false, ColMajor>
|
||||
::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
|
||||
::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
|
||||
else return 0;
|
||||
}
|
||||
else if(SIDE(*side)==RIGHT)
|
||||
{
|
||||
if(UPLO(*uplo)==UP) matrix(c,*m,*n,*ldc) += alpha * matrix(b,*m,*n,*ldb) * matrix(a,*n,*n,*lda).selfadjointView<Upper>();/*internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, RowMajor,true,Conj, ColMajor>
|
||||
::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);*/
|
||||
::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);*/
|
||||
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, ColMajor,true,false, ColMajor>
|
||||
::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
|
||||
::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
|
||||
else return 0;
|
||||
}
|
||||
else
|
||||
@ -516,23 +558,21 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
|
||||
{
|
||||
// std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
|
||||
|
||||
typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&);
|
||||
static functype func[8];
|
||||
|
||||
static bool init = false;
|
||||
if(!init)
|
||||
{
|
||||
for(int i=0; i<8; ++i)
|
||||
func[i] = 0;
|
||||
|
||||
func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Upper>::run);
|
||||
func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Upper>::run);
|
||||
|
||||
func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Lower>::run);
|
||||
func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Lower>::run);
|
||||
|
||||
init = true;
|
||||
}
|
||||
typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
|
||||
static const functype func[8] = {
|
||||
// array index: NOTR | (UP << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Upper>::run),
|
||||
0,
|
||||
// array index: ADJ | (UP << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Upper>::run),
|
||||
0,
|
||||
// array index: NOTR | (LO << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Lower>::run),
|
||||
0,
|
||||
// array index: ADJ | (LO << 2)
|
||||
(internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Lower>::run),
|
||||
0
|
||||
};
|
||||
|
||||
Scalar* a = reinterpret_cast<Scalar*>(pa);
|
||||
Scalar* c = reinterpret_cast<Scalar*>(pc);
|
||||
@ -571,7 +611,8 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
|
||||
|
||||
if(*k>0 && alpha!=RealScalar(0))
|
||||
{
|
||||
func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha);
|
||||
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
|
||||
func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking);
|
||||
matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
|
||||
}
|
||||
return 0;
|
||||
|
@ -44,7 +44,7 @@ C.setRandom(rows,cols) // C = rand(rows,cols)*2-1
VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)'
v.setLinSpaced(size,low,high) // v = linspace(low,high,size)'
VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi
low,low+step*(size-1))
low,low+step*(size-1)) //

// Matrix slicing and blocks. All expressions listed here are read/write.
@ -94,6 +94,8 @@ R.transpose() // R.' or conj(R') // Read-write
R.diagonal() // diag(R) // Read-write
x.asDiagonal() // diag(x)
R.transpose().colwise().reverse() // rot90(R) // Read-write
R.rowwise().reverse() // fliplr(R)
R.colwise().reverse() // flipud(R)
R.replicate(i,j) // repmat(P,i,j)

@ -139,6 +141,7 @@ R.cwiseAbs2() // abs(P.^2)
R.array().abs2() // abs(P.^2)
(R.array() < s).select(P,Q ); // (R < s ? P : Q)
R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0)
R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P) // with: scalar func(const scalar &x);

// Reductions.
@ -65,17 +65,17 @@ They are summarized in the following tables:
<td>Requires the <a href="http://pastix.gforge.inria.fr">PaStiX</a> package, \b CeCILL-C </td>
<td>optimized for tough problems and symmetric patterns</td></tr>
<tr><td>CholmodSupernodalLLT</td><td>\link CholmodSupport_Module CholmodSupport \endlink</td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing, Leverage fast dense algebra</td>
<td>Requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td>
<td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
<td></td></tr>
<tr><td>UmfPackLU</td><td>\link UmfPackSupport_Module UmfPackSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
<td>Requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td>
<td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
<td></td></tr>
<tr><td>SuperLU</td><td>\link SuperLUSupport_Module SuperLUSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
<td>Requires the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library, (BSD-like)</td>
<td></td></tr>
<tr><td>SPQR</td><td>\link SPQRSupport_Module SPQRSupport \endlink </td> <td> QR factorization </td>
<td> Any, rectangular</td><td>fill-in reducing, multithreaded, fast dense algebra</td>
<td> requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
<td> requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</td></tr>
</table>

Here \c SPD means symmetric positive definite.
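
These decompositions share the same compute()/solve() usage pattern. A minimal sketch with UmfPackLU (assuming UmfPack is installed and that \c A and \c b have already been filled):
\code
#include <Eigen/UmfPackSupport>
Eigen::SparseMatrix<double> A;   // coefficient matrix, assumed filled
Eigen::VectorXd b, x;            // right-hand side and solution
Eigen::UmfPackLU<Eigen::SparseMatrix<double> > solver;
solver.compute(A);               // analyze the pattern and factorize
if(solver.info()!=Eigen::Success) { /* the factorization failed */ }
x = solver.solve(b);             // solve A x = b
\endcode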
@ -153,10 +153,11 @@ not necessary to evaluate the right-hand side explicitly.

\section TopicAliasingMatrixMult Aliasing and matrix multiplication

Matrix multiplication is the only operation in %Eigen that assumes aliasing by default. Thus, if \c matA is a
matrix, then the statement <tt>matA = matA * matA;</tt> is safe. All other operations in %Eigen assume that
there are no aliasing problems, either because the result is assigned to a different matrix or because it is a
component-wise operation.
Matrix multiplication is the only operation in %Eigen that assumes aliasing by default, <strong>under the
condition that the destination matrix is not resized</strong>.
Thus, if \c matA is a \b square matrix, then the statement <tt>matA = matA * matA;</tt> is safe.
All other operations in %Eigen assume that there are no aliasing problems,
either because the result is assigned to a different matrix or because it is a component-wise operation.

<table class="example">
<tr><th>Example</th><th>Output</th></tr>
@ -198,6 +199,27 @@ may get wrong results:
\verbinclude TopicAliasing_mult3.out
</td></tr></table>

Moreover, starting in Eigen 3.3, aliasing is \b not assumed if the destination matrix is resized and the product is not directly assigned to the destination.
Therefore, the following example is also wrong:

<table class="example">
<tr><th>Example</th><th>Output</th></tr>
<tr><td>
\include TopicAliasing_mult4.cpp
</td>
<td>
\verbinclude TopicAliasing_mult4.out
</td></tr></table>

As for any aliasing issue, you can resolve it by explicitly evaluating the expression prior to assignment:
<table class="example">
<tr><th>Example</th><th>Output</th></tr>
<tr><td>
\include TopicAliasing_mult5.cpp
</td>
<td>
\verbinclude TopicAliasing_mult5.out
</td></tr></table>
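
A named temporary achieves the same effect as the call to \c eval() above; as a sketch reusing the matrices of the previous example:
\code
MatrixXf tmp = B * A;   // evaluate the product into a temporary first
A = tmp.cwiseAbs();     // then assign: A can now be resized without any aliasing issue
\endcode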

\section TopicAliasingSummary Summary

@ -101,17 +101,16 @@ row and column position are to be stored. These variables should be of type
\verbinclude Tutorial_ReductionsVisitorsBroadcasting_visitors.out
</td></tr></table>

Note that both functions also return the value of the minimum or maximum coefficient if needed,
as if it was a typical reduction operation.
Both functions also return the value of the minimum or maximum coefficient.

\section TutorialReductionsVisitorsBroadcastingPartialReductions Partial reductions
Partial reductions are reductions that can operate column- or row-wise on a Matrix or
Array, applying the reduction operation on each column or row and
returning a column or row-vector with the corresponding values. Partial reductions are applied
returning a column or row vector with the corresponding values. Partial reductions are applied
with \link DenseBase::colwise() colwise() \endlink or \link DenseBase::rowwise() rowwise() \endlink.

A simple example is obtaining the maximum of the elements
in each column in a given matrix, storing the result in a row-vector:
in each column in a given matrix, storing the result in a row vector:

<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
@ -133,8 +132,7 @@ The same operation can be performed row-wise:
\verbinclude Tutorial_ReductionsVisitorsBroadcasting_rowwise.out
</td></tr></table>

<b>Note that column-wise operations return a 'row-vector' while row-wise operations
return a 'column-vector'</b>
<b>Note that column-wise operations return a row vector, while row-wise operations return a column vector.</b>
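
As a minimal illustration of the returned shapes (a sketch, not one of the shipped snippets):
\code
MatrixXd mat(2,3);
mat << 1, 2, 3,
       4, 5, 6;
RowVectorXd col_sums = mat.colwise().sum();  // [5 7 9]
VectorXd    row_sums = mat.rowwise().sum();  // [6 15]^T
\endcode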

\subsection TutorialReductionsVisitorsBroadcastingPartialReductionsCombined Combining partial reductions with other operations
It is also possible to use the result of a partial reduction to do further processing.
@ -176,7 +174,7 @@ The concept behind broadcasting is similar to partial reductions, with the diffe
constructs an expression where a vector (column or row) is interpreted as a matrix by replicating it in
one direction.

A simple example is to add a certain column-vector to each column in a matrix.
A simple example is to add a certain column vector to each column in a matrix.
This can be accomplished with:

<table class="example">
@ -253,7 +251,7 @@ is a new matrix whose size is the same as matrix <tt>m</tt>: \f[
\f]

- <tt>(m.colwise() - v).colwise().squaredNorm()</tt> is a partial reduction, computing the squared norm column-wise. The result of
this operation is a row-vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
this operation is a row vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
\mbox{(m.colwise() - v).colwise().squaredNorm()} =
\begin{bmatrix}
1 & 505 & 32 & 50
@ -257,7 +257,14 @@ Binary coefficient wise operators can also mix sparse and dense expressions:
\code
sm2 = sm1.cwiseProduct(dm1);
dm2 = sm1 + dm1;
dm2 = dm1 - sm1;
\endcode
Performance-wise, adding/subtracting a sparse and a dense matrix is better performed in two steps. For instance, instead of writing <tt>dm2 = sm1 + dm1</tt>, write:
\code
dm2 = dm1;
dm2 += sm1;
\endcode
This version has the advantage of fully exploiting the higher performance of dense storage (no indirection, SIMD, etc.), and of paying the cost of the slower sparse evaluation only on the few non-zeros of the sparse matrix.

%Sparse expressions also support transposition:
@ -52,7 +52,7 @@ When doing so, a number of Eigen's algorithms are silently substituted with call
These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.

In addition you can coarsely select choose which parts will be substituted by defining one or multiple of the following macros:
In addition you can choose which parts will be substituted by defining one or multiple of the following macros:

<table class="manual">
<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (currently works with Intel MKL only)</td></tr>
doc/snippets/TopicAliasing_mult4.cpp (new file, 5 lines)
@ -0,0 +1,5 @@
MatrixXf A(2,2), B(3,2);
B << 2, 0, 0, 3, 1, 1;
A << 2, 0, 0, -2;
A = (B * A).cwiseAbs();
cout << A;

doc/snippets/TopicAliasing_mult5.cpp (new file, 5 lines)
@ -0,0 +1,5 @@
MatrixXf A(2,2), B(3,2);
B << 2, 0, 0, 3, 1, 1;
A << 2, 0, 0, -2;
A = (B * A).eval().cwiseAbs();
cout << A;
@ -45,12 +45,14 @@ template<> struct adjoint_specific<false> {
|
||||
|
||||
// check null inputs
|
||||
VERIFY_IS_APPROX((v1*0).normalized(), (v1*0));
|
||||
#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE)
|
||||
RealScalar very_small = (std::numeric_limits<RealScalar>::min)();
|
||||
VERIFY( (v1*very_small).norm() == 0 );
|
||||
VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small));
|
||||
v3 = v1*very_small;
|
||||
v3.normalize();
|
||||
VERIFY_IS_APPROX(v3, (v1*very_small));
|
||||
#endif
|
||||
|
||||
// check compatibility of dot and adjoint
|
||||
ref = NumTraits<Scalar>::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm()));
|
||||
|
@ -219,6 +219,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
|
||||
VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
|
||||
#ifdef EIGEN_HAS_C99_MATH
|
||||
VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));
|
||||
VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
|
||||
VERIFY_IS_APPROX(m1.erf(), erf(m1));
|
||||
VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
|
||||
#endif // EIGEN_HAS_C99_MATH
|
||||
@ -309,7 +310,22 @@ template<typename ArrayType> void array_real(const ArrayType& m)
|
||||
s1 += Scalar(tiny);
|
||||
m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
|
||||
VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
|
||||
|
||||
|
||||
// check special functions (comparing against numpy implementation)
|
||||
#ifdef EIGEN_HAS_C99_MATH
|
||||
if (!NumTraits<Scalar>::IsComplex) {
|
||||
VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329));
|
||||
VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645));
|
||||
VERIFY_IS_APPROX(numext::digamma(Scalar(4)), RealScalar(1.2561176684318));
|
||||
VERIFY_IS_APPROX(numext::digamma(Scalar(-10.5)), RealScalar(2.398239129535781));
|
||||
VERIFY_IS_APPROX(numext::digamma(Scalar(10000.5)), RealScalar(9.210340372392849));
|
||||
VERIFY_IS_EQUAL(numext::digamma(Scalar(0)),
|
||||
std::numeric_limits<RealScalar>::infinity());
|
||||
VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)),
|
||||
std::numeric_limits<RealScalar>::infinity());
|
||||
}
|
||||
#endif // EIGEN_HAS_C99_MATH
|
||||
|
||||
// check inplace transpose
|
||||
m3 = m1;
|
||||
m3.transposeInPlace();
|
||||
@ -336,8 +352,6 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
|
||||
|
||||
Array<RealScalar, -1, -1> m3(rows, cols);
|
||||
|
||||
Scalar s1 = internal::random<Scalar>();
|
||||
|
||||
for (Index i = 0; i < m.rows(); ++i)
|
||||
for (Index j = 0; j < m.cols(); ++j)
|
||||
m2(i,j) = sqrt(m1(i,j));
|
||||
@ -410,6 +424,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
|
||||
VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1);
|
||||
|
||||
// scalar by array division
|
||||
Scalar s1 = internal::random<Scalar>();
|
||||
const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
|
||||
s1 += Scalar(tiny);
|
||||
m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
|
||||
|
@ -68,6 +68,16 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
|
||||
const Scalar& ref_a2 = m.array().matrix().coeffRef(0,0);
|
||||
VERIFY(&ref_a1 == &ref_m1);
|
||||
VERIFY(&ref_a2 == &ref_m2);
|
||||
|
||||
// Check write accessors:
|
||||
m1.array().coeffRef(0,0) = 1;
|
||||
VERIFY_IS_APPROX(m1(0,0),Scalar(1));
|
||||
m1.array()(0,0) = 2;
|
||||
VERIFY_IS_APPROX(m1(0,0),Scalar(2));
|
||||
m1.array().matrix().coeffRef(0,0) = 3;
|
||||
VERIFY_IS_APPROX(m1(0,0),Scalar(3));
|
||||
m1.array().matrix()(0,0) = 4;
|
||||
VERIFY_IS_APPROX(m1(0,0),Scalar(4));
|
||||
}
|
||||
|
||||
template<typename MatrixType> void comparisons(const MatrixType& m)
|
||||
|
@ -20,6 +20,8 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
|
||||
MatrixType m1 = MatrixType::Random(rows, cols),
|
||||
m2 = MatrixType::Random(rows, cols);
|
||||
|
||||
Scalar s1 = internal::random<Scalar>();
|
||||
|
||||
//check diagonal()
|
||||
VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal());
|
||||
m2.diagonal() = 2 * m1.diagonal();
|
||||
@ -58,6 +60,11 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
|
||||
VERIFY_IS_APPROX(m2.template diagonal<N2>(), static_cast<Scalar>(2) * m1.diagonal(N2));
|
||||
m2.diagonal(N2)[0] *= 3;
|
||||
VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast<Scalar>(6) * m1.diagonal(N2)[0]);
|
||||
|
||||
m2.diagonal(N2).x() = s1;
|
||||
VERIFY_IS_APPROX(m2.diagonal(N2).x(), s1);
|
||||
m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1;
|
||||
VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
@ -34,4 +34,32 @@ void test_incomplete_cholesky()
|
||||
CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
|
||||
CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
|
||||
CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
|
||||
|
||||
#ifdef EIGEN_TEST_PART_1
|
||||
// regression for bug 1150
|
||||
for(int N = 1; N<20; ++N)
|
||||
{
|
||||
Eigen::MatrixXd b( N, N );
|
||||
b.setOnes();
|
||||
|
||||
Eigen::SparseMatrix<double> m( N, N );
|
||||
m.reserve(Eigen::VectorXi::Constant(N,4));
|
||||
for( int i = 0; i < N; ++i )
|
||||
{
|
||||
m.insert( i, i ) = 1;
|
||||
m.coeffRef( i, i / 2 ) = 2;
|
||||
m.coeffRef( i, i / 3 ) = 2;
|
||||
m.coeffRef( i, i / 4 ) = 2;
|
||||
}
|
||||
|
||||
Eigen::SparseMatrix<double> A;
|
||||
A = m * m.transpose();
|
||||
|
||||
Eigen::ConjugateGradient<Eigen::SparseMatrix<double>,
|
||||
Eigen::Lower | Eigen::Upper,
|
||||
Eigen::IncompleteCholesky<double> > solver( A );
|
||||
VERIFY(solver.preconditioner().info() == Eigen::Success);
|
||||
VERIFY(solver.info() == Eigen::Success);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -44,6 +44,7 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
|
||||
Mat_d md = mf.template cast<double>();
|
||||
Mat_cf mcf = Mat_cf::Random(size,size);
|
||||
Mat_cd mcd = mcf.template cast<complex<double> >();
|
||||
Mat_cd rcd = mcd;
|
||||
Vec_f vf = Vec_f::Random(size,1);
|
||||
Vec_d vd = vf.template cast<double>();
|
||||
Vec_cf vcf = Vec_cf::Random(size,1);
|
||||
@ -103,24 +104,23 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
|
||||
VERIFY_IS_APPROX(mcd.array() *= md.array(), mcd2.array() *= md.array().template cast<std::complex<double> >());
|
||||
|
||||
// check matrix-matrix products
|
||||
|
||||
VERIFY_IS_APPROX(sd*md*mcd, (sd*md).template cast<CD>().eval()*mcd);
|
||||
VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast<CD>());
|
||||
VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast<CD>().eval()*mcd);
|
||||
VERIFY_IS_APPROX(scd*mcd*md, scd*mcd*md.template cast<CD>());
|
||||
|
||||
|
||||
VERIFY_IS_APPROX(sf*mf*mcf, sf*mf.template cast<CF>()*mcf);
|
||||
VERIFY_IS_APPROX(sf*mcf*mf, sf*mcf*mf.template cast<CF>());
|
||||
VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast<CF>()*mcf);
|
||||
VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast<CF>());
|
||||
|
||||
|
||||
VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast<CD>().eval().adjoint()*mcd);
|
||||
VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast<CD>());
|
||||
VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast<CD>().eval().adjoint()*mcd.adjoint());
|
||||
VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast<CD>().adjoint());
|
||||
VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast<CD>().eval()*mcd.adjoint());
|
||||
VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast<CD>().adjoint());
|
||||
|
||||
|
||||
VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast<CF>().eval().adjoint()*mcf);
|
||||
VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast<CF>());
|
||||
VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast<CF>().eval().adjoint()*mcf.adjoint());
|
||||
@ -147,6 +147,39 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)

VERIFY_IS_APPROX(scd*vcd.adjoint()*md, scd*vcd.adjoint()*md.template cast<CD>().eval());
VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast<CD>().eval()*mcd);
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);

VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());

// Not supported yet: trmm
// VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(), sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());
// VERIFY_IS_APPROX(scd*mcd*md.template triangularView<Upper>(), scd*mcd*md.template cast<CD>().eval().template triangularView<Upper>());
// VERIFY_IS_APPROX(sd*md*mcd.template triangularView<Lower>(), sd*md.template cast<CD>().eval()*mcd.template triangularView<Lower>());
// VERIFY_IS_APPROX(scd*md*mcd.template triangularView<Upper>(), scd*md.template cast<CD>().eval()*mcd.template triangularView<Upper>());

// Not supported yet: symv
// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Lower>());
// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Lower>());
// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());

// Not supported yet: symm
// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Upper>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Upper>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());

rcd.setZero();
VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * mcd * md),
Mat_cd((sd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * md * mcd),
Mat_cd((sd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * mcd * md),
Mat_cd((scd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * md * mcd),
Mat_cd((scd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
}
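For illustration (a standalone sketch, not part of this commit's diff), the pattern used by the reference side of the checks above: the real operand is promoted with an explicit cast before being combined with a complex matrix, while the left-hand side relies on the mixed-scalar product these tests exercise.

#include <Eigen/Dense>
#include <complex>

int main() {
  Eigen::MatrixXd  d = Eigen::MatrixXd::Random(3, 3);   // real operand
  Eigen::MatrixXcd c = Eigen::MatrixXcd::Random(3, 3);  // complex operand
  // Reference: promote the real matrix explicitly, then multiply.
  Eigen::MatrixXcd ref = d.cast<std::complex<double> >() * c;
  // Mixed-scalar product, assumed supported as in the checks above.
  Eigen::MatrixXcd mixed = d * c;
  return (ref - mixed).norm() < 1e-12 ? 0 : 1;
}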
void test_mixingtypes()

@ -78,14 +78,15 @@ template<typename MatrixType> void nomalloc(const MatrixType& m)

VERIFY_IS_APPROX(m2,m2);

m2.template selfadjointView<Lower>().rankUpdate(m1.col(0),-1);
m2.template selfadjointView<Lower>().rankUpdate(m1.row(0),-1);
m2.template selfadjointView<Upper>().rankUpdate(m1.row(0),-1);
m2.template selfadjointView<Lower>().rankUpdate(m1.col(0), m1.col(0)); // rank-2

// The following fancy matrix-matrix products are not safe yet regarding static allocation
// m1 += m1.template triangularView<Upper>() * m2.col(;
// m1.template selfadjointView<Lower>().rankUpdate(m2);
// m1 += m1.template triangularView<Upper>() * m2;
// m1 += m1.template selfadjointView<Lower>() * m2;
// VERIFY_IS_APPROX(m1,m1);
m2.template selfadjointView<Lower>().rankUpdate(m1);
m2 += m2.template triangularView<Upper>() * m1;
m2.template triangularView<Upper>() = m2 * m2;
m1 += m1.template selfadjointView<Lower>() * m2;
VERIFY_IS_APPROX(m2,m2);
}
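As context for the static-allocation concern mentioned above, a minimal sketch of the mechanism this kind of test relies on: with EIGEN_RUNTIME_NO_MALLOC defined before including Eigen, heap allocations can be forbidden at runtime. Which expressions are malloc-free is an assumption here; fixed-size products such as the one below are generally expected to qualify.

#define EIGEN_RUNTIME_NO_MALLOC   // must be defined before including Eigen
#include <Eigen/Dense>

int main() {
  Eigen::Matrix4d a = Eigen::Matrix4d::Random();
  Eigen::Matrix4d b = Eigen::Matrix4d::Random();
  Eigen::Matrix4d c;
  Eigen::internal::set_is_malloc_allowed(false);  // any heap allocation now triggers an assert
  c.noalias() = a * b;                            // fixed-size product: expected to be malloc-free
  Eigen::internal::set_is_malloc_allowed(true);
  return 0;
}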
template<typename Scalar>

@ -48,30 +48,32 @@ void testVectorType(const VectorType& base)

VectorType m(base);
m.setLinSpaced(size,low,high);

if(!NumTraits<Scalar>::IsInteger)
{
VectorType n(size);
for (int i=0; i<size; ++i)
n(i) = low+i*step;
VERIFY_IS_APPROX(m,n);
}

VectorType n(size);
for (int i=0; i<size; ++i)
n(i) = low+i*step;
n(i) = size==1 ? low : (low + ((high-low)*Scalar(i))/(size-1));
VERIFY_IS_APPROX(m,n);

// random access version
m = VectorType::LinSpaced(size,low,high);
VERIFY_IS_APPROX(m,n);

// Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<Scalar>::epsilon() );

// These guys sometimes fail! This is not good. Any ideas how to fix them!?
//VERIFY( m(m.size()-1) == high );
//VERIFY( m(0) == low );
VERIFY( internal::isApprox(m(m.size()-1),high) );
VERIFY( size==1 || internal::isApprox(m(0),low) );

// sequential access version
m = VectorType::LinSpaced(Sequential,size,low,high);
VERIFY_IS_APPROX(m,n);

// These guys sometimes fail! This is not good. Any ideas how to fix them!?
//VERIFY( m(m.size()-1) == high );
//VERIFY( m(0) == low );
VERIFY( internal::isApprox(m(m.size()-1),high) );
VERIFY( size==1 || internal::isApprox(m(0),low) );

// check whether everything works with row and col major vectors
Matrix<Scalar,Dynamic,1> row_vector(size);
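The reference formula used in the checks above, n(i) = low + (high-low)*i/(size-1) with n(0) = low when size is 1, matches the documented behaviour of LinSpaced; a small self-contained example:

#include <Eigen/Dense>
#include <iostream>

int main() {
  // Five equally spaced values in [0, 1]: 0, 0.25, 0.5, 0.75, 1.
  Eigen::VectorXd x = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);
  std::cout << x.transpose() << std::endl;
  return 0;
}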
@ -126,5 +128,13 @@ void test_nullary()

CALL_SUBTEST_8( testVectorType(Vector4f()) );
CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );

CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,300))) );
CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
}

#ifdef EIGEN_TEST_PART_6
// Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
#endif
}

@ -192,6 +192,11 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re

VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3));
// VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4);

VERIFY_IS_APPROX(refM4 + m3, refM4 + refM3);
VERIFY_IS_APPROX(m3 + refM4, refM3 + refM4);
VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3);
VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4);

// test aliasing
VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));
VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval()));
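A minimal sketch of the mixed sparse/dense sums and differences exercised above (assuming, as in these checks, that the result is assigned to a dense matrix):

#include <Eigen/Sparse>
#include <Eigen/Dense>

int main() {
  Eigen::SparseMatrix<double> s(3, 3);
  s.insert(0, 0) = 1.0;
  Eigen::MatrixXd d = Eigen::MatrixXd::Ones(3, 3);
  Eigen::MatrixXd sum1 = d + s;   // dense + sparse
  Eigen::MatrixXd sum2 = s + d;   // sparse + dense
  Eigen::MatrixXd diff = d - s;   // dense - sparse
  return 0;
}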
@ -455,6 +460,33 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re

refMat1.setIdentity();
VERIFY_IS_APPROX(m1, refMat1);
}

// test array/vector of InnerIterator
{
typedef typename SparseMatrixType::InnerIterator IteratorType;

DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
SparseMatrixType m2(rows, cols);
initSparse<Scalar>(density, refMat2, m2);
IteratorType static_array[2];
static_array[0] = IteratorType(m2,0);
static_array[1] = IteratorType(m2,m2.outerSize()-1);
VERIFY( static_array[0] || m2.innerVector(static_array[0].outer()).nonZeros() == 0 );
VERIFY( static_array[1] || m2.innerVector(static_array[1].outer()).nonZeros() == 0 );
if(static_array[0] && static_array[1])
{
++(static_array[1]);
static_array[1] = IteratorType(m2,0);
VERIFY( static_array[1] );
VERIFY( static_array[1].index() == static_array[0].index() );
VERIFY( static_array[1].outer() == static_array[0].outer() );
VERIFY( static_array[1].value() == static_array[0].value() );
}

std::vector<IteratorType> iters(2);
iters[0] = IteratorType(m2,0);
iters[1] = IteratorType(m2,m2.outerSize()-1);
}
}
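For reference, the usual way an InnerIterator is consumed; the test above additionally checks that iterators are default-constructible and assignable so they can be stored in arrays and std::vector. A minimal sketch:

#include <Eigen/Sparse>

// Sum the entries of one inner vector (a column, for a column-major matrix).
double innerSum(const Eigen::SparseMatrix<double>& m, int j) {
  double s = 0;
  for (Eigen::SparseMatrix<double>::InnerIterator it(m, j); it; ++it)
    s += it.value();   // it.row(), it.col() and it.index() are also available
  return s;
}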
@ -9,14 +9,14 @@

#include "sparse.h"

template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int cols)
{
double densityMat = (std::max)(8./(rows*cols), 0.01);
double densityVec = (std::max)(8./float(rows), 0.1);
typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
typedef Matrix<Scalar,Dynamic,1> DenseVector;
typedef SparseVector<Scalar,0,Index> SparseVectorType;
typedef SparseMatrix<Scalar,0,Index> SparseMatrixType;
typedef SparseVector<Scalar,0,StorageIndex> SparseVectorType;
typedef SparseMatrix<Scalar,0,StorageIndex> SparseMatrixType;
Scalar eps = 1e-6;

SparseMatrixType m1(rows,rows);
@ -87,8 +87,10 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)

VERIFY_IS_APPROX(m1*v2, refM1*refV2);
VERIFY_IS_APPROX(v1.dot(m1*v2), refV1.dot(refM1*refV2));
int i = internal::random<int>(0,rows-1);
VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i)));
{
int i = internal::random<int>(0,rows-1);
VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i)));
}

VERIFY_IS_APPROX(v1.squaredNorm(), refV1.squaredNorm());

@ -111,15 +113,51 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)

VERIFY_IS_APPROX(refV3 = v1.transpose(),v1.toDense());
VERIFY_IS_APPROX(DenseVector(v1),v1.toDense());

// test conservative resize
{
std::vector<StorageIndex> inc;
if(rows > 3)
inc.push_back(-3);
inc.push_back(0);
inc.push_back(3);
inc.push_back(1);
inc.push_back(10);

for(std::size_t i = 0; i< inc.size(); i++) {
StorageIndex incRows = inc[i];
SparseVectorType vec1(rows);
DenseVector refVec1 = DenseVector::Zero(rows);
initSparse<Scalar>(densityVec, refVec1, vec1);

vec1.conservativeResize(rows+incRows);
refVec1.conservativeResize(rows+incRows);
if (incRows > 0) refVec1.tail(incRows).setZero();

VERIFY_IS_APPROX(vec1, refVec1);

// Insert new values
if (incRows > 0)
vec1.insert(vec1.rows()-1) = refVec1(refVec1.rows()-1) = 1;

VERIFY_IS_APPROX(vec1, refVec1);
}
}
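A minimal usage sketch of the behaviour covered by this block, assuming SparseVector::conservativeResize as exercised above (existing nonzeros are kept, a grown tail starts out empty):

#include <Eigen/Sparse>

int main() {
  Eigen::SparseVector<double> v(5);
  v.insert(2) = 1.0;
  v.conservativeResize(8);   // keeps the entry at index 2, indices 5..7 are implicit zeros
  v.insert(7) = 2.0;         // the grown part can receive new entries
  v.conservativeResize(3);   // shrinking drops entries past the new size
  return 0;
}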
}

void test_sparse_vector()
{
for(int i = 0; i < g_repeat; i++) {
int r = Eigen::internal::random<int>(1,500), c = Eigen::internal::random<int>(1,500);
if(Eigen::internal::random<int>(0,4) == 0) {
r = c; // check square matrices in 25% of tries
}
EIGEN_UNUSED_VARIABLE(r+c);

CALL_SUBTEST_1(( sparse_vector<double,int>(8, 8) ));
CALL_SUBTEST_2(( sparse_vector<std::complex<double>, int>(16, 16) ));
CALL_SUBTEST_1(( sparse_vector<double,long int>(299, 535) ));
CALL_SUBTEST_1(( sparse_vector<double,short>(299, 535) ));
CALL_SUBTEST_2(( sparse_vector<std::complex<double>, int>(r, c) ));
CALL_SUBTEST_1(( sparse_vector<double,long int>(r, c) ));
CALL_SUBTEST_1(( sparse_vector<double,short>(r, c) ));
}
}
@ -163,6 +163,21 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)

VERIFY(!(numext::isfinite)(v.blueNorm())); VERIFY((numext::isnan)(v.blueNorm()));
VERIFY(!(numext::isfinite)(v.hypotNorm())); VERIFY((numext::isnan)(v.hypotNorm()));
}

// stableNormalize[d]
{
VERIFY_IS_APPROX(vrand.stableNormalized(), vrand.normalized());
MatrixType vcopy(vrand);
vcopy.stableNormalize();
VERIFY_IS_APPROX(vcopy, vrand.normalized());
VERIFY_IS_APPROX((vrand.stableNormalized()).norm(), RealScalar(1));
VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1));
VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1));
VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1));
RealScalar big_scaling = ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
VERIFY_IS_APPROX(vbig/big_scaling, (vbig.stableNorm() * vbig.stableNormalized()).eval()/big_scaling);
VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized());
}
}
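For context, a small sketch of why the rescaling variants matter (stableNormalized()/stableNormalize() are assumed available as tested above): with coefficients near the overflow threshold, norm() overflows while stableNorm() does not.

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Vector3d v = Eigen::Vector3d::Constant(1e300);
  std::cout << v.norm() << std::endl;                      // inf: the sum of squares overflows
  std::cout << v.stableNorm() << std::endl;                // ~1.732e300
  std::cout << v.stableNormalized().norm() << std::endl;   // ~1
  return 0;
}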
void test_stable_norm()

@ -210,6 +210,9 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)

VERIFY_IS_APPROX(m1.cwiseAbs().colwise().maxCoeff(), m1.colwise().template lpNorm<Infinity>());
VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().maxCoeff(), m1.rowwise().template lpNorm<Infinity>());

// regression for bug 1158
VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum().x(), m1.col(0).cwiseAbs().sum());

// test normalized
m2 = m1.colwise().normalized();
VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized());
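A short illustration of the partial reductions involved in the checks above: a column-wise reduction yields a row vector with one coefficient per column, and .x() picks the entry for the first column.

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d m = Eigen::Matrix3d::Random();
  Eigen::RowVector3d colAbsSums = m.cwiseAbs().colwise().sum();
  Eigen::RowVector3d colInfNorm = m.colwise().lpNorm<Eigen::Infinity>();
  std::cout << colAbsSums.x() << " == " << m.col(0).cwiseAbs().sum() << std::endl;
  std::cout << colInfNorm << std::endl;
  return 0;
}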
@ -25,6 +25,7 @@ template<typename MatrixType> void zeroReduction(const MatrixType& m) {

template<typename MatrixType> void zeroSizedMatrix()
{
MatrixType t1;
typedef typename MatrixType::Scalar Scalar;

if (MatrixType::SizeAtCompileTime == Dynamic || MatrixType::SizeAtCompileTime == 0)
{

@ -45,6 +46,23 @@ template<typename MatrixType> void zeroSizedMatrix()

VERIFY(t1==t2);
}
}

if(MatrixType::MaxColsAtCompileTime!=0 && MatrixType::MaxRowsAtCompileTime!=0)
{
Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : MatrixType::RowsAtCompileTime;
Index cols = MatrixType::ColsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : MatrixType::ColsAtCompileTime;
MatrixType m(rows,cols);
zeroReduction(m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols));
zeroReduction(m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0));
zeroReduction(m.template block<0,1>(0,0));
zeroReduction(m.template block<1,0>(0,0));
Matrix<Scalar,Dynamic,Dynamic> prod = m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0) * m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols);
VERIFY(prod.rows()==rows && prod.cols()==cols);
VERIFY(prod.isZero());
prod = m.template block<1,0>(0,0) * m.template block<0,1>(0,0);
VERIFY(prod.size()==1);
VERIFY(prod.isZero());
}
}
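The block products above rely on the convention that a product over an empty (zero-length) inner dimension is a matrix of zeros; a minimal standalone sketch:

#include <Eigen/Dense>
#include <cassert>

int main() {
  Eigen::MatrixXd a(3, 0), b(0, 4);   // 3x0 and 0x4: no coefficients at all
  Eigen::MatrixXd p = a * b;          // each coefficient is an empty sum
  assert(p.rows() == 3 && p.cols() == 4);
  assert(p.isZero());
  return 0;
}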
template<typename VectorType> void zeroSizedVector()

@ -188,7 +188,7 @@ template<typename _Scalar> class AlignedVector3

}

template<typename Derived>
inline bool isApprox(const MatrixBase<Derived>& other, RealScalar eps=NumTraits<Scalar>::dummy_precision()) const
inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
{
return m_coeffs.template head<3>().isApprox(other,eps);
}
@ -25,6 +25,16 @@ template <typename T, size_t n> class array {

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& front() { return values[0]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& front() const { return values[0]; }

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }

EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
static std::size_t size() { return n; }
@ -123,13 +133,33 @@ template <typename T> class array<T, 0> {

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& operator[] (size_t) {
eigen_assert(false && "Can't index a zero size array");
return *static_cast<T*>(NULL);
return dummy;
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
eigen_assert(false && "Can't index a zero size array");
return *static_cast<const T*>(NULL);
return dummy;
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& front() {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& front() const {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& back() {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& back() const {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}

static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }

@ -142,6 +172,9 @@ template <typename T> class array<T, 0> {

eigen_assert(l.size() == 0);
}
#endif

private:
T dummy;
};
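The replacement of the null-pointer dereference with a dummy member follows a common pattern for zero-sized containers: returning a reference bound to a real object keeps the code well-defined (and compilable as device code) even though the path is guarded by an assert. A generic sketch of the pattern, independent of Eigen:

#include <cassert>
#include <cstddef>

template <typename T>
struct EmptyArray {
  T& operator[](std::size_t) {
    assert(false && "Can't index a zero size array");
    return dummy;               // well-defined reference, unlike *static_cast<T*>(NULL)
  }
 private:
  T dummy;                      // storage whose only purpose is to back the returned reference
};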
namespace internal {

@ -128,6 +128,12 @@ class TensorBase<Derived, ReadOnlyAccessors>

return unaryExpr(internal::scalar_lgamma_op<Scalar>());
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived>
digamma() const {
return unaryExpr(internal::scalar_digamma_op<Scalar>());
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived>
erf() const {
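A small usage sketch, assuming the unsupported CXX11 Tensor module from this tree with the element-wise special functions declared above (lgamma, the digamma added here, erf):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> t(4);
  t.setValues({1.f, 2.f, 3.f, 4.f});
  Eigen::Tensor<float, 1> lg = t.lgamma();    // element-wise log-gamma
  Eigen::Tensor<float, 1> dg = t.digamma();   // element-wise digamma, added by this hunk
  return 0;
}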
@ -378,7 +378,7 @@ struct TensorContractionEvaluatorBase

}

template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalGemv(Scalar* buffer) const {
EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;

@ -516,7 +516,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT

Base(op, device) { }

template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) {
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
return;

@ -582,10 +582,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT

OutputMapper output(buffer, m);

typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType;

// Sizes of the blocks to load in cache. See the Goto paper for details.
BlockingType blocking(m, n, k, 1, true);
internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());

@ -28,7 +28,7 @@ class TensorContractionBlocking {

typedef typename LhsMapper::Scalar LhsScalar;
typedef typename RhsMapper::Scalar RhsScalar;

TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {

@ -41,9 +41,9 @@ class TensorContractionBlocking {

}
}

EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }

private:
Index kc_;
@ -426,15 +426,16 @@ class TensorContractionSubMapper {

};

template<typename Scalar, typename Index, int side,
template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
: public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
: public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {

public:
typedef Scalar_ Scalar;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;

@ -176,10 +176,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT

// compute block sizes (which depend on number of threads)
const Index num_threads = this->m_device.numThreads();
Index mc = m;
Index nc = n;
Index kc = k;
internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads);
internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
Index mc = blocking.mc();
Index nc = blocking.nc();
Index kc = blocking.kc();
eigen_assert(mc <= m);
eigen_assert(nc <= n);
eigen_assert(kc <= k);
@ -21,7 +21,7 @@ namespace Eigen {

*/
namespace internal {

template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
public:
IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,

@ -123,7 +123,7 @@ class IndexMapper {

}
inputIndex += p * m_inputStrides[NumKernelDims];
} else {
int limit = 0;
std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}

@ -147,7 +147,7 @@ class IndexMapper {

}
outputIndex += p * m_outputStrides[NumKernelDims];
} else {
int limit = 0;
std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}

@ -206,7 +206,7 @@ class IndexMapper {

}

private:
static const size_t NumDims = internal::array_size<InputDims>::value;
static const int NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_cudaInputStrides;
@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {

struct GpuDevice {
// The StreamInterface is not owned: the caller is
// responsible for its initialization and eventual destruction.
explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
eigen_assert(stream);
}
explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
eigen_assert(stream);
}

// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
return stream_->stream();

@ -246,6 +248,10 @@ struct GpuDevice {

#endif
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
return max_blocks_;
}

// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {

@ -259,7 +265,7 @@ struct GpuDevice {

private:
const StreamInterface* stream_;

int max_blocks_;
};

#ifndef __CUDA_ARCH__
@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>

}

template<int LoadMode>
EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}

@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(

if (needs_assign)
{
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
const Index size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash if we're called with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);

@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c

if (needs_assign)
{
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
const Index size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash if we're called with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
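The launch configuration computed above reduces to a clamped ceiling division; a tiny standalone sketch of that arithmetic (names are hypothetical, not Eigen API):

#include <algorithm>

// ceil(size / block_size), clamped to [1, device_limit].
int plannedBlocks(int size, int block_size, int device_limit) {
  const int needed = (size + block_size - 1) / block_size;
  return std::max(std::min(device_limit, needed), 1);
}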
@ -106,7 +106,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>

EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
m_impl.evalSubExprsIfNeeded(NULL);
const Index numValues = m_impl.dimensions().TotalSize();
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.

@ -119,7 +118,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>

EvalTo evalToTmp(m_buffer, m_op);
const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
m_impl.cleanup();
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
Some files were not shown because too many files have changed in this diff.