Pulled latest updates from trunk

Benoit Steiner 2015-04-01 23:24:11 -07:00
commit 74e558cfa8
52 changed files with 1088 additions and 627 deletions

View File

@ -17,7 +17,7 @@
*
* These iterative solvers are associated with some preconditioners:
* - IdentityPreconditioner - not really useful
* - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
* - DiagonalPreconditioner - also called Jacobi preconditioner, works well on diagonally dominant matrices.
* - IncompleteLUT - incomplete LU factorization with dual thresholding
*
* Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
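For context, a minimal sketch of how these solver/preconditioner pairs are combined in practice. The solver and preconditioner names are the module's own; the 3x3 diagonally dominant system is made up for illustration:

#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <iostream>

int main()
{
  // Build a small symmetric, diagonally dominant system A*x = b.
  Eigen::SparseMatrix<double> A(3,3);
  A.insert(0,0) = 4; A.insert(0,1) = 1;
  A.insert(1,0) = 1; A.insert(1,1) = 4; A.insert(1,2) = 1;
  A.insert(2,1) = 1; A.insert(2,2) = 4;
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(3);

  // The Jacobi (diagonal) preconditioner suits diagonally dominant matrices.
  Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower|Eigen::Upper,
                           Eigen::DiagonalPreconditioner<double> > cg;
  cg.compute(A);
  Eigen::VectorXd x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations() << ", error: " << cg.error() << "\n";
  return 0;
}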

View File

@ -300,9 +300,10 @@ template<typename Derived>
bool DenseBase<Derived>::isApproxToConstant
(const Scalar& val, const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
for(Index i = 0; i < rows(); ++i)
if(!internal::isApprox(this->coeff(i, j), val, prec))
if(!internal::isApprox(self.coeff(i, j), val, prec))
return false;
return true;
}
@ -484,9 +485,10 @@ DenseBase<Derived>::Zero()
template<typename Derived>
bool DenseBase<Derived>::isZero(const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
for(Index i = 0; i < rows(); ++i)
if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec))
return false;
return true;
}
@ -719,18 +721,19 @@ template<typename Derived>
bool MatrixBase<Derived>::isIdentity
(const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
{
for(Index i = 0; i < rows(); ++i)
{
if(i == j)
{
if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec))
return false;
}
else
{
if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec))
return false;
}
}
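These internals back the public fuzzy-comparison API; the new nested_eval temporary makes sure an expression argument (a sum, a product, ...) is evaluated once up front instead of once per coefficient. A small usage sketch:

#include <Eigen/Dense>
#include <cassert>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Identity(4,4);
  assert(m.isIdentity());
  // The (m - m) expression is materialized once by nested_eval,
  // then scanned coefficient by coefficient.
  assert((m - m).isZero());
  assert(Eigen::MatrixXd::Constant(3,3, 2.5).isApproxToConstant(2.5));
  return 0;
}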

View File

@ -35,22 +35,22 @@ void check_static_allocation_size()
}
template<typename T, int Size, typename Packet = typename packet_traits<T>::type,
bool Match = bool((Size%unpacket_traits<Packet>::size)==0),
bool TryHalf = bool(int(unpacket_traits<Packet>::size) > Size)
bool Match = bool((Size%unpacket_traits<Packet>::size)==0),
bool TryHalf = bool(int(unpacket_traits<Packet>::size) > 1)
&& bool(int(unpacket_traits<Packet>::size) > int(unpacket_traits<typename unpacket_traits<Packet>::half>::size)) >
struct compute_default_alignment
{
enum { value = 0 };
};
template<typename T, int Size, typename Packet>
struct compute_default_alignment<T, Size, Packet, true, false> // Match
template<typename T, int Size, typename Packet, bool TryHalf>
struct compute_default_alignment<T, Size, Packet, true, TryHalf> // Match
{
enum { value = sizeof(T) * unpacket_traits<Packet>::size };
};
template<typename T, int Size, typename Packet>
struct compute_default_alignment<T, Size, Packet, false, true>
struct compute_default_alignment<T, Size, Packet, false, true> // Try-half
{
// current packet too large, try with an half-packet
enum { value = compute_default_alignment<T, Size, typename unpacket_traits<Packet>::half>::value };
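A standalone analogue of the fixed recursion, with plain integer packet sizes standing in for Eigen's unpacket_traits (assumption: the half packet is simply PacketSize/2). The corrected TryHalf condition (packet size > 1 instead of > Size) lets the fallback terminate at scalar granularity:

// Sketch only; not Eigen's actual traits machinery.
template<int Size, int PacketSize,
         bool Match   = ((Size % PacketSize) == 0),
         bool TryHalf = (PacketSize > 1)>
struct alignment_sketch { enum { value = 0 }; };

template<int Size, int PacketSize, bool TryHalf>
struct alignment_sketch<Size, PacketSize, true, TryHalf>   // Match
{ enum { value = PacketSize }; };

template<int Size, int PacketSize>
struct alignment_sketch<Size, PacketSize, false, true>     // Try-half
{ enum { value = alignment_sketch<Size, PacketSize/2>::value }; };

// alignment_sketch<6,4>::value == 2: 4 does not divide 6, but its half does.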

View File

@ -224,13 +224,13 @@ bool MatrixBase<Derived>::isOrthogonal
template<typename Derived>
bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
{
typename Derived::Nested nested(derived());
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index i = 0; i < cols(); ++i)
{
if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
return false;
for(Index j = 0; j < i; ++j)
if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec))
return false;
}
return true;
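As with the fuzzy tests above, isUnitary now materializes its expression once via nested_eval before checking the columns for unit norm and pairwise orthogonality. A usage sketch (the Q factor of a Householder QR is orthogonal by construction):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(4,4);
  Eigen::MatrixXd Q = Eigen::HouseholderQR<Eigen::MatrixXd>(m).householderQ();
  std::cout << std::boolalpha << Q.isUnitary() << "\n";  // true, up to precision
  return 0;
}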

View File

@ -328,6 +328,7 @@ struct hypot_impl
p = _y;
qp = _x / p;
}
if(p==RealScalar(0)) return RealScalar(0);
return p * sqrt(RealScalar(1) + qp*qp);
}
};
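The added guard matters when both inputs are zero: p is then 0 and qp was computed as 0/0 = NaN, so without the early return the result would be 0*sqrt(NaN) = NaN instead of 0. A minimal standalone sketch of the same scaled algorithm:

#include <cmath>
#include <cassert>

// Divide by the larger magnitude so qp*qp cannot overflow, and guard
// p == 0 so that hypot(0,0) is 0, not NaN.
double hypot_sketch(double x, double y)
{
  double ax = std::abs(x), ay = std::abs(y);
  double p  = (ax > ay) ? ax : ay;
  double q  = (ax > ay) ? ay : ax;
  if (p == 0.0) return 0.0;   // the guard added by this patch
  double qp = q / p;          // qp <= 1 by construction
  return p * std::sqrt(1.0 + qp*qp);
}

int main() { assert(hypot_sketch(0.0, 0.0) == 0.0); return 0; }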

View File

@ -409,7 +409,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
CoeffReadCost = (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
: (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
: InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,
@ -484,7 +485,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
{
PacketScalar res;
typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
Unroll ? InnerSize-1 : Dynamic,
Unroll ? InnerSize : Dynamic,
LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
@ -527,7 +528,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res);
}
};
@ -537,12 +538,12 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
@ -551,7 +552,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
@ -559,14 +560,31 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(0);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(0);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
eigen_assert(innerDim>0 && "you are using a non initialized matrix");
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(Index i = 1; i < innerDim; ++i)
res = pset1<Packet>(0);
for(Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
@ -576,9 +594,8 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
eigen_assert(innerDim>0 && "you are using a non initialized matrix");
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
for(Index i = 1; i < innerDim; ++i)
res = pset1<Packet>(0);
for(Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
};
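The convention change is easiest to see in a scalar analogue: the unrolling parameter now counts the remaining terms rather than naming the last index, which gives a natural base case for an empty (inner-size-0) product, and the Dynamic paths start from a zero accumulator for the same reason. A sketch, not the actual Eigen kernel:

// N counts remaining terms; N == 0 means "empty product", which is 0.
template<int N>
struct dot_unrolled
{
  template<typename T>
  static T run(const T* a, const T* b)
  { return dot_unrolled<N-1>::run(a, b) + a[N-1] * b[N-1]; }
};

template<>
struct dot_unrolled<0>
{
  template<typename T>
  static T run(const T*, const T*) { return T(0); }
};

// dot_unrolled<3>::run(a, b) == a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
// dot_unrolled<0>::run(a, b) == 0  (previously this case hit an assert)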
@ -678,8 +695,7 @@ public:
//_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit
//(int(MatrixFlags)&int(DiagFlags)&AlignedBit),
Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0)
};
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)

View File

@ -200,17 +200,82 @@ DenseBase<Derived>::reverse() const
* In most cases it is probably better to simply use the reversed expression
* of a matrix. However, when reversing the matrix data itself is really needed,
* then this "in-place" version is probably the right choice because it provides
* the following additional features:
* the following additional benefits:
* - less error prone: doing the same operation with .reverse() requires special care:
* \code m = m.reverse().eval(); \endcode
* - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
* - this API enables reverse operations without the need for a temporary
* - it allows future optimizations (cache friendliness, etc.)
*
* \sa reverse() */
* \sa VectorwiseOp::reverseInPlace(), reverse() */
template<typename Derived>
inline void DenseBase<Derived>::reverseInPlace()
{
derived() = derived().reverse().eval();
if(cols()>rows())
{
Index half = cols()/2;
leftCols(half).swap(rightCols(half).reverse());
if((cols()%2)==1)
{
Index half2 = rows()/2;
col(half).head(half2).swap(col(half).tail(half2).reverse());
}
}
else
{
Index half = rows()/2;
topRows(half).swap(bottomRows(half).reverse());
if((rows()%2)==1)
{
Index half2 = cols()/2;
row(half).head(half2).swap(row(half).tail(half2).reverse());
}
}
}
namespace internal {
template<int Direction>
struct vectorwise_reverse_inplace_impl;
template<>
struct vectorwise_reverse_inplace_impl<Vertical>
{
template<typename ExpressionType>
static void run(ExpressionType &xpr)
{
Index half = xpr.rows()/2;
xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
}
};
template<>
struct vectorwise_reverse_inplace_impl<Horizontal>
{
template<typename ExpressionType>
static void run(ExpressionType &xpr)
{
Index half = xpr.cols()/2;
xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
}
};
} // end namespace internal
/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this.
*
* In most cases it is probably better to simply use the reversed expression
* of a matrix. However, when reversing the matrix data itself is really needed,
* then this "in-place" version is probably the right choice because it provides
* the following additional benefits:
* - less error prone: doing the same operation with .reverse() requires special care:
* \code m = m.reverse().eval(); \endcode
* - this API enables reverse operations without the need for a temporary
*
* \sa DenseBase::reverseInPlace(), reverse() */
template<typename ExpressionType, int Direction>
void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
{
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
}
} // end namespace Eigen
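A usage sketch of the in-place reverses touched here (the colwise/rowwise variants are the ones added by this commit):

#include <Eigen/Dense>
#include <cassert>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(5,4), ref = m;

  m.reverseInPlace();                    // whole-matrix reverse, via swaps
  assert(m.isApprox(ref.reverse().eval()));

  m = ref;
  m.colwise().reverseInPlace();          // reverse each column in place
  assert(m.isApprox(ref.colwise().reverse().eval()));

  m = ref;
  m.rowwise().reverseInPlace();          // reverse each row in place
  assert(m.isApprox(ref.rowwise().reverse().eval()));
  return 0;
}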

View File

@ -38,13 +38,17 @@ public:
template<int StoreMode, int LoadMode>
void assignPacket(Index row, Index col)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
PacketScalar tmp = m_src.template packet<LoadMode>(row,col);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode>(row,col));
m_dst.template writePacket<StoreMode>(row,col,tmp);
}
template<int StoreMode, int LoadMode>
void assignPacket(Index index)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
PacketScalar tmp = m_src.template packet<LoadMode>(index);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode>(index));
m_dst.template writePacket<StoreMode>(index,tmp);
}
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
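The swapPacket functor hook (removed in the functor diff below) is replaced by spelling the swap out in the kernel itself: load the source packet, write the destination packet into the source, then write the saved packet into the destination. Schematically:

// Three-step swap, one SIMD packet at a time (schematic, not the real kernel).
template<typename Packet>
inline void swap_packets(Packet& dst, Packet& src)
{
  Packet tmp = src;
  src = dst;
  dst = tmp;
}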

View File

@ -562,6 +562,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
void normalize() {
m_matrix = this->normalized();
}
inline void reverseInPlace();
/////////// Geometry module ///////////

View File

@ -150,14 +150,6 @@ template<typename Scalar> struct swap_assign_op {
swap(a,const_cast<Scalar&>(b));
#endif
}
template<int LhsAlignment, int RhsAlignment, typename Packet>
EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const
{
Packet tmp = internal::ploadt<Packet,RhsAlignment>(b);
internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a));
internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp);
}
};
template<typename Scalar>
struct functor_traits<swap_assign_op<Scalar> > {

View File

@ -249,10 +249,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
actual_lm = l2;
max_mc = 576;
}
Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
if (mc > Traits::mr) mc -= mc % Traits::mr;
else if (mc==0) return;
m = (m%mc)==0 ? mc
: (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
}
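A hedged numeric walk-through of the mc rounding above, with made-up cache numbers:

// Suppose actual_lm = 1 MiB, k = 256, sizeof(LhsScalar) = 8, Traits::mr = 8:
//   mc = min(1048576 / (3*256*8), 576) = min(170, 576) = 170
//   170 > mr, so mc -= 170 % 8  ->  mc = 168, a multiple of mr.
// If mc had come out 0, the heuristic gives up and returns early.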

View File

@ -79,6 +79,14 @@ template <typename LhsScalar,
typename RhsScalar>
bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads)
{
if (num_threads > 1) {
// We don't currently have lookup tables recorded for multithread performance,
// and we have confirmed experimentally that our single-thread-recorded LUTs are
// poor for multithread performance, and our LUTs don't currently contain
// any annotation about multithread status (FIXME - we need that).
// So for now, we just early-return here.
return false;
}
return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads);
}

View File

@ -213,7 +213,8 @@
#endif
/// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android
#if defined(__ANDROID__)
// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
#if defined(__ANDROID__) || defined(ANDROID)
#define EIGEN_OS_ANDROID 1
#else
#define EIGEN_OS_ANDROID 0

View File

@ -417,7 +417,7 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
{
Scalar t0 = m_matT.coeff(i+1, i);
Scalar t1 = m_matT.coeff(i, i+1);
Scalar maxval = numext::maxi(abs(p),numext::maxi(abs(t0),abs(t1)));
Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1)));
t0 /= maxval;
t1 /= maxval;
Scalar p0 = p/maxval;
@ -608,7 +608,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
}
// Overflow control
Scalar t = numext::maxi(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
if ((eps * t) * t > Scalar(1))
m_matT.block(i, n-1, size-i, 2) /= t;
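The explicit <Scalar> arguments added here work around template argument deduction: numext::maxi deduces a single type parameter from both arguments, so if the two arguments have different (or proxy) types deduction fails; pinning the parameter converts both sides instead. The same pattern with std::max as a stand-in:

#include <algorithm>

double pick_larger(float a, double b)
{
  // std::max(a, b) would not compile: T is deduced as both float and double.
  return std::max<double>(a, b);  // explicit argument: both sides convert
}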

View File

@ -240,10 +240,10 @@ namespace Eigen {
m_S.coeffRef(i,j) = Scalar(0.0);
m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint());
m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint());
// update Q
if (m_computeQZ)
m_Q.applyOnTheRight(i-1,i,G);
}
// update Q
if (m_computeQZ)
m_Q.applyOnTheRight(i-1,i,G);
// kill T(i,i-1)
if(m_T.coeff(i,i-1)!=Scalar(0))
{
@ -251,10 +251,10 @@ namespace Eigen {
m_T.coeffRef(i,i-1) = Scalar(0.0);
m_S.applyOnTheRight(i,i-1,G);
m_T.topRows(i).applyOnTheRight(i,i-1,G);
// update Z
if (m_computeQZ)
m_Z.applyOnTheLeft(i,i-1,G.adjoint());
}
// update Z
if (m_computeQZ)
m_Z.applyOnTheLeft(i,i-1,G.adjoint());
}
}
}

View File

@ -161,8 +161,8 @@ class QuaternionBase : public RotationBase<Derived, 3>
bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
{ return coeffs().isApprox(other.coeffs(), prec); }
/** return the result vector of \a v through the rotation*/
EIGEN_STRONG_INLINE Vector3 _transformVector(Vector3 v) const;
/** return the result vector of \a v through the rotation*/
EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
/** \returns \c *this with scalar type casted to \a NewScalarType
*
@ -462,7 +462,7 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const Quaterni
*/
template <class Derived>
EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
QuaternionBase<Derived>::_transformVector(Vector3 v) const
QuaternionBase<Derived>::_transformVector(const Vector3& v) const
{
// Note that this algorithm comes from the optimization by hand
// of the conversion to a Matrix followed by a Matrix/Vector product.
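_transformVector now takes its argument by const reference instead of by value. It is an internal helper; the public entry point is quaternion-times-vector, sketched here:

#include <Eigen/Geometry>
#include <cmath>
#include <iostream>

int main()
{
  // A 90-degree rotation about Z maps the X axis onto the Y axis.
  Eigen::Quaterniond q(Eigen::AngleAxisd(M_PI/2, Eigen::Vector3d::UnitZ()));
  Eigen::Vector3d v(1, 0, 0);
  Eigen::Vector3d w = q * v;           // dispatches to _transformVector
  std::cout << w.transpose() << "\n";  // ~ (0 1 0); the norm is preserved
  return 0;
}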

View File

@ -84,6 +84,8 @@ public:
typedef Matrix<RealScalar, Dynamic, 1> VectorType;
typedef Array<RealScalar, Dynamic, 1> ArrayXr;
typedef Array<Index,1,Dynamic> ArrayXi;
typedef Ref<ArrayXr> ArrayRef;
typedef Ref<ArrayXi> IndicesRef;
/** \brief Default Constructor.
*
@ -159,21 +161,23 @@ private:
void allocate(Index rows, Index cols, unsigned int computationOptions);
void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
void computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, VectorType& singVals, ArrayXr& shifts, ArrayXr& mus);
void perturbCol0(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat);
void computeSingVecs(const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V);
void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals, ArrayRef shifts, ArrayRef mus);
void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat);
void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V);
void deflation43(Index firstCol, Index shift, Index i, Index size);
void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
void copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naivev);
static void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1);
static RealScalar secularEq(RealScalar x, const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift);
void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1);
static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift);
protected:
MatrixXr m_naiveU, m_naiveV;
MatrixXr m_computed;
Index m_nRec;
ArrayXr m_workspace;
ArrayXi m_workspaceI;
int m_algoswap;
bool m_isTranspose, m_compU, m_compV;
@ -212,6 +216,9 @@ void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computati
else m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 );
if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize);
m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3);
m_workspaceI.resize(3*m_diagSize);
}// end allocate
template<typename MatrixType>
@ -223,6 +230,19 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
allocate(matrix.rows(), matrix.cols(), computationOptions);
using std::abs;
//**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
if(matrix.cols() < m_algoswap)
{
// FIXME this line involves temporaries
JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
if(computeU()) m_matrixU = jsvd.matrixU();
if(computeV()) m_matrixV = jsvd.matrixV();
m_singularValues = jsvd.singularValues();
m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
m_isInitialized = true;
return *this;
}
//**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
RealScalar scale = matrix.cwiseAbs().maxCoeff();
if(scale==RealScalar(0)) scale = RealScalar(1);
@ -231,11 +251,13 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
else copy = matrix/scale;
//**** step 1 - Bidiagonalization
// FIXME this line involves temporaries
internal::UpperBidiagonalization<MatrixX> bid(copy);
//**** step 2 - Divide & Conquer
m_naiveU.setZero();
m_naiveV.setZero();
// FIXME this line involves a temporary matrix
m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
m_computed.template bottomRows<1>().setZero();
divide(0, m_diagSize - 1, 0, 0, 0);
@ -257,6 +279,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
break;
}
}
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
// std::cout << "m_naiveU\n" << m_naiveU << "\n\n";
// std::cout << "m_naiveV\n" << m_naiveV << "\n\n";
@ -279,14 +302,14 @@ void BDCSVD<MatrixType>::copyUV(const HouseholderU &householderU, const Househol
Index Ucols = m_computeThinU ? m_diagSize : householderU.cols();
m_matrixU = MatrixX::Identity(householderU.cols(), Ucols);
m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
householderU.applyThisOnTheLeft(m_matrixU);
householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer
}
if (computeV())
{
Index Vcols = m_computeThinV ? m_diagSize : householderV.cols();
m_matrixV = MatrixX::Identity(householderV.cols(), Vcols);
m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
householderV.applyThisOnTheLeft(m_matrixV);
householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer
}
}
@ -307,7 +330,10 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
// If the matrices are large enough, let's exploit the sparse structure of A by
// splitting it in half (wrt n1), and packing the non-zero columns.
Index n2 = n - n1;
MatrixXr A1(n1,n), A2(n2,n), B1(n,n), B2(n,n);
Map<MatrixXr> A1(m_workspace.data() , n1, n);
Map<MatrixXr> A2(m_workspace.data()+ n1*n, n2, n);
Map<MatrixXr> B1(m_workspace.data()+ n*n, n, n);
Map<MatrixXr> B2(m_workspace.data()+2*n*n, n, n);
Index k1=0, k2=0;
for(Index j=0; j<n; ++j)
{
@ -329,7 +355,11 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
}
else
A *= B; // FIXME this requires a temporary
{
Map<MatrixXr,Aligned> tmp(m_workspace.data(),n,n);
tmp.noalias() = A*B;
A = tmp;
}
}
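This is the pattern used throughout the BDCSVD changes in this commit: heap-allocated temporaries are replaced by Map views carved out of one preallocated workspace. A minimal sketch of the technique (the caller is assumed to have sized the workspace; names are illustrative):

#include <Eigen/Dense>

void use_workspace(Eigen::ArrayXd& workspace, Eigen::Index n1, Eigen::Index n2, Eigen::Index n)
{
  // The maps alias disjoint slices of a single buffer, so no heap
  // allocation happens here, unlike declaring MatrixXd temporaries.
  Eigen::Map<Eigen::MatrixXd> A1(workspace.data(),        n1, n);
  Eigen::Map<Eigen::MatrixXd> A2(workspace.data() + n1*n, n2, n);
  Eigen::Map<Eigen::MatrixXd> B1(workspace.data() + n*n,  n,  n);
  A1.setZero(); A2.setZero(); B1.setZero();
}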
// The divide algorithm is done "in place": we are always working on subsets of the same matrix. The divide method takes as arguments the
@ -360,7 +390,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
// matrices.
if (n < m_algoswap)
{
JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0)) ;
// FIXME this line involves temporaries
JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
if (m_compU)
m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
else
@ -438,7 +469,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
}
else
{
RealScalar q1 = (m_naiveU(0, firstCol + k));
RealScalar q1 = m_naiveU(0, firstCol + k);
// we shift Q1 to the right
for (Index i = firstCol + k - 1; i >= firstCol; i--)
m_naiveU(0, i + 1) = m_naiveU(0, i);
@ -491,8 +522,14 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
assert(VofSVD.allFinite());
#endif
if (m_compU) structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
else m_naiveU.middleCols(firstCol, n + 1) *= UofSVD; // FIXME this requires a temporary, and exploit that there are 2 rows at compile time
if (m_compU)
structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
else
{
Map<Matrix<RealScalar,2,Dynamic>,Aligned> tmp(m_workspace.data(),2,n+1);
tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD;
m_naiveU.middleCols(firstCol, n + 1) = tmp;
}
if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2);
@ -517,10 +554,9 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
template <typename MatrixType>
void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
{
// TODO Get rid of these copies (?)
// FIXME at least preallocate them
ArrayXr col0 = m_computed.col(firstCol).segment(firstCol, n);
ArrayXr diag = m_computed.block(firstCol, firstCol, n, n).diagonal();
ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
ArrayRef diag = m_workspace.head(n);
diag(0) = 0;
// Allocate space for singular values and vectors
@ -539,13 +575,14 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
Index actual_n = n;
while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
Index m = 0; // size of the deflated problem
ArrayXi perm(actual_n);
for(Index k=0;k<actual_n;++k)
if(col0(k)!=0)
perm(m++) = k;
perm.conservativeResize(m);
m_workspaceI(m++) = k;
Map<ArrayXi> perm(m_workspaceI.data(),m);
ArrayXr shifts(n), mus(n), zhat(n);
Map<ArrayXr> shifts(m_workspace.data()+1*n, n);
Map<ArrayXr> mus(m_workspace.data()+2*n, n);
Map<ArrayXr> zhat(m_workspace.data()+3*n, n);
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "computeSVDofM using:\n";
@ -622,8 +659,8 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
// Reverse order so that the singular values are in increasing order
// Because of deflation, the zero singular values are already at the end
singVals.head(actual_n).reverseInPlace();
U.leftCols(actual_n) = U.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary
if (m_compV) V.leftCols(actual_n) = V.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary
U.leftCols(actual_n).rowwise().reverseInPlace();
if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) );
@ -634,7 +671,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
}
template <typename MatrixType>
typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift)
typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift)
{
Index m = perm.size();
RealScalar res = 1;
@ -647,8 +684,8 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar
}
template <typename MatrixType>
void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm,
VectorType& singVals, ArrayXr& shifts, ArrayXr& mus)
void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm,
VectorType& singVals, ArrayRef shifts, ArrayRef mus)
{
using std::abs;
using std::swap;
@ -703,7 +740,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right;
// measure everything relative to shift
ArrayXr diagShifted = diag - shift;
Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
diagShifted = diag - shift;
// initial guess
RealScalar muPrev, muCur;
@ -730,7 +768,7 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
// rational interpolation: fit a function of the form a / mu + b through the two previous
// iterates and use its zero to compute the next iterate
bool useBisection = fPrev*fCur>0;
while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
{
++m_numIters;
@ -773,7 +811,10 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
}
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
#endif
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(!(fLeft * fRight<0))
@ -781,14 +822,13 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
#endif
eigen_internal_assert(fLeft * fRight < 0);
while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted)))
while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
{
RealScalar midShifted = (leftShifted + rightShifted) / 2;
RealScalar fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
if (fLeft * fMid < 0)
{
rightShifted = midShifted;
fRight = fMid;
}
else
{
@ -816,8 +856,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
// zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
template <typename MatrixType>
void BDCSVD<MatrixType>::perturbCol0
(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals,
const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat)
(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat)
{
using std::sqrt;
Index n = col0.size();
@ -865,8 +905,8 @@ void BDCSVD<MatrixType>::perturbCol0
// compute singular vectors
template <typename MatrixType>
void BDCSVD<MatrixType>::computeSingVecs
(const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals,
const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V)
(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V)
{
Index n = zhat.size();
Index m = perm.size();
@ -991,7 +1031,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag;
RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag);
RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
assert(m_naiveU.allFinite());
@ -1047,7 +1087,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
// Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
// First, compute the respective permutation.
Index *permutation = new Index[length]; // FIXME avoid repeated dynamic memory allocation
Index *permutation = m_workspaceI.data();
{
permutation[0] = 0;
Index p = 1;
@ -1084,8 +1124,8 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
}
// Current index of each col, and current column of each index
Index *realInd = new Index[length]; // FIXME avoid repeated dynamic memory allocation
Index *realCol = new Index[length]; // FIXME avoid repeated dynamic memory allocation
Index *realInd = m_workspaceI.data()+length;
Index *realCol = m_workspaceI.data()+2*length;
for(int pos = 0; pos< length; pos++)
{
@ -1115,9 +1155,6 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
realInd[J] = realI;
realInd[i] = pi;
}
delete[] permutation;
delete[] realInd;
delete[] realCol;
}
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";

View File

@ -425,12 +425,13 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
// If d!=0, then t/d cannot overflow because the magnitude of the
// entries forming d are not too small compared to the ones forming t.
RealScalar u = t / d;
rot1.s() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u));
rot1.c() = rot1.s() * u;
RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
rot1.s() = RealScalar(1) / tmp;
rot1.c() = u / tmp;
}
m.applyOnTheLeft(0,1,rot1);
j_right->makeJacobi(m,0,1);
*j_left = rot1 * j_right->transpose();
*j_left = rot1 * j_right->transpose();
}
template<typename _MatrixType, int QRPreconditioner>
@ -680,6 +681,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
// limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
// FIXME What about considering any denormal number as zero, using:
// const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
// Scaling factor to reduce over/under-flows
@ -719,8 +722,9 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
// if this 2x2 sub-matrix is not diagonal already...
// notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
// keep us iterating forever. Similarly, small denormal numbers are considered zero.
RealScalar threshold = numext::maxi(considerAsZero, precision * numext::maxi(abs(m_workMatrix.coeff(p,p)),
abs(m_workMatrix.coeff(q,q))));
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero,
precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)),
abs(m_workMatrix.coeff(q,q))));
// We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
{

View File

@ -30,16 +30,16 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
std::memset(mask,0,sizeof(bool)*rows);
typename evaluator<Lhs>::type lhsEval(lhs);
typename evaluator<Rhs>::type rhsEval(rhs);
// Estimate the number of non-zero entries:
// given a rhs column containing Y non-zeros, we assume that the respective Y columns
// of the lhs differ on average by one non-zero; thus the number of non-zeros for
// the product of a rhs column with the lhs is X+Y, where X is the average number
// of non-zeros per column of the lhs.
// Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
typename evaluator<Lhs>::type lhsEval(lhs);
typename evaluator<Rhs>::type rhsEval(rhs);
Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
res.setZero();
res.reserve(Index(estimated_nnz_prod));

View File

@ -90,7 +90,8 @@ class sparse_matrix_block_impl
typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
public:
enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
typedef SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> > Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
protected:
typedef typename Base::IndexVector IndexVector;
enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
@ -198,20 +199,9 @@ public:
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
inline const StorageIndex* innerNonZeroPtr() const
{ return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
{ return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
inline StorageIndex* innerNonZeroPtr()
{ return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
Index nonZeros() const
{
if(m_matrix.isCompressed())
return ( (m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
- (m_matrix.outerIndexPtr()[m_outerStart]));
else if(m_outerSize.value()==0)
return 0;
else
return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
}
{ return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); }
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
@ -233,7 +223,7 @@ public:
const Scalar& lastCoeff() const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
eigen_assert(nonZeros()>0);
eigen_assert(Base::nonZeros()>0);
if(m_matrix.isCompressed())
return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
else
@ -339,17 +329,6 @@ SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
}
namespace internal {
template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
| // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
// revert to || as soon as not needed anymore.
(BlockRows==1 && !XprType::IsRowMajor)>
class GenericSparseBlockInnerIteratorImpl;
}
/** Generic implementation of sparse Block expression.
* Real-only.
*/
@ -415,8 +394,11 @@ public:
Index blockCols() const { return m_blockCols.value(); }
protected:
friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
// friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
friend class ReverseInnerIterator;
friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
Index nonZeros() const { return Dynamic; }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
@ -429,94 +411,6 @@ public:
};
namespace internal {
template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,false> : public Block<XprType, BlockRows, BlockCols, InnerPanel>::_MatrixTypeNested::InnerIterator
{
typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
enum {
IsRowMajor = BlockType::IsRowMajor
};
typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
typedef typename BlockType::StorageIndex StorageIndex;
typedef typename _MatrixTypeNested::InnerIterator Base;
const BlockType& m_block;
Index m_end;
public:
EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer)
: Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
m_block(block),
m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
{
while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
Base::operator++();
}
inline Index index() const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
inline Index outer() const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
inline Index row() const { return Base::row() - m_block.m_startRow.value(); }
inline Index col() const { return Base::col() - m_block.m_startCol.value(); }
inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
};
// Row vector of a column-major sparse matrix or column of a row-major one.
template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,true>
{
typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
enum {
IsRowMajor = BlockType::IsRowMajor
};
typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
typedef typename BlockType::StorageIndex StorageIndex;
typedef typename BlockType::Scalar Scalar;
const BlockType& m_block;
Index m_outerPos;
Index m_innerIndex;
Scalar m_value;
Index m_end;
public:
explicit EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer = 0)
:
m_block(block),
m_outerPos( (IsRowMajor ? block.m_startCol.value() : block.m_startRow.value()) - 1), // -1 so that operator++ finds the first non-zero entry
m_innerIndex(IsRowMajor ? block.m_startRow.value() : block.m_startCol.value()),
m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
{
EIGEN_UNUSED_VARIABLE(outer);
eigen_assert(outer==0);
++(*this);
}
inline Index index() const { return m_outerPos - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
inline Index outer() const { return 0; }
inline Index row() const { return IsRowMajor ? 0 : index(); }
inline Index col() const { return IsRowMajor ? index() : 0; }
inline Scalar value() const { return m_value; }
inline GenericSparseBlockInnerIteratorImpl& operator++()
{
// search next non-zero entry
while(++m_outerPos<m_end)
{
typename XprType::InnerIterator it(m_block.m_matrix, m_outerPos);
// search for the key m_innerIndex in the current outer-vector
while(it && it.index() < m_innerIndex) ++it;
if(it && it.index()==m_innerIndex)
{
m_value = it.value();
break;
}
}
return *this;
}
inline operator bool() const { return m_outerPos < m_end; }
};
template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased >
@ -548,9 +442,16 @@ struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBa
explicit unary_evaluator(const XprType& op)
: m_argImpl(op.nestedExpression()), m_block(op)
{}
inline Index nonZerosEstimate() const {
Index nnz = m_block.nonZeros();
if(nnz<0)
return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size();
return nnz;
}
protected:
typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
typename evaluator<ArgType>::nestedType m_argImpl;
const XprType &m_block;
@ -595,6 +496,7 @@ public:
: m_eval(aEval),
m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) - 1), // -1 so that operator++ finds the first non-zero entry
m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
m_value(0),
m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
{
EIGEN_UNUSED_VARIABLE(outer);

View File

@ -35,6 +35,25 @@ class SparseCompressedBase
class InnerIterator;
class ReverseInnerIterator;
protected:
typedef typename Base::IndexVector IndexVector;
Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
public:
/** \returns the number of non zero coefficients */
inline Index nonZeros() const
{
if(isCompressed())
return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0];
else if(derived().outerSize()==0)
return 0;
else
return innerNonZeros().sum();
}
/** \returns a const pointer to the array of values.
* This function is aimed at interoperability with other libraries.
* \sa innerIndexPtr(), outerIndexPtr() */
@ -165,6 +184,10 @@ struct evaluator<SparseCompressedBase<Derived> >
evaluator() : m_matrix(0) {}
explicit evaluator(const Derived &mat) : m_matrix(&mat) {}
inline Index nonZerosEstimate() const {
return m_matrix->nonZeros();
}
operator Derived&() { return m_matrix->const_cast_derived(); }
operator const Derived&() const { return *m_matrix; }
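nonZeros() moves up into SparseCompressedBase, and evaluators gain a separate nonZerosEstimate(): exact for materialized matrices (as here), a cheap bound for lazy expressions in the file sections that follow. The public API is unchanged; a small sketch:

#include <Eigen/Sparse>
#include <iostream>

int main()
{
  Eigen::SparseMatrix<double> a(10,10);
  a.insert(0,0) = 1.0;
  a.insert(5,7) = 2.0;
  a.makeCompressed();
  std::cout << a.nonZeros() << "\n";  // 2, exact, now served by the base class
  // An expression such as 2*a is not evaluated to count its non-zeros:
  // its evaluator just forwards the estimate of a.
  std::cout << (2*a).eval().nonZeros() << "\n";  // 2
  return 0;
}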

View File

@ -121,6 +121,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate();
}
protected:
const BinaryOp m_functor;
@ -198,6 +202,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate());
}
protected:
const BinaryOp m_functor;
@ -243,7 +251,7 @@ public:
EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
protected:
const LhsEvaluator &m_lhsEval;
RhsIterator m_rhsIter;
@ -262,6 +270,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return m_rhsImpl.nonZerosEstimate();
}
protected:
const BinaryOp m_functor;
@ -308,7 +320,7 @@ public:
EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
protected:
LhsIterator m_lhsIter;
const RhsEvaluator &m_rhsEval;
@ -327,6 +339,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return m_lhsImpl.nonZerosEstimate();
}
protected:
const BinaryOp m_functor;

View File

@ -30,6 +30,10 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
};
explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {}
inline Index nonZerosEstimate() const {
return m_argImpl.nonZerosEstimate();
}
protected:
typedef typename evaluator<ArgType>::InnerIterator EvalIterator;

View File

@ -105,9 +105,6 @@ class SparseMapBase<Derived,ReadOnlyAccessors>
return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
}
/** \returns the number of non zero coefficients */
inline Index nonZeros() const { return m_nnz; }
inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
: m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),

View File

@ -95,6 +95,7 @@ class SparseMatrix
public:
typedef SparseCompressedBase<SparseMatrix> Base;
using Base::isCompressed;
using Base::nonZeros;
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
@ -122,9 +123,6 @@ class SparseMatrix
StorageIndex* m_outerIndex;
StorageIndex* m_innerNonZeros; // optional, if null then the data is compressed
Storage m_data;
Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
public:
@ -252,14 +250,6 @@ class SparseMatrix
memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
}
/** \returns the number of non zero coefficients */
inline Index nonZeros() const
{
if(m_innerNonZeros)
return innerNonZeros().sum();
return convert_index(Index(m_data.size()));
}
/** Preallocates \a reserveSize non zeros.
*
* Precondition: the matrix must be in compressed mode. */

View File

@ -149,9 +149,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
/** \returns the number of coefficients, which is \a rows()*cols().
* \sa rows(), cols(). */
inline Index size() const { return rows() * cols(); }
/** \returns the number of nonzero coefficients which is in practice the number
* of stored coefficients. */
inline Index nonZeros() const { return derived().nonZeros(); }
/** \returns true if either the number of rows or the number of columns is equal to 1.
* In other words, this function returns
* \code rows()==1 || cols()==1 \endcode

View File

@ -33,14 +33,6 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
// allocate a temporary buffer
AmbiVector<Scalar,StorageIndex> tempVector(rows);
// estimate the number of non zero entries
// given a rhs column containing Y non zeros, we assume that the respective Y columns
// of the lhs differs in average of one non zeros, thus the number of non zeros for
// the product of a rhs column with the lhs is X+Y where X is the average number of non zero
// per column of the lhs.
// Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
// mimics a resizeByInnerOuter:
if(ResultType::IsRowMajor)
res.resize(cols, rows);
@ -49,6 +41,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
typename evaluator<Lhs>::type lhsEval(lhs);
typename evaluator<Rhs>::type rhsEval(rhs);
// Estimate the number of non-zero entries:
// given a rhs column containing Y non-zeros, we assume that the respective Y columns
// of the lhs differ on average by one non-zero; thus the number of non-zeros for
// the product of a rhs column with the lhs is X+Y, where X is the average number
// of non-zeros per column of the lhs.
// Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
res.reserve(estimated_nnz_prod);
double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols());

View File

@ -40,15 +40,11 @@ namespace internal {
};
}
// Implement nonZeros() for transpose. I'm not sure that's the best approach for that.
// Perhaps it should be implemented in Transpose<> itself.
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
: public internal::SparseTransposeImpl<MatrixType>
{
protected:
typedef internal::SparseTransposeImpl<MatrixType> Base;
public:
inline Index nonZeros() const { return Base::derived().nestedExpression().nonZeros(); }
};
namespace internal {
@ -61,6 +57,10 @@ struct unary_evaluator<Transpose<ArgType>, IteratorBased>
typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
public:
typedef Transpose<ArgType> XprType;
inline Index nonZerosEstimate() const {
return m_argImpl.nonZerosEstimate();
}
class InnerIterator : public EvalIterator
{

View File

@ -50,13 +50,6 @@ protected:
template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
inline Index nonZeros() const {
// FIXME HACK number of nonZeros is required for product logic
// this returns only an upper bound (but should be OK for most purposes)
return derived().nestedExpression().nonZeros();
}
};
@ -191,6 +184,10 @@ public:
explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()) {}
inline Index nonZerosEstimate() const {
return m_argImpl.nonZerosEstimate();
}
class InnerIterator : public EvalIterator
{
typedef EvalIterator Base;

View File

@ -442,6 +442,10 @@ struct evaluator<SparseVector<_Scalar,_Options,_Index> >
explicit evaluator(const SparseVectorType &mat) : m_matrix(mat) {}
inline Index nonZerosEstimate() const {
return m_matrix.nonZeros();
}
operator SparseVectorType&() { return m_matrix.const_cast_derived(); }
operator const SparseVectorType&() const { return m_matrix; }

View File

@ -302,6 +302,7 @@ class SuperLUBase : public SparseSolverBase<Derived>
typedef Matrix<Scalar,Dynamic,1> Vector;
typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
typedef Map<PermutationMatrix<Dynamic,Dynamic,int> > PermutationMap;
typedef SparseMatrix<Scalar> LUMatrixType;
public:
@ -459,10 +460,11 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
typedef typename Base::RealScalar RealScalar;
typedef typename Base::StorageIndex StorageIndex;
typedef typename Base::IntRowVectorType IntRowVectorType;
typedef typename Base::IntColVectorType IntColVectorType;
typedef typename Base::IntColVectorType IntColVectorType;
typedef typename Base::PermutationMap PermutationMap;
typedef typename Base::LUMatrixType LUMatrixType;
typedef TriangularView<LUMatrixType, Lower|UnitDiag> LMatrixType;
typedef TriangularView<LUMatrixType, Upper> UMatrixType;
typedef TriangularView<LUMatrixType, Upper> UMatrixType;
public:
using Base::_solve_impl;
@ -774,6 +776,8 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
det *= m_u.valuePtr()[lastId];
}
}
if(PermutationMap(m_p.data(),m_p.size()).determinant()*PermutationMap(m_q.data(),m_q.size()).determinant()<0)
det = -det;
if(m_sluEqued!='N')
return det/m_sluRscale.prod()/m_sluCscale.prod();
else

View File

@ -26,7 +26,7 @@ macro(_metis_check_version)
string(REGEX MATCH "define[ \t]+METIS_VER_SUBMINOR[ \t]+([0-9]+)" _metis_subminor_version_match "${_metis_version_header}")
set(METIS_SUBMINOR_VERSION "${CMAKE_MATCH_1}")
if(NOT METIS_MAJOR_VERSION)
message(WARNING "Could not determine Metis version. Assuming version 4.0.0")
message(STATUS "Could not determine Metis version. Assuming version 4.0.0")
set(METIS_VERSION 4.0.0)
else()
set(METIS_VERSION ${METIS_MAJOR_VERSION}.${METIS_MINOR_VERSION}.${METIS_SUBMINOR_VERSION})

View File

@ -10,9 +10,10 @@ if(QT4_FOUND)
target_link_libraries(Tutorial_sparse_example ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${QT_QTCORE_LIBRARY} ${QT_QTGUI_LIBRARY})
add_custom_command(
TARGET Tutorial_sparse_example
POST_BUILD
COMMAND Tutorial_sparse_example ARGS ${CMAKE_CURRENT_BINARY_DIR}/../html/Tutorial_sparse_example.jpeg
TARGET Tutorial_sparse_example
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/../html/
COMMAND Tutorial_sparse_example ARGS ${CMAKE_CURRENT_BINARY_DIR}/../html/Tutorial_sparse_example.jpeg
)
add_dependencies(all_examples Tutorial_sparse_example)

View File

@ -332,3 +332,8 @@ endif(EIGEN_TEST_NVCC)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF)
if(EIGEN_TEST_BUILD_DOCUMENTATION)
add_dependencies(buildtests doc)
endif()

View File

@ -24,7 +24,7 @@ template<typename MatrixType> void reverse(const MatrixType& m)
// this test relies a lot on Random.h, and there's not much more that we can do
// to test it, hence I consider that we will have tested Random.h
MatrixType m1 = MatrixType::Random(rows, cols);
MatrixType m1 = MatrixType::Random(rows, cols), m2;
VectorType v1 = VectorType::Random(rows);
MatrixType m1_r = m1.reverse();
@@ -96,6 +96,26 @@ template<typename MatrixType> void reverse(const MatrixType& m)
m1.reverse()(r, c) = x;
VERIFY_IS_APPROX(x, m1(rows - 1 - r, cols - 1 - c));
m2 = m1;
m2.reverseInPlace();
VERIFY_IS_APPROX(m2,m1.reverse().eval());
m2 = m1;
m2.col(0).reverseInPlace();
VERIFY_IS_APPROX(m2.col(0),m1.col(0).reverse().eval());
m2 = m1;
m2.row(0).reverseInPlace();
VERIFY_IS_APPROX(m2.row(0),m1.row(0).reverse().eval());
m2 = m1;
m2.rowwise().reverseInPlace();
VERIFY_IS_APPROX(m2,m1.rowwise().reverse().eval());
m2 = m1;
m2.colwise().reverseInPlace();
VERIFY_IS_APPROX(m2,m1.colwise().reverse().eval());
/*
m1.colwise().reverse()(r, c) = x;
@@ -113,11 +133,11 @@ void test_array_reverse()
CALL_SUBTEST_2( reverse(Matrix2f()) );
CALL_SUBTEST_3( reverse(Matrix4f()) );
CALL_SUBTEST_4( reverse(Matrix4d()) );
CALL_SUBTEST_5( reverse(MatrixXcf(3, 3)) );
CALL_SUBTEST_6( reverse(MatrixXi(6, 3)) );
CALL_SUBTEST_7( reverse(MatrixXcd(20, 20)) );
CALL_SUBTEST_5( reverse(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_6( reverse(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_7( reverse(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_8( reverse(Matrix<float, 100, 100>()) );
CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(6,3)) );
CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
#ifdef EIGEN_TEST_PART_3
Vector4f x; x << 1, 2, 3, 4;

View File

@@ -17,6 +17,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
typedef Matrix<Scalar, Rows, 1> VectorType;
typedef Matrix<Scalar, 1, Cols> RowVectorType;
typedef Matrix<Scalar, Rows, Rows> SquareMatrixType;
typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
typedef DiagonalMatrix<Scalar, Rows> LeftDiagonalMatrix;
typedef DiagonalMatrix<Scalar, Cols> RightDiagonalMatrix;
typedef Matrix<Scalar, Rows==Dynamic?Dynamic:2*Rows, Cols==Dynamic?Dynamic:2*Cols> BigMatrix;
@@ -64,6 +65,13 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * (m1+m2))(i,j)) , (v1+v2)(i) * (m1+m2)(i,j) );
VERIFY_IS_APPROX( ((m1 * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * m1(i,j) );
VERIFY_IS_APPROX( (((m1+m2) * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * (m1+m2)(i,j) );
if(rows>1)
{
DynMatrixType tmp = m1.topRows(rows/2), res;
VERIFY_IS_APPROX( (res = m1.topRows(rows/2) * rv1.asDiagonal()), tmp * rv1.asDiagonal() );
VERIFY_IS_APPROX( (res = v1.head(rows/2).asDiagonal()*m1.topRows(rows/2)), v1.head(rows/2).asDiagonal()*tmp );
}
BigMatrix big;
big.setZero(2*rows, 2*cols);
@@ -93,6 +101,17 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() );
}
template<int>
void bug987()
{
Matrix3Xd points = Matrix3Xd::Random(3, 3);
Vector2d diag = Vector2d::Random();
Matrix2Xd tmp1 = points.topRows<2>(), res1, res2;
VERIFY_IS_APPROX( res1 = diag.asDiagonal() * points.topRows<2>(), res2 = diag.asDiagonal() * tmp1 );
Matrix2d tmp2 = points.topLeftCorner<2,2>();
VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() );
}
void test_diagonalmatrices()
{
for(int i = 0; i < g_repeat; i++) {
@@ -106,4 +125,5 @@ void test_diagonalmatrices()
CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
CALL_SUBTEST_10( bug987<0>() );
}

View File

@@ -95,6 +95,9 @@
namespace Eigen
{
static std::vector<std::string> g_test_stack;
// level == 0 <=> abort if a test fails
// level >= 1 <=> print a warning message to std::cerr if a test fails
static int g_test_level = 0;
static int g_repeat;
static unsigned int g_seed;
static bool g_has_set_repeat, g_has_set_seed;
@@ -229,6 +232,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
{
if (!condition)
{
if(Eigen::g_test_level>0)
std::cerr << "WARNING: ";
std::cerr << "Test " << testname << " failed in " << file << " (" << line << ")"
<< std::endl << " " << condition_as_string << std::endl;
std::cerr << "Stack:\n";
@@ -236,7 +241,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
for(int i=test_stack_size-1; i>=0; --i)
std::cerr << " - " << Eigen::g_test_stack[i] << "\n";
std::cerr << "\n";
abort();
if(Eigen::g_test_level==0)
abort();
}
}
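
A minimal usage sketch of the new test level, following the pattern the SVD tests in this commit adopt (a and b stand for any values under test):

    ++g_test_level;          // a failure below prints "WARNING: ..." to std::cerr instead of aborting
    VERIFY_IS_APPROX(a, b);  // non-critical check
    --g_test_level;          // restore abort-on-failure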

View File

@@ -113,6 +113,9 @@ void mat_mat_scalar_scalar_product()
template <typename MatrixType>
void zero_sized_objects(const MatrixType& m)
{
typedef typename MatrixType::Scalar Scalar;
const int PacketSize = internal::packet_traits<Scalar>::size;
const int PacketSize1 = PacketSize>1 ? PacketSize-1 : 1;
Index rows = m.rows();
Index cols = m.cols();
@@ -132,9 +135,41 @@ void zero_sized_objects(const MatrixType& m)
res = b*a;
VERIFY(res.rows()==0 && res.cols()==cols);
}
{
Matrix<Scalar,PacketSize,0> a;
Matrix<Scalar,0,1> b;
Matrix<Scalar,PacketSize,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) );
}
{
Matrix<Scalar,PacketSize1,0> a;
Matrix<Scalar,0,1> b;
Matrix<Scalar,PacketSize1,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) );
}
{
Matrix<Scalar,PacketSize,Dynamic> a(PacketSize,0);
Matrix<Scalar,Dynamic,1> b(0,1);
Matrix<Scalar,PacketSize,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) );
}
{
Matrix<Scalar,PacketSize1,Dynamic> a(PacketSize1,0);
Matrix<Scalar,Dynamic,1> b(0,1);
Matrix<Scalar,PacketSize1,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) );
}
}
template<int>
void bug_127()
{
// Bug 127
@@ -159,6 +194,7 @@ void bug_127()
a*b;
}
template<int>
void unaligned_objects()
{
// Regression test for the bug reported here:
@@ -188,6 +224,29 @@ void unaligned_objects()
}
}
template<typename T>
EIGEN_DONT_INLINE
Index test_compute_block_size(Index m, Index n, Index k)
{
Index mc(m), nc(n), kc(k);
internal::computeProductBlockingSizes<T,T>(kc, mc, nc);
return kc+mc+nc;
}
template<typename T>
Index compute_block_size()
{
Index ret = 0;
ret += test_compute_block_size<T>(0,1,1);
ret += test_compute_block_size<T>(1,0,1);
ret += test_compute_block_size<T>(1,1,0);
ret += test_compute_block_size<T>(0,0,1);
ret += test_compute_block_size<T>(0,1,0);
ret += test_compute_block_size<T>(1,0,0);
ret += test_compute_block_size<T>(0,0,0);
return ret;
}
void test_product_extra()
{
for(int i = 0; i < g_repeat; i++) {
@@ -198,6 +257,9 @@ void test_product_extra()
CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
CALL_SUBTEST_5( bug_127() );
CALL_SUBTEST_6( unaligned_objects() );
CALL_SUBTEST_5( bug_127<0>() );
CALL_SUBTEST_6( unaligned_objects<0>() );
CALL_SUBTEST_7( compute_block_size<float>() );
CALL_SUBTEST_7( compute_block_size<double>() );
CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
}

View File

@@ -25,6 +25,22 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
MatrixType A = MatrixType::Random(dim,dim),
B = MatrixType::Random(dim,dim);
// Regression test for bug 985: Randomly set rows or columns to zero
Index k=internal::random<Index>(0, dim-1);
switch(internal::random<int>(0,10)) {
case 0:
A.row(k).setZero(); break;
case 1:
A.col(k).setZero(); break;
case 2:
B.row(k).setZero(); break;
case 3:
B.col(k).setZero(); break;
default:
break;
}
RealQZ<MatrixType> qz(A,B);
VERIFY_IS_EQUAL(qz.info(), Success);

View File

@@ -67,6 +67,9 @@ template<typename SparseMatrixType> void sparse_product()
VERIFY_IS_APPROX(m4 = m2*m3/s1, refMat4 = refMat2*refMat3/s1);
VERIFY_IS_APPROX(m4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1);
VERIFY_IS_APPROX(m4 = s2*m2*m3*s1, refMat4 = s2*refMat2*refMat3*s1);
VERIFY_IS_APPROX(m4 = (m2+m2)*m3, refMat4 = (refMat2+refMat2)*refMat3);
VERIFY_IS_APPROX(m4 = m2*m3.leftCols(cols/2), refMat4 = refMat2*refMat3.leftCols(cols/2));
VERIFY_IS_APPROX(m4 = m2*(m3+m3).leftCols(cols/2), refMat4 = refMat2*(refMat3+refMat3).leftCols(cols/2));
VERIFY_IS_APPROX(m4=(m2*m3).pruned(0), refMat4=refMat2*refMat3);
VERIFY_IS_APPROX(m4=(m2t.transpose()*m3).pruned(0), refMat4=refMat2t.transpose()*refMat3);
@@ -194,7 +197,7 @@ template<typename SparseMatrixType> void sparse_product()
VERIFY_IS_APPROX(d3=d1*m2.transpose(), refM3=d1*refM2.transpose());
}
// test self-adjoint and traingular-view products
// test self-adjoint and triangular-view products
{
DenseMatrix b = DenseMatrix::Random(rows, rows);
DenseMatrix x = DenseMatrix::Random(rows, rows);

View File

@@ -49,18 +49,39 @@ void svd_compare_to_full(const MatrixType& m,
unsigned int computationOptions,
const SvdType& referenceSvd)
{
typedef typename MatrixType::Index Index;
typedef typename MatrixType::RealScalar RealScalar;
Index rows = m.rows();
Index cols = m.cols();
Index diagSize = (std::min)(rows, cols);
RealScalar prec = test_precision<RealScalar>();
SvdType svd(m, computationOptions);
VERIFY_IS_APPROX(svd.singularValues(), referenceSvd.singularValues());
if(computationOptions & (ComputeFullV|ComputeThinV))
{
VERIFY( (svd.matrixV().adjoint()*svd.matrixV()).isIdentity(prec) );
VERIFY_IS_APPROX( svd.matrixV().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint(),
referenceSvd.matrixV().leftCols(diagSize) * referenceSvd.singularValues().asDiagonal() * referenceSvd.matrixV().leftCols(diagSize).adjoint());
}
if(computationOptions & (ComputeFullU|ComputeThinU))
{
VERIFY( (svd.matrixU().adjoint()*svd.matrixU()).isIdentity(prec) );
VERIFY_IS_APPROX( svd.matrixU().leftCols(diagSize) * svd.singularValues().cwiseAbs2().asDiagonal() * svd.matrixU().leftCols(diagSize).adjoint(),
referenceSvd.matrixU().leftCols(diagSize) * referenceSvd.singularValues().cwiseAbs2().asDiagonal() * referenceSvd.matrixU().leftCols(diagSize).adjoint());
}
// The following checks are not critical.
// For instance, with Divide&Conquer SVD, if only the factor 'V' is computed then a different matrix-matrix product implementation will be used
// and the resulting 'V' factor might be significantly different when the SVD decomposition is not unique, especially with single precision float.
++g_test_level;
if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU());
if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU().leftCols(diagSize));
if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV());
if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(svd.matrixV().cwiseAbs(), referenceSvd.matrixV().cwiseAbs());
if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV().leftCols(diagSize));
--g_test_level;
}
//
@@ -85,33 +106,48 @@ void svd_least_square(const MatrixType& m, unsigned int computationOptions)
SvdType svd(m, computationOptions);
if(internal::is_same<RealScalar,double>::value) svd.setThreshold(1e-8);
else if(internal::is_same<RealScalar,float>::value) svd.setThreshold(1e-4);
else if(internal::is_same<RealScalar,float>::value) svd.setThreshold(2e-4);
SolutionType x = svd.solve(rhs);
// evaluate normal equation which works also for least-squares solutions
if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size())
{
// This test is not stable with single precision.
// This is probably because squaring m significantly affects the precision.
VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs);
}
RealScalar residual = (m*x-rhs).norm();
// Check that there is no significantly better solution in the neighborhood of x
RealScalar rhs_norm = rhs.norm();
if(!test_isMuchSmallerThan(residual,rhs.norm()))
{
// ^^^ If the residual is very small, then we have an exact solution, so we are already good.
// evaluate normal equation which works also for least-squares solutions
if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size())
{
using std::sqrt;
// This test is not stable with single precision.
// This is probably because squaring m significantly affects the precision.
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs);
if(internal::is_same<RealScalar,float>::value) --g_test_level;
}
// Check that there is no significantly better solution in the neighborhood of x
for(Index k=0;k<x.rows();++k)
{
using std::abs;
SolutionType y(x);
y.row(k) = (1.+2*NumTraits<RealScalar>::epsilon())*x.row(k);
RealScalar residual_y = (m*y-rhs).norm();
VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) --g_test_level;
y.row(k) = (1.-2*NumTraits<RealScalar>::epsilon())*x.row(k);
residual_y = (m*y-rhs).norm();
VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) --g_test_level;
}
}
}

View File

@@ -82,8 +82,10 @@ template<typename MatrixType> void swap(const MatrixType& m)
void test_swap()
{
int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization
CALL_SUBTEST_2( swap(Matrix4d()) ); // fixed size, possible vectorization
CALL_SUBTEST_3( swap(MatrixXd(3,3)) ); // dyn size, no vectorization
CALL_SUBTEST_4( swap(MatrixXf(30,30)) ); // dyn size, possible vectorization
CALL_SUBTEST_3( swap(MatrixXd(s,s)) ); // dyn size, no vectorization
CALL_SUBTEST_4( swap(MatrixXf(s,s)) ); // dyn size, possible vectorization
TEST_SET_BUT_UNUSED_VARIABLE(s)
}

View File

@@ -9,7 +9,17 @@
#include "main.h"
typedef Matrix<float,8,1> Vector8f;
typedef Matrix<float, 6,1> Vector6f;
typedef Matrix<float, 8,1> Vector8f;
typedef Matrix<float, 12,1> Vector12f;
typedef Matrix<double, 5,1> Vector5d;
typedef Matrix<double, 6,1> Vector6d;
typedef Matrix<double, 7,1> Vector7d;
typedef Matrix<double, 8,1> Vector8d;
typedef Matrix<double, 9,1> Vector9d;
typedef Matrix<double,10,1> Vector10d;
typedef Matrix<double,12,1> Vector12d;
struct TestNew1
{
@@ -85,6 +95,9 @@ void unalignedassert()
construct_at_boundary<Vector2f>(4);
construct_at_boundary<Vector3f>(4);
construct_at_boundary<Vector4f>(16);
construct_at_boundary<Vector6f>(4);
construct_at_boundary<Vector8f>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Vector12f>(16);
construct_at_boundary<Matrix2f>(16);
construct_at_boundary<Matrix3f>(4);
construct_at_boundary<Matrix4f>(EIGEN_ALIGN_BYTES);
@@ -92,6 +105,13 @@ void unalignedassert()
construct_at_boundary<Vector2d>(16);
construct_at_boundary<Vector3d>(4);
construct_at_boundary<Vector4d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Vector5d>(4);
construct_at_boundary<Vector6d>(16);
construct_at_boundary<Vector7d>(4);
construct_at_boundary<Vector8d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Vector9d>(4);
construct_at_boundary<Vector10d>(16);
construct_at_boundary<Vector12d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Matrix2d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Matrix3d>(4);
construct_at_boundary<Matrix4d>(EIGEN_ALIGN_BYTES);
@@ -115,7 +135,14 @@ void unalignedassert()
if(EIGEN_ALIGN_BYTES>=16)
{
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12f>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector6d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector10d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4i>(8));
}

View File

@@ -81,8 +81,8 @@
#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"

View File

@@ -266,16 +266,16 @@ array<t, n> repeat(t v) {
}
template<std::size_t I, class Head, class Tail>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) {
return get<I, type_list<Head, Tail> >::value;
}
template<std::size_t I, class Head, class Tail>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) {
return get<I, type_list<Head, Tail> >::value;
}
template <class NList>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
return arg_prod<NList>::value;
}

View File

@@ -1157,7 +1157,41 @@ in TensorFunctors.h for information on how to implement a reduction operator.
## Convolutions
TBD: convolve(const KernelDerived& kernel, const Dimensions& dims)
### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
Returns a tensor that is the output of the convolution of the input tensor with the kernel,
along the specified dimensions of the input tensor. For each dimension that takes part in the
convolution, the size of the corresponding output dimension is reduced according to the formula:
output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size).
The sizes of the dimensions that take no part in the convolution remain unchanged.

Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the
convolution is computed (for ColMajor the first dimension has the shortest stride, whereas for RowMajor the last
dimension does).
// Compute convolution along the second and third dimension.
Tensor<float, 4, DataLayout> input(3, 3, 7, 11);
Tensor<float, 2, DataLayout> kernel(2, 2);
Tensor<float, 4, DataLayout> output(3, 2, 6, 11);
input.setRandom();
kernel.setRandom();
Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution.
output = input.convolve(kernel, dims);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 6; ++k) {
for (int l = 0; l < 11; ++l) {
const float result = output(i,j,k,l);
const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
input(i,j+1,k+0,l) * kernel(1,0) +
input(i,j+0,k+1,l) * kernel(0,1) +
input(i,j+1,k+1,l) * kernel(1,1);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
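
For the example above, the formula gives the declared output shape: along dimension 1 the size shrinks to 3 - 2 + 1 = 2, along dimension 2 to 7 - 2 + 1 = 6, and dimensions 0 and 3, which take no part in the convolution, keep their sizes 3 and 11, hence output(3, 2, 6, 11).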
## Geometrical Operations

View File

@@ -21,8 +21,8 @@ namespace Eigen {
*/
namespace internal {
template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper {
template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
class IndexMapper {
public:
IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
const array<Index, NumKernelDims>& indices) {
@@ -38,13 +38,19 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM
array<Index, NumDims> inputStrides;
array<Index, NumDims> outputStrides;
for (int i = 0; i < NumDims; ++i) {
if (i > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
inputStrides[0] = 1;
outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
} else {
inputStrides[0] = 1;
outputStrides[0] = 1;
}
} else {
inputStrides[NumDims - 1] = 1;
outputStrides[NumDims - 1] = 1;
for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
}
}
@@ -52,13 +58,20 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM
array<Index, NumDims> cudaOutputDimensions;
array<Index, NumDims> tmp = dimensions;
array<Index, NumDims> ordering;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
for (int i = 0; i < NumKernelDims; ++i) {
ordering[i] = indices[i];
const Index index = i + offset;
ordering[index] = indices[i];
tmp[indices[i]] = -1;
cudaInputDimensions[i] = input_dims[ordering[i]];
cudaOutputDimensions[i] = dimensions[ordering[i]];
cudaInputDimensions[index] = input_dims[indices[i]];
cudaOutputDimensions[index] = dimensions[indices[i]];
}
int written = NumKernelDims;
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? NumKernelDims
: 0;
for (int i = 0; i < NumDims; ++i) {
if (tmp[i] >= 0) {
ordering[written] = i;
@@ -73,61 +86,123 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM
m_outputStrides[i] = outputStrides[ordering[i]];
}
for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) {
m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1];
m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1];
} else {
m_cudaInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
m_cudaOutputStrides[i] =
m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
} else {
m_cudaInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1;
}
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (i + 1 < offset) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
m_cudaOutputStrides[i] =
m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
} else {
m_cudaInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1;
}
}
}
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
Index inputIndex = 0;
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaInputStrides[d];
inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaInputStrides[d];
inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d];
}
inputIndex += p * m_inputStrides[NumKernelDims];
} else {
int limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaInputStrides[d];
inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d];
}
inputIndex += p * m_inputStrides[limit];
}
inputIndex += p * m_inputStrides[NumKernelDims];
return inputIndex;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
Index outputIndex = 0;
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d];
}
outputIndex += p * m_outputStrides[NumKernelDims];
} else {
int limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d];
}
outputIndex += p * m_outputStrides[limit];
}
outputIndex += p * m_outputStrides[NumKernelDims];
return outputIndex;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
return i * m_inputStrides[0];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
return i * m_outputStrides[0];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
return i * m_inputStrides[0] + j*m_inputStrides[1];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
return i * m_outputStrides[0] + j * m_outputStrides[1];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
k * m_inputStrides[offset + 2];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
k * m_outputStrides[offset + 2];
}
private:
@@ -237,35 +312,61 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
m_inputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
}
} else {
m_inputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
}
}
m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i > 0) {
m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
} else {
m_kernelStride[0] = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i > 0) {
m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
} else {
m_kernelStride[0] = 1;
}
m_indexStride[i] = m_inputStride[index];
}
m_indexStride[i] = m_inputStride[index];
}
m_outputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
m_outputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
}
} else {
for (int i = NumKernelDims - 1; i >= 0; --i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i < NumKernelDims - 1) {
m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
} else {
m_kernelStride[NumKernelDims - 1] = 1;
}
m_indexStride[i] = m_inputStride[index];
}
m_outputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
}
}
}
@@ -310,13 +411,24 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
Index indices[2] = {index, index+PacketSize-1};
Index startInputs[2] = {0, 0};
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStride[i];
const Index idx1 = indices[1] / m_outputStride[i];
startInputs[0] += idx0 * m_inputStride[i];
startInputs[1] += idx1 * m_inputStride[i];
indices[0] -= idx0 * m_outputStride[i];
indices[1] -= idx1 * m_outputStride[i];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStride[i];
const Index idx1 = indices[1] / m_outputStride[i];
startInputs[0] += idx0 * m_inputStride[i];
startInputs[1] += idx1 * m_inputStride[i];
indices[0] -= idx0 * m_outputStride[i];
indices[1] -= idx1 * m_outputStride[i];
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / m_outputStride[i];
const Index idx1 = indices[1] / m_outputStride[i];
startInputs[0] += idx0 * m_inputStride[i];
startInputs[1] += idx1 * m_inputStride[i];
indices[0] -= idx0 * m_outputStride[i];
indices[1] -= idx1 * m_outputStride[i];
}
}
startInputs[0] += indices[0];
startInputs[1] += indices[1];
@@ -344,10 +456,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
Index startInput = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStride[i];
startInput += idx * m_inputStride[i];
index -= idx * m_outputStride[i];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStride[i];
startInput += idx * m_inputStride[i];
index -= idx * m_outputStride[i];
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStride[i];
startInput += idx * m_inputStride[i];
index -= idx * m_outputStride[i];
}
}
startInput += index;
return startInput;
@@ -378,7 +498,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
EIGEN_STRONG_INLINE void preloadKernel() {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
// expression that needs to be evaluated)
const Scalar* in_place = m_kernelImpl.data();
@@ -431,11 +551,14 @@ struct GetKernelSize<Dynamic> {
}
};
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) {
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) {
extern __shared__ float s[];
const int first_x = blockIdx.x * maxX;
@@ -453,7 +576,7 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
}
__syncthreads();
@@ -476,9 +599,15 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In
}
};
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) {
extern __shared__ float s[];
const int first_x = blockIdx.x * maxX;
@@ -538,9 +667,15 @@ __global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::In
}
};
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) {
__global__ void EigenConvolutionKernel3D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const size_t numPlanes, const size_t numX,
const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
extern __shared__ float s[];
// Load inputs to shared memory
@@ -622,8 +757,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
@@ -712,10 +845,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const int numX = dimensions()[m_indices[0]];
const int numP = dimensions().TotalSize() / numX;
int maxX;
dim3 block_size;
if (m_indices[0] == 0) {
const int single_stride_dim =
static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: m_inputImpl.dimensions().rank() - 1;
if (m_indices[0] == single_stride_dim) {
// Maximum the reuse
const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
maxX = (std::min<int>)(inner_dim, numX);
@@ -747,7 +884,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const array<Index, 1> indices(m_indices[0]);
const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
switch(kernel_size) {
case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
@@ -765,11 +903,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
case 2: {
const int kernel_size_x = m_kernelImpl.dimensions()[0];
const int kernel_size_y = m_kernelImpl.dimensions()[1];
const int idxX =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
const int idxY =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
const int numX = dimensions()[m_indices[0]];
const int numY = dimensions()[m_indices[1]];
const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numP = dimensions().TotalSize() / (numX*numY);
const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
@@ -798,9 +940,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
//cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 2> indices(m_indices[0], m_indices[1]);
const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]);
internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
m_kernelImpl.dimensions()[idxY]);
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
switch (kernel_size_x) {
case 4: {
switch (kernel_size_y) {
@@ -837,13 +981,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
case 3: {
const int kernel_size_x = m_kernelImpl.dimensions()[0];
const int kernel_size_y = m_kernelImpl.dimensions()[1];
const int kernel_size_z = m_kernelImpl.dimensions()[2];
const int idxX =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
const int idxY =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
const int idxZ =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
const int numX = dimensions()[m_indices[0]];
const int numY = dimensions()[m_indices[1]];
const int numZ = dimensions()[m_indices[2]];
const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numZ = dimensions()[m_indices[idxZ]];
const int numP = dimensions().TotalSize() / (numX*numY*numZ);
const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
@@ -860,16 +1011,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
assert(shared_mem <= maxSharedMem);
//cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]);
const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]);
internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
m_indices[idxZ]);
const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
m_kernelImpl.dimensions()[idxY],
m_kernelImpl.dimensions()[idxZ]);
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break;
}
default: {
assert(false && "not supported yet");
EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
}
}
}

View File

@@ -32,8 +32,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
@@ -44,8 +43,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
@@ -56,8 +54,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
Difference difference(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
Assign assign(m_expression, difference);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
@@ -76,8 +73,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
return *this;
}
@@ -88,8 +84,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
return *this;
}
@@ -100,8 +95,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
Difference difference(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
Assign assign(m_expression, difference);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
return *this;
}
@@ -122,7 +116,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
return *this;
}
@@ -133,7 +127,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
return *this;
}
@@ -144,14 +138,13 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
Difference difference(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
Assign assign(m_expression, difference);
static const bool Vectorize = TensorEvaluator<const Assign, GpuDevice>::PacketAccess;
internal::TensorExecutor<const Assign, GpuDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
return *this;
}
protected:
const GpuDevice& m_device;
ExpressionType m_expression;
ExpressionType& m_expression;
};
#endif
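
A minimal usage sketch of the device-assignment interface these operators implement (DefaultDevice is assumed here for brevity; the ThreadPoolDevice and GpuDevice specializations are driven the same way):

    #include <unsupported/Eigen/CXX11/Tensor>

    Eigen::Tensor<float, 2> a(3, 3), b(3, 3), c(3, 3);
    a.setRandom();
    b.setRandom();
    Eigen::DefaultDevice dev;
    // operator= builds a TensorAssignOp and hands it to TensorExecutor for this device
    c.device(dev) = a + b;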

View File

@@ -145,39 +145,39 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
Sizes() { }
template <typename DenseIndex>
explicit Sizes(const array<DenseIndex, Base::count>& indices) {
explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex... indices) { }
explicit Sizes(std::initializer_list<std::size_t> l) {
template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
explicit Sizes(std::initializer_list<std::size_t>) {
// todo: add assertion
}
#else
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
#endif
template <typename T> Sizes& operator = (const T& other) {
template <typename T> Sizes& operator = (const T&) {
// to do: check the size of other
return *this;
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this);
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this);
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
}
};
@@ -343,7 +343,7 @@ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::s
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
};
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) {
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
}

View File

@@ -352,11 +352,12 @@ template<typename IfArgType, typename ThenArgType, typename ElseArgType, typenam
struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
{
typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* &
TensorEvaluator<IfArgType>::PacketAccess*/,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
internal::packet_traits<Scalar>::HasBlend,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
@@ -373,7 +374,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename internal::traits<XprType>::Packet PacketReturnType;
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
@@ -403,7 +403,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
internal::Selector<PacketSize> select;
for (Index i = 0; i < PacketSize; ++i) {
select.select[i] = m_condImpl.coeff(index+i);

View File

@@ -14,15 +14,16 @@
using Eigen::Tensor;
using Eigen::DefaultDevice;
template <int DataLayout>
static void test_evals()
{
Tensor<float, 2> input(3, 3);
Tensor<float, 1> kernel(2);
Tensor<float, 2, DataLayout> input(3, 3);
Tensor<float, 1, DataLayout> kernel(2);
input.setRandom();
kernel.setRandom();
Tensor<float, 2> result(2,3);
Tensor<float, 2, DataLayout> result(2,3);
result.setZero();
Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
@@ -41,16 +42,16 @@ static void test_evals()
VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5
}
template <int DataLayout>
static void test_expr()
{
Tensor<float, 2> input(3, 3);
Tensor<float, 2> kernel(2, 2);
Tensor<float, 2, DataLayout> input(3, 3);
Tensor<float, 2, DataLayout> kernel(2, 2);
input.setRandom();
kernel.setRandom();
Tensor<float, 2> result(2,2);
Eigen::array<ptrdiff_t, 2> dims{{0, 1}};
Tensor<float, 2, DataLayout> result(2,2);
Eigen::array<ptrdiff_t, 2> dims({0, 1});
result = input.convolve(kernel, dims);
VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
@@ -63,10 +64,10 @@ static void test_expr()
input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
}
template <int DataLayout>
static void test_modes() {
Tensor<float, 1> input(3);
Tensor<float, 1> kernel(3);
Tensor<float, 1, DataLayout> input(3);
Tensor<float, 1, DataLayout> kernel(3);
input(0) = 1.0f;
input(1) = 2.0f;
input(2) = 3.0f;
@@ -74,13 +75,13 @@ static void test_modes() {
kernel(1) = 1.0f;
kernel(2) = 0.0f;
const Eigen::array<ptrdiff_t, 1> dims{{0}};
const Eigen::array<ptrdiff_t, 1> dims({0});
Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
// Emulate VALID mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(0, 0);
Tensor<float, 1> valid(1);
Tensor<float, 1, DataLayout> valid(1);
valid = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(valid.dimension(0), 1);
VERIFY_IS_APPROX(valid(0), 2.5f);
@@ -88,7 +89,7 @@ static void test_modes() {
// Emulate SAME mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(1, 1);
Tensor<float, 1> same(3);
Tensor<float, 1, DataLayout> same(3);
same = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(same.dimension(0), 3);
VERIFY_IS_APPROX(same(0), 1.0f);
@@ -98,7 +99,7 @@ static void test_modes() {
// Emulate FULL mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(2, 2);
Tensor<float, 1> full(5);
Tensor<float, 1, DataLayout> full(5);
full = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(full.dimension(0), 5);
VERIFY_IS_APPROX(full(0), 0.0f);
@@ -108,18 +109,18 @@ static void test_modes() {
VERIFY_IS_APPROX(full(4), 1.5f);
}
template <int DataLayout>
static void test_strides() {
Tensor<float, 1> input(13);
Tensor<float, 1> kernel(3);
Tensor<float, 1, DataLayout> input(13);
Tensor<float, 1, DataLayout> kernel(3);
input.setRandom();
kernel.setRandom();
const Eigen::array<ptrdiff_t, 1> dims{{0}};
const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}};
const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}};
const Eigen::array<ptrdiff_t, 1> dims({0});
const Eigen::array<ptrdiff_t, 1> stride_of_3({3});
const Eigen::array<ptrdiff_t, 1> stride_of_2({2});
Tensor<float, 1> result;
Tensor<float, 1, DataLayout> result;
result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
VERIFY_IS_EQUAL(result.dimension(0), 2);
@@ -129,13 +130,14 @@ static void test_strides() {
input(12)*kernel(2)));
}
void test_cxx11_tensor_convolution()
{
CALL_SUBTEST(test_evals());
CALL_SUBTEST(test_expr());
CALL_SUBTEST(test_modes());
CALL_SUBTEST(test_strides());
CALL_SUBTEST(test_evals<ColMajor>());
CALL_SUBTEST(test_evals<RowMajor>());
CALL_SUBTEST(test_expr<ColMajor>());
CALL_SUBTEST(test_expr<RowMajor>());
CALL_SUBTEST(test_modes<ColMajor>());
CALL_SUBTEST(test_modes<RowMajor>());
CALL_SUBTEST(test_strides<ColMajor>());
CALL_SUBTEST(test_strides<RowMajor>());
}


@ -117,11 +117,10 @@ void test_cuda_elementwise()
}
}
void test_cuda_reduction()
{
Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113));
Tensor<float, 2> out(Eigen::array<int, 2>(72,97));
Tensor<float, 4> in1(72,53,97,113);
Tensor<float, 2> out(72,97);
in1.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
@ -138,8 +137,8 @@ void test_cuda_reduction()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113));
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
array<int, 2> reduction_axis;
reduction_axis[0] = 1;
@ -156,10 +155,10 @@ void test_cuda_reduction()
for (int k = 0; k < 53; ++k) {
for (int l = 0; l < 113; ++l) {
expected =
std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l)));
std::max<float>(expected, in1(i, k, j, l));
}
}
VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected);
VERIFY_IS_APPROX(out(i,j), expected);
}
}
}
@ -170,7 +169,7 @@ static void test_cuda_contraction()
// with these dimensions, the output has 300 * 140 = 42000 elements, which is
// more than 30 * 1024 = 30720, the number of threads in blocks on a 15-SM
// GK110 GPU
Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31));
Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
@ -196,12 +195,9 @@ static void test_cuda_contraction()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> >
gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1));
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
MapXf m_left(t_left.data(), 300, 93);
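// t_left (6,50,3,31) is viewed as a 300x93 matrix: 6*50 = 300 rows, 3*31 = 93 contracted columns.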
@ -226,11 +222,12 @@ static void test_cuda_contraction()
}
}
template<int DataLayout>
static void test_cuda_convolution_1d()
{
Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
Tensor<float, 1> kernel(Eigen::array<int, 1>(4));
Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137));
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 1, DataLayout> kernel(4);
Tensor<float, 4, DataLayout> out(74,34,11,137);
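// Convolving dimension 1 (size 37) with the length-4 kernel yields 37 - 4 + 1 = 34 outputs.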
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
@ -252,9 +249,9 @@ static void test_cuda_convolution_1d()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
Eigen::array<int, 1> dims(1);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
@ -266,11 +263,9 @@ static void test_cuda_convolution_1d()
for (int j = 0; j < 34; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(Eigen::array<int, 4>(i,j,k,l));
const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) +
input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) +
input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) +
input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3));
const float result = out(i,j,k,l);
const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
@ -278,12 +273,11 @@ static void test_cuda_convolution_1d()
}
}
static void test_cuda_convolution_2d()
static void test_cuda_convolution_inner_dim_col_major_1d()
{
Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4));
Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137));
Tensor<float, 4, ColMajor> input(74,9,11,7);
Tensor<float, 1, ColMajor> kernel(4);
Tensor<float, 4, ColMajor> out(71,9,11,7);
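// The first dimension is innermost in column-major order; convolving it (size 74)
// with the length-4 kernel yields 74 - 4 + 1 = 71 outputs.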
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
@ -305,9 +299,110 @@ static void test_cuda_convolution_2d()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137));
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
Eigen::array<int, 1> dims(0);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 71; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 7; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
static void test_cuda_convolution_inner_dim_row_major_1d()
{
Tensor<float, 4, RowMajor> input(7,9,11,74);
Tensor<float, 1, RowMajor> kernel(4);
Tensor<float, 4, RowMajor> out(7,9,11,71);
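// The last dimension is innermost in row-major order; convolving it (size 74)
// with the length-4 kernel yields 74 - 4 + 1 = 71 outputs.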
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
cudaMalloc((void**)(&d_input), input_bytes);
cudaMalloc((void**)(&d_kernel), kernel_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
Eigen::array<int, 1> dims(3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 71; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
template<int DataLayout>
static void test_cuda_convolution_2d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 2, DataLayout> kernel(3,4);
Tensor<float, 4, DataLayout> out(74,35,8,137);
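// Convolving dimensions 1 and 2 with the 3x4 kernel yields 37 - 3 + 1 = 35
// and 11 - 4 + 1 = 8 outputs respectively.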
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
cudaMalloc((void**)(&d_input), input_bytes);
cudaMalloc((void**)(&d_kernel), kernel_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
Eigen::array<int, 2> dims(1,2);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
@ -319,32 +414,32 @@ static void test_cuda_convolution_2d()
for (int j = 0; j < 35; ++j) {
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(Eigen::array<int, 4>(i,j,k,l));
const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) +
input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) +
input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) +
input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) +
input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) +
input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) +
input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) +
input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) +
input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) +
input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) +
input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) +
input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3));
VERIFY_IS_APPROX(result, expected);
const float result = out(i,j,k,l);
const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
input(i,j+1,k+0,l) * kernel(1,0) +
input(i,j+2,k+0,l) * kernel(2,0) +
input(i,j+0,k+1,l) * kernel(0,1) +
input(i,j+1,k+1,l) * kernel(1,1) +
input(i,j+2,k+1,l) * kernel(2,1) +
input(i,j+0,k+2,l) * kernel(0,2) +
input(i,j+1,k+2,l) * kernel(1,2) +
input(i,j+2,k+2,l) * kernel(2,2) +
input(i,j+0,k+3,l) * kernel(0,3) +
input(i,j+1,k+3,l) * kernel(1,3) +
input(i,j+2,k+3,l) * kernel(2,3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
template<int DataLayout>
static void test_cuda_convolution_3d()
{
Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17));
Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2));
Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17));
Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2);
Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17));
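// Convolving dimensions 1, 2 and 3 with the 3x4x2 kernel yields 37 - 3 + 1 = 35,
// 11 - 4 + 1 = 8, and 137 - 2 + 1 = 136 outputs respectively.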
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
@ -366,9 +461,9 @@ static void test_cuda_convolution_3d()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2));
Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
Eigen::array<int, 3> dims(1,2,3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
@ -381,31 +476,31 @@ static void test_cuda_convolution_3d()
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 136; ++l) {
for (int m = 0; m < 17; ++m) {
const float result = out(Eigen::array<int, 5>(i,j,k,l,m));
const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) +
input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) +
input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) +
input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) +
input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) +
input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) +
input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) +
input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) +
input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) +
input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) +
input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) +
input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) +
input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) +
input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) +
input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) +
input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) +
input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) +
input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) +
input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) +
input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) +
input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) +
input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) +
input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) +
input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1));
const float result = out(i,j,k,l,m);
const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
VERIFY_IS_APPROX(result, expected);
}
}
@ -414,91 +509,6 @@ static void test_cuda_convolution_3d()
}
}
static float* CudaCopyFloat(float* data, int size) {
const int nbytes = size * sizeof(float);
float* result = NULL;
if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) {
return NULL;
} else {
if (data != NULL) {
cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice);
}
return result;
}
}
static void test_cuda_constant_broadcast()
{
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Tensor<float, 1> t1(10);
for (int i = 0; i < 10; ++i) {
t1(i) = 10.0f * i;
}
float* t1_cuda = CudaCopyFloat(t1.data(), t1.size());
Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10);
Tensor<float, 1> t2(1);
t2 = t2.constant(20.0f);
float* t2_cuda = CudaCopyFloat(t2.data(), t2.size());
Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1);
float* t3_cuda = CudaCopyFloat(NULL, 10);
Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10);
t3_gpu.device(gpu_device) =
t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10));
Eigen::Tensor<float, 1> t3(10);
cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float),
cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; ++i) {
VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
}
}
void test_cuda_cast()
{
Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97));
Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
in.setRandom();
std::size_t in_bytes = in.size() * sizeof(double);
std::size_t out_bytes = out.size() * sizeof(float);
double* d_in;
float* d_out;
cudaMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
gpu_out.device(gpu_device) = gpu_in.template cast<float>();
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
for (int k = 0; k < 97; ++k) {
VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k))));
}
}
}
}
void test_cxx11_tensor_cuda()
{
CALL_SUBTEST(test_cuda_elementwise_small());
@ -506,9 +516,12 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST(test_cuda_reduction());
CALL_SUBTEST(test_cuda_contraction<ColMajor>());
CALL_SUBTEST(test_cuda_contraction<RowMajor>());
CALL_SUBTEST(test_cuda_convolution_1d());
CALL_SUBTEST(test_cuda_convolution_2d());
CALL_SUBTEST(test_cuda_convolution_3d());
CALL_SUBTEST(test_cuda_constant_broadcast());
CALL_SUBTEST(test_cuda_cast());
CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>());
CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>());
CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d());
CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d());
CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>());
CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>());
CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>());
CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>());
}