diff --git a/Eigen/Core b/Eigen/Core
index b7205bda5..16007d0c9 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -310,6 +310,7 @@ using std::ptrdiff_t;
   #include "src/Core/arch/NEON/PacketMath.h"
   #include "src/Core/arch/NEON/MathFunctions.h"
   #include "src/Core/arch/NEON/Complex.h"
+  #include "src/Core/arch/NEON/BlockingSizesLookupTables.h"
 #endif
 
 #if defined EIGEN_VECTORIZE_CUDA
@@ -383,6 +384,7 @@ using std::ptrdiff_t;
 #include "src/Core/Inverse.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
+#include "src/Core/products/LookupBlockingSizesTable.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
 #include "src/Core/products/Parallelizer.h"
 #include "src/Core/ProductEvaluators.h"
diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers
index c06668bd2..7fab9eed0 100644
--- a/Eigen/IterativeLinearSolvers
+++ b/Eigen/IterativeLinearSolvers
@@ -12,24 +12,26 @@
   * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
   * Those solvers are accessible via the following classes:
   *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - LeastSquaresConjugateGradient for rectangular least-square problems,
   *  - BiCGSTAB for general square matrices.
   *
   * These iterative solvers are associated with some preconditioners:
   *  - IdentityPreconditioner - not really useful
   *  - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
-  *  - IncompleteILUT - incomplete LU factorization with dual thresholding
+  *  - IncompleteLUT - incomplete LU factorization with dual thresholding
   *
   * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
   *
-  * \code
-  * #include <Eigen/IterativeLinearSolvers>
-  * \endcode
+    \code
+    #include <Eigen/IterativeLinearSolvers>
+    \endcode
   */
 
 #include "src/IterativeLinearSolvers/SolveWithGuess.h"
 #include "src/IterativeLinearSolvers/IterativeSolverBase.h"
 #include "src/IterativeLinearSolvers/BasicPreconditioners.h"
 #include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"
 
diff --git a/Eigen/Sparse b/Eigen/Sparse
index 7cc9c0913..a540f0eec 100644
--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -11,9 +11,9 @@
   * - \ref SparseQR_Module
   * - \ref IterativeLinearSolvers_Module
   *
-  * \code
-  * #include <Eigen/Sparse>
-  * \endcode
+    \code
+    #include <Eigen/Sparse>
+    \endcode
   */
 
 #include "SparseCore"
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index f46f7b758..93a726483 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -226,6 +226,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
     #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
 
     /** \internal
       * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
@@ -424,6 +429,8 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 template<typename MatrixType, int _UpLo>
 LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 {
+  check_template_parameters();
+  
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
 
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 629c87161..745b74d95 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -170,6 +170,12 @@ template<typename _MatrixType, int _UpLo> class LLT
     #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     /** \internal
       * Used to compute and store L
       * The strict upper part is not used and even not initialized.
@@ -377,6 +383,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
 template<typename MatrixType, int _UpLo>
 LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 {
+  check_template_parameters();
+  
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
   m_matrix.resize(size, size);
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 9485080d3..ce00566a5 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -647,11 +647,15 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
     HasNoStride = HasNoInnerStride && HasNoOuterStride,
     IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
     IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+    
+    // TODO: should check for smaller packet types once we can handle multi-sized packet types
+    AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar),
+    
     KeepsPacketAccess = bool(HasNoInnerStride)
                         && ( bool(IsDynamicSize)
                            || HasNoOuterStride
                            || ( OuterStrideAtCompileTime!=Dynamic
-                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
+                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime) % AlignBytes)==0 ) ),
     Flags0 = evaluator<PlainObjectType>::Flags,
     Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
     Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
@@ -717,7 +721,10 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
                        && (InnerStrideAtCompileTime == 1)
                         ? PacketAccessBit : 0,
     
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
+    // TODO: should check for smaller packet types once we can handle multi-sized packet types
+    AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar),
+    
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % AlignBytes) == 0)) ? AlignedBit : 0,
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
     FlagsRowMajorBit = XprType::Flags&RowMajorBit,
     Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
@@ -825,12 +832,16 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
                       typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
     : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
   {
+    // TODO: should check for smaller packet types once we can handle multi-sized packet types
+    const int AlignBytes = int(packet_traits<Scalar>::size) * sizeof(Scalar);
+    EIGEN_ONLY_USED_FOR_DEBUG(AlignBytes)
     // FIXME this should be an internal assertion
-    eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
+    eigen_assert(EIGEN_IMPLIES(evaluator<XprType>::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned");
   }
 };
 
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 9186f59a7..ab41641f4 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -34,14 +34,35 @@ void check_static_allocation_size()
   #endif
 }
 
+template<typename T, int Size, typename Packet = typename packet_traits<T>::type,
+         bool Match     = bool((Size%unpacket_traits<Packet>::size)==0),
+         bool TryHalf   =  bool(int(unpacket_traits<Packet>::size) > Size)
+                        && bool(int(unpacket_traits<Packet>::size) > int(unpacket_traits<typename unpacket_traits<Packet>::half>::size)) >
+struct compute_default_alignment
+{
+  enum { value = 0 };
+};
+
+template<typename T, int Size, typename Packet>
+struct compute_default_alignment<T, Size, Packet, true, false> // Match
+{
+  enum { value = sizeof(T) * unpacket_traits<Packet>::size };
+};
+
+template<typename T, int Size, typename Packet>
+struct compute_default_alignment<T, Size, Packet, false, true>
+{
+  // current packet too large, try with an half-packet
+  enum { value = compute_default_alignment<T, Size, typename unpacket_traits<Packet>::half>::value };
+};
+
 /** \internal
   * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
   * to 16 bytes boundary if the total size is a multiple of 16 bytes.
   */
 template <typename T, int Size, int MatrixOrArrayOptions,
           int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
-                        : 0 >
+                        : compute_default_alignment<T,Size>::value >
 struct plain_array
 {
   T array[Size];
@@ -81,14 +102,71 @@ struct plain_array
 #endif
 
 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
+struct plain_array<T, Size, MatrixOrArrayOptions, 8>
 {
-  EIGEN_USER_ALIGN_DEFAULT T array[Size];
+  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
 
   EIGEN_DEVICE_FUNC
   plain_array() 
   { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  { 
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 32>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  { 
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 64>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  { 
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
     check_static_allocation_size<T,Size>();
   }
 
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 878f38e92..e1b233d82 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -473,48 +473,48 @@ struct random_default_impl<Scalar, false, false>
 };
 
 enum {
-  floor_log2_terminate,
-  floor_log2_move_up,
-  floor_log2_move_down,
-  floor_log2_bogus
+  meta_floor_log2_terminate,
+  meta_floor_log2_move_up,
+  meta_floor_log2_move_down,
+  meta_floor_log2_bogus
 };
 
-template<unsigned int n, int lower, int upper> struct floor_log2_selector
+template<unsigned int n, int lower, int upper> struct meta_floor_log2_selector
 {
   enum { middle = (lower + upper) / 2,
-         value = (upper <= lower + 1) ? int(floor_log2_terminate)
-               : (n < (1 << middle)) ? int(floor_log2_move_down)
-               : (n==0) ? int(floor_log2_bogus)
-               : int(floor_log2_move_up)
+         value = (upper <= lower + 1) ? int(meta_floor_log2_terminate)
+               : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+               : (n==0) ? int(meta_floor_log2_bogus)
+               : int(meta_floor_log2_move_up)
   };
 };
 
 template<unsigned int n,
          int lower = 0,
          int upper = sizeof(unsigned int) * CHAR_BIT - 1,
-         int selector = floor_log2_selector<n, lower, upper>::value>
-struct floor_log2 {};
+         int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_down>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down>
 {
-  enum { value = floor_log2<n, lower, floor_log2_selector<n, lower, upper>::middle>::value };
+  enum { value = meta_floor_log2<n, lower, meta_floor_log2_selector<n, lower, upper>::middle>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_up>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up>
 {
-  enum { value = floor_log2<n, floor_log2_selector<n, lower, upper>::middle, upper>::value };
+  enum { value = meta_floor_log2<n, meta_floor_log2_selector<n, lower, upper>::middle, upper>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_terminate>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate>
 {
   enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_bogus>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
 {
   // no value, error at compile time
 };
@@ -522,11 +522,24 @@ struct floor_log2<n, lower, upper, floor_log2_bogus>
 template<typename Scalar>
 struct random_default_impl<Scalar, false, true>
 {
-  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
-
   static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1)));
+  { 
+    using std::max;
+    using std::min;
+    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
+    if(y<x)
+      return x;
+    std::size_t range = ScalarX(y)-ScalarX(x);
+    std::size_t offset = 0;
+    // rejection sampling
+    std::size_t divisor    = (range+RAND_MAX-1)/(range+1);
+    std::size_t multiplier = (range+RAND_MAX-1)/std::size_t(RAND_MAX);
+
+    do {
+      offset = ( (std::size_t(std::rand()) * multiplier) / divisor );
+    } while (offset > range);
+
+    return Scalar(ScalarX(x) + offset);
   }
 
   static inline Scalar run()
@@ -534,7 +547,7 @@ struct random_default_impl<Scalar, false, true>
 #ifdef EIGEN_MAKING_DOCS
     return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
 #else
-    enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
            scalar_bits = sizeof(Scalar) * CHAR_BIT,
            shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
            offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 0cb117949..ea5a2bd5c 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -105,7 +105,8 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
       OuterStrideMatch = Derived::IsVectorAtCompileTime
                       || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
       AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
-      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch
+      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
     };
     typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
   };
@@ -184,9 +185,11 @@ protected:
 template<typename PlainObjectType, int Options, typename StrideType> class Ref
   : public RefBase<Ref<PlainObjectType, Options, StrideType> >
 {
+  private:
     typedef internal::traits<Ref> Traits;
     template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr);
+    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
   public:
 
     typedef RefBase<Ref> Base;
@@ -195,13 +198,15 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     {
       EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
       Base::construct(expr.derived());
     }
     template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     #else
     template<typename Derived>
     inline Ref(DenseBase<Derived>& expr)
@@ -228,7 +233,8 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
     EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
 
     template<typename Derived>
-    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
     {
 //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
 //      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 19749c832..ceed1d1ef 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(cons
 }
 #endif
 
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
   return make_double2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
   to[stride*2] = from.z;
   to[stride*3] = from.w;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
 }
@@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a)
 }
 
 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
-  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
-  return make_double2(abs(a.x), abs(a.y));
+  return make_double2(fabs(a.x), fabs(a.y));
 }
 
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
@@ -279,7 +279,7 @@ ptranspose(PacketBlock<float4,4>& kernel) {
   kernel.packet[3].z = tmp;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<double2,2>& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
new file mode 100644
index 000000000..5007c155d
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
+#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
+
+namespace Eigen {
+namespace internal {
+
+/* The following lookup table was generated from measurements on a Nexus 5,
+ * which has a Qualcomm Krait 400 CPU. This is very representative of current
+ * 32bit (ARMv7) Android devices. On the other hand, I don't know how
+ * representative that is outside of these conditions. Accordingly,
+ * let's only use this lookup table on ARM 32bit on Android for now.
+ *
+ * Measurements were single-threaded, with Scalar=float, compiled with
+ * -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32.
+ *
+ * The device was cooled, allowing it to run a the max clock speed throughout.
+ * This may not be representative of real-world thermal conditions.
+ *
+ * The benchmark attempted to flush caches to test cold-cache performance.
+ */
+#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID
+template<>
+struct BlockingSizesLookupTable<float, float> {
+  static const size_t BaseSize = 16;
+  static const size_t NumSizes = 8;
+  static const unsigned short* Data() {
+    static const unsigned short data[512] = {
+      0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447,
+      0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456,
+      0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467,
+      0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478,
+      0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476,
+      0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488,
+      0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6,
+      0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5,
+      0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b,
+      0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b,
+      0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b,
+      0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576,
+      0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587,
+      0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597,
+      0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6,
+      0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6,
+      0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b,
+      0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b,
+      0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667,
+      0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675,
+      0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686,
+      0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697,
+      0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696,
+      0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696,
+      0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747,
+      0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757,
+      0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766,
+      0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777,
+      0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788,
+      0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796,
+      0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796,
+      0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796,
+      0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848,
+      0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857,
+      0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865,
+      0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877,
+      0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886,
+      0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896,
+      0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896,
+      0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896,
+      0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848,
+      0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858,
+      0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867,
+      0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877,
+      0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887,
+      0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896,
+      0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896,
+      0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896,
+      0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848,
+      0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857,
+      0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867,
+      0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
+      0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887,
+      0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897,
+      0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896,
+      0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896,
+      0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946,
+      0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856,
+      0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867,
+      0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
+      0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887,
+      0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897,
+      0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896,
+      0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896
+    };
+    return data;
+  }
+};
+#endif
+
+}
+}
+
+#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 154daa7a7..c7fb12fe8 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -272,7 +272,7 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 }
 
 //---------- double ----------
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
 static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
 
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 8dd1e1370..ce0abfd80 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -518,7 +518,19 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
 }
 
 //---------- double ----------
-#if EIGEN_ARCH_ARM64
+
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
+// Confirmed at least with __apple_build_version__ = 6000054.
+#ifdef __apple_build_version__
+// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
+// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
+// major toolchain updates.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
 #if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
 // Bug 907: workaround missing declarations of the following two functions in the ADK
@@ -541,12 +553,12 @@ typedef float64x1_t Packet1d;
 template<> struct packet_traits<double>  : default_packet_traits
 {
   typedef Packet2d type;
-  typedef Packet1d half;
+  typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket=1,
+    HasHalfPacket=0,
    
     HasDiv  = 1,
     // FIXME check the Has*
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index a3b5f50b1..4e03761ea 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -55,6 +55,34 @@ struct functor_traits<scalar_abs_op<Scalar> >
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the score of a scalar, to chose a pivot
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
+{
+  typedef void Score_is_abs;
+};
+template<typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
+template<typename Scalar, typename=void> struct abs_knowing_score
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); }
+};
+template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
+};
+
 /** \internal
   * \brief Template functor to compute the squared absolute value of a scalar
   *
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 9061fd936..d32377a00 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -25,21 +25,31 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
   return a<=0 ? b : a;
 }
 
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#else
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#endif
+
 /** \internal */
 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
   static bool m_cache_sizes_initialized = false;
-  static std::ptrdiff_t m_l1CacheSize = 32*1024;
-  static std::ptrdiff_t m_l2CacheSize = 256*1024;
-  static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
+  static std::ptrdiff_t m_l1CacheSize = 0;
+  static std::ptrdiff_t m_l2CacheSize = 0;
+  static std::ptrdiff_t m_l3CacheSize = 0;
 
   if(!m_cache_sizes_initialized)
   {
     int l1CacheSize, l2CacheSize, l3CacheSize;
     queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
-    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
-    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
-    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
+    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
     m_cache_sizes_initialized = true;
   }
 
@@ -64,43 +74,23 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
   }
 }
 
-/** \brief Computes the blocking parameters for a m x k times k x n matrix product
-  *
-  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
-  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
-  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
-  *
-  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
-  * this function computes the blocking size parameters along the respective dimensions
-  * for matrix products and related algorithms. The blocking sizes depends on various
-  * parameters:
-  * - the L1 and L2 cache sizes,
-  * - the register level blocking sizes defined by gebp_traits,
-  * - the number of scalars that fit into a packet (when vectorization is enabled).
-  *
-  * \sa setCpuCacheSizes */
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depends on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
 
 template<typename LhsScalar, typename RhsScalar, int KcFactor>
-void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
-  EIGEN_UNUSED_VARIABLE(num_threads);
-  enum {
-    kr = 8,
-    mr = Traits::mr,
-    nr = Traits::nr
-  };
-  k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
-  if (k > kr) k -= k % kr;
-  m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
-  if (m > mr) m -= m % mr;
-  n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
-  if (n > nr) n -= n % nr;
-  return;
-#endif
-
   // Explanations:
   // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
   // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
@@ -155,7 +145,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
     // In unit tests we do not want to use extra large matrices,
     // so we reduce the cache size to check the blocking strategy is not flawed
 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    l1 = 4*1024;
+    l1 = 9*1024;
     l2 = 32*1024;
     l3 = 512*1024;
 #endif
@@ -164,7 +154,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
     // Perhaps it would make more sense to consider k*n*m??
     // Note that for very tiny problem, this function should be bypassed anyway
     // because we use the coefficient-based implementation for them.
-    if(std::max(k,std::max(m,n))<48)
+    if((std::max)(k,(std::max)(m,n))<48)
       return;
     
     typedef typename Traits::ResScalar ResScalar;
@@ -211,8 +201,22 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
     // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
     // The second half is implicitly reserved to access the result and lhs coefficients.
     // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
-    // to limit this growth: we bound nc to growth by a factor x1.5, leading to:
-    const Index max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    // to limit this growth: we bound nc to growth by a factor x1.5.
+    // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1- k_sub - lhs_bytes;
+    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+    {
+      // L1 blocking
+      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+    }
+    else
+    {
+      // L2 blocking
+      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    }
     // WARNING Below, we assume that Traits::nr is a power of two.
     Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
     if(n>nc)
@@ -228,6 +232,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
     {
       // So far, no blocking at all, i.e., kc==k, and nc==n.
       // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
       Index problem_size = k*n*sizeof(LhsScalar);
       Index actual_lm = actual_l2;
       Index max_mc = m;
@@ -254,6 +259,60 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   }
 }
 
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    return true;
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(k)
+  EIGEN_UNUSED_VARIABLE(m)
+  EIGEN_UNUSED_VARIABLE(n)
+#endif
+  return false;
+}
+
+/** \brief Computes the blocking parameters for a m x k times k x n matrix product
+  *
+  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+  *
+  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+  * this function computes the blocking size parameters along the respective dimensions
+  * for matrix products and related algorithms.
+  *
+  * The blocking size parameters may be evaluated:
+  *   - either by a heuristic based on cache sizes;
+  *   - or using a precomputed lookup table;
+  *   - or using fixed prescribed values (for testing purposes).
+  *
+  * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  if (!useSpecificBlockingSizes(k, m, n)) {
+    if (!lookupBlockingSizesFromTable<LhsScalar, RhsScalar>(k, m, n, num_threads)) {
+      evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
+    }
+  }
+
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  enum {
+    kr = 8,
+    mr = Traits::mr,
+    nr = Traits::nr
+  };
+  if (k > kr) k -= k % kr;
+  if (m > mr) m -= m % mr;
+  if (n > nr) n -= n % nr;
+}
+
 template<typename LhsScalar, typename RhsScalar>
 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
@@ -949,19 +1008,31 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     // This corresponds to 3*LhsProgress x nr register blocks.
     // Usually, make sense only with FMA
     if(mr>=3*Traits::LhsProgress)
-    {
+    {      
       PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
-
-      // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
-      for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
+      
+      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+      // and on each largest micro vertical panel of the rhs (depth * nr).
+      // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
+      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+      // This actual number of rows is computed as follow:
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
+      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
       {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
+        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
         for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
+          
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
           // stored into 3 x nr registers.
           
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
           prefetch(&blA[0]);
 
           // gets res block as register
@@ -1093,12 +1164,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           traits.acc(C11, alphav, R2);
           r3.storePacket(0 * Traits::ResPacketSize, R0);
           r3.storePacket(1 * Traits::ResPacketSize, R1);
-          r3.storePacket(2 * Traits::ResPacketSize, R2);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
 
         // Deal with remaining columns of the rhs
         for(Index j2=packet_cols4; j2<cols; j2++)
         {
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
           // One column at a time
           const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
           prefetch(&blA[0]);
@@ -1169,7 +1243,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           traits.acc(C8, alphav, R2);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
           r0.storePacket(1 * Traits::ResPacketSize, R1);
-          r0.storePacket(2 * Traits::ResPacketSize, R2);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
       }
     }
@@ -1177,13 +1252,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     //---------- Process 2 * LhsProgress rows at once ----------
     if(mr>=2*Traits::LhsProgress)
     {
-      // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth)
-      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*LhsProgress)
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+
+      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
       {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
+        Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
         for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          // We select a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
+          
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
           // stored into 2 x nr registers.
           
           const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
@@ -1287,11 +1370,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r2.storePacket(1 * Traits::ResPacketSize, R1);
           r3.storePacket(0 * Traits::ResPacketSize, R2);
           r3.storePacket(1 * Traits::ResPacketSize, R3);
+          }
         }
-
+      
         // Deal with remaining columns of the rhs
         for(Index j2=packet_cols4; j2<cols; j2++)
         {
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
           // One column at a time
           const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
           prefetch(&blA[0]);
@@ -1358,6 +1444,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           traits.acc(C4, alphav, R1);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
           r0.storePacket(1 * Traits::ResPacketSize, R1);
+          }
         }
       }
     }
@@ -1479,17 +1566,17 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
           for(Index k=0; k<peeled_kc; k+=pk)
           {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
             RhsPacket B_0;
         
 #define EIGEN_GEBGP_ONESTEP(K) \
             do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");        \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
               traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
               traits.madd(A0, B_0, C0, B_0);                                    \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");          \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
             } while(false);
 
             EIGEN_GEBGP_ONESTEP(0);
@@ -1504,7 +1591,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             blB += pk*RhsProgress;
             blA += pk*1*Traits::LhsProgress;
 
-            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
           }
 
           // process remaining peeled loop
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index c76f48154..7fd707ed7 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -457,6 +457,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
   {
     eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+    if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+      return;
 
     typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h
new file mode 100644
index 000000000..5ab4525df
--- /dev/null
+++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h
@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
+#define EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename LhsScalar,
+          typename RhsScalar,
+          bool HasLookupTable = BlockingSizesLookupTable<LhsScalar, RhsScalar>::NumSizes != 0 >
+struct LookupBlockingSizesFromTableImpl
+{
+  static bool run(Index&, Index&, Index&, Index)
+  {
+    return false;
+  }
+};
+
+inline size_t floor_log2_helper(unsigned short& x, size_t offset)
+{
+  unsigned short y = x >> offset;
+  if (y) {
+    x = y;
+    return offset;
+  } else {
+    return 0;
+  }
+}
+
+inline size_t floor_log2(unsigned short x)
+{
+  return floor_log2_helper(x, 8)
+       + floor_log2_helper(x, 4)
+       + floor_log2_helper(x, 2)
+       + floor_log2_helper(x, 1);
+}
+
+inline size_t ceil_log2(unsigned short x)
+{
+  return x > 1 ? floor_log2(x - 1) + 1 : 0;
+}
+
+template <typename LhsScalar,
+          typename RhsScalar>
+struct LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar, true>
+{
+  static bool run(Index& k, Index& m, Index& n, Index)
+  {
+    using std::min;
+    using std::max;
+    typedef BlockingSizesLookupTable<LhsScalar, RhsScalar> Table;
+    const unsigned short minsize = Table::BaseSize;
+    const unsigned short maxsize = minsize << (Table::NumSizes - 1);
+    const unsigned short k_clamped = max<unsigned short>(minsize, min<Index>(k, maxsize));
+    const unsigned short m_clamped = max<unsigned short>(minsize, min<Index>(m, maxsize));
+    const unsigned short n_clamped = max<unsigned short>(minsize, min<Index>(n, maxsize));
+    const size_t k_index = ceil_log2(k_clamped / minsize);
+    const size_t m_index = ceil_log2(m_clamped / minsize);
+    const size_t n_index = ceil_log2(n_clamped / minsize);
+    const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index);
+    const unsigned short table_entry = Table::Data()[index];
+    k = min<Index>(k, 1 << ((table_entry & 0xf00) >> 8));
+    m = min<Index>(m, 1 << ((table_entry & 0x0f0) >> 4));
+    n = min<Index>(n, 1 << ((table_entry & 0x00f) >> 0));
+    return true;
+  }
+};
+
+template <typename LhsScalar,
+          typename RhsScalar>
+bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads)
+{
+  return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads);
+}
+
+}
+
+}
+
+#endif // EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 9bfa45106..ffeb5ac5f 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -214,7 +214,7 @@ class blas_data_mapper {
   }
 
   template<typename SubPacket>
-  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
+  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
     pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
   }
 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 0d24beb5a..503d5acdf 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -288,6 +288,14 @@ struct stem_function
   typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
   typedef ComplexScalar type(ComplexScalar, int);
 };
+
+template <typename LhsScalar,
+          typename RhsScalar>
+struct BlockingSizesLookupTable
+{
+  static const size_t NumSizes = 0;
+};
+
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index aaea9f035..6b294e77f 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -318,6 +318,9 @@
 // Defined the boundary (in bytes) on which the data needs to be aligned. Note
 // that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
 // aligned at all regardless of the value of this #define.
+// TODO should be renamed EIGEN_MAXIMAL_ALIGN_BYTES,
+//      for instance with AVX 1 EIGEN_MAXIMAL_ALIGN_BYTES=32 while for 'int' 16 bytes alignment is always enough,
+//      and 16 bytes alignment is also enough for Vector4f.
 #define EIGEN_ALIGN_BYTES 16
 
 #ifdef EIGEN_DONT_ALIGN
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 528ebe297..562f425bd 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -159,13 +159,16 @@ class compute_matrix_evaluator_flags
     enum {
       row_major_bit = Options&RowMajor ? RowMajorBit : 0,
       is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
+      
+      // TODO: should check for smaller packet types once we can handle multi-sized packet types
+      align_bytes = int(packet_traits<Scalar>::size) * sizeof(Scalar),
 
       aligned_bit =
       (
             ((Options&DontAlign)==0)
         && (
 #if EIGEN_ALIGN_STATICALLY
-             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
+             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % align_bytes) == 0))
 #else
              0
 #endif
diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index 075a62848..6b010c312 100644
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -234,6 +234,12 @@ template<typename _MatrixType> class ComplexEigenSolver
     }
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     EigenvectorType m_eivec;
     EigenvalueType m_eivalues;
     ComplexSchur<MatrixType> m_schur;
@@ -251,6 +257,8 @@ template<typename MatrixType>
 ComplexEigenSolver<MatrixType>& 
 ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   // this code is inspired from Jampack
   eigen_assert(matrix.cols() == matrix.rows());
 
diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index a63a42341..167cd99ab 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -299,6 +299,13 @@ template<typename _MatrixType> class EigenSolver
     void doComputeEigenvectors();
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
     MatrixType m_eivec;
     EigenvalueType m_eivalues;
     bool m_isInitialized;
@@ -366,6 +373,8 @@ template<typename MatrixType>
 EigenSolver<MatrixType>& 
 EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   using std::sqrt;
   using std::abs;
   using numext::isfinite;
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index c9da6740a..e2e28cd4a 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -263,6 +263,13 @@ template<typename _MatrixType> class GeneralizedEigenSolver
     }
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
     MatrixType m_eivec;
     ComplexVectorType m_alphas;
     VectorType m_betas;
@@ -290,6 +297,8 @@ template<typename MatrixType>
 GeneralizedEigenSolver<MatrixType>&
 GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   using std::sqrt;
   using std::abs;
   eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 66d1154cf..1dcfacf0b 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -347,6 +347,11 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     static const int m_maxIterations = 30;
 
   protected:
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     MatrixType m_eivec;
     RealVectorType m_eivalues;
     typename TridiagonalizationType::SubDiagonalType m_subdiag;
@@ -382,6 +387,8 @@ EIGEN_DEVICE_FUNC
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 ::compute(const MatrixType& matrix, int options)
 {
+  check_template_parameters();
+  
   using std::abs;
   eigen_assert(matrix.cols() == matrix.rows());
   eigen_assert((options&~(EigVecMask|GenEigMask))==0
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index e1ad803bb..e90ce77eb 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -441,7 +441,7 @@ QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) c
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
   return internal::quat_product<Architecture::Target, Derived, OtherDerived,
                          typename internal::traits<Derived>::Scalar,
-                         internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned>::run(*this, other);
+                         (internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned)?Aligned:Unaligned>::run(*this, other);
 }
 
 /** \sa operator*(Quaternion) */
@@ -646,6 +646,16 @@ inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Der
   }
 }
 
+// Generic conjugate of a Quaternion
+namespace internal {
+template<int Arch, class Derived, typename Scalar, int _Options> struct quat_conj
+{
+  static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived>& q){
+    return Quaternion<Scalar>(q.w(),-q.x(),-q.y(),-q.z());
+  }
+};
+}
+                         
 /** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
   * if the quaternion is normalized.
   * The conjugate of a quaternion represents the opposite rotation.
@@ -656,7 +666,10 @@ template <class Derived>
 inline Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::conjugate() const
 {
-  return Quaternion<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+  return internal::quat_conj<Architecture::Target, Derived,
+                         typename internal::traits<Derived>::Scalar,
+                         internal::traits<Derived>::IsAligned?Aligned:Unaligned>::run(*this);
+                         
 }
 
 /** \returns the angle (in radian) between two rotations
@@ -667,12 +680,10 @@ template <class OtherDerived>
 inline typename internal::traits<Derived>::Scalar
 QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
 {
-  using std::acos;
+  using std::atan2;
   using std::abs;
-  Scalar d = abs(this->dot(other));
-  if (d>=Scalar(1))
-    return Scalar(0);
-  return Scalar(2) * acos(d);
+  Quaternion<Scalar> d = (*this) * other.conjugate();
+  return Scalar(2) * atan2( d.vec().norm(), abs(d.w()) );
 }
 
  
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index 3d8284f2d..e59c32c56 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -20,23 +20,35 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned>
 {
   static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
     Quaternion<float> res;
+    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
     __m128 a = _a.coeffs().template packet<Aligned>(0);
     __m128 b = _b.coeffs().template packet<Aligned>(0);
-    __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),
-                                         vec4f_swizzle1(b,2,0,1,2)),mask);
-    __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),
-                                         vec4f_swizzle1(b,0,1,2,1)),mask);
+    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
     pstore(&res.x(),
               _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
                                     _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
                                                vec4f_swizzle1(b,1,2,0,0))),
-                         _mm_add_ps(flip1,flip2)));
+                         _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
+    
     return res;
   }
 };
 
+template<class Derived, int Alignment>
+struct quat_conj<Architecture::SSE, Derived, float, Alignment>
+{
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
+  {
+    Quaternion<float> res;
+    const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
+    pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<Alignment>(0)));
+    return res;
+  }
+};
+
+
 template<typename VectorLhs,typename VectorRhs>
 struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
 {
@@ -56,8 +68,8 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
 
 
 
-template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
+template<class Derived, class OtherDerived, int Alignment>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
 {
   static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
@@ -66,8 +78,8 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
   Quaternion<double> res;
 
   const double* a = _a.coeffs().data();
-  Packet2d b_xy = _b.coeffs().template packet<Aligned>(0);
-  Packet2d b_zw = _b.coeffs().template packet<Aligned>(2);
+  Packet2d b_xy = _b.coeffs().template packet<Alignment>(0);
+  Packet2d b_zw = _b.coeffs().template packet<Alignment>(2);
   Packet2d a_xx = pset1<Packet2d>(a[0]);
   Packet2d a_yy = pset1<Packet2d>(a[1]);
   Packet2d a_zz = pset1<Packet2d>(a[2]);
@@ -108,6 +120,20 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
 }
 };
 
+template<class Derived, int Alignment>
+struct quat_conj<Architecture::SSE, Derived, double, Alignment>
+{
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
+  {
+    Quaternion<double> res;
+    const __m128d mask0 = _mm_setr_pd(-0.,-0.);
+    const __m128d mask2 = _mm_setr_pd(-0.,0.);
+    pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<Alignment>(0)));
+    pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<Alignment>(2)));
+    return res;
+  }
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index a09f81225..3710a8209 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -17,9 +17,9 @@ namespace Eigen {
   *
   * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix.
   * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
-  * \code
-  * A.diagonal().asDiagonal() . x = b
-  * \endcode
+    \code
+    A.diagonal().asDiagonal() . x = b
+    \endcode
   *
   * \tparam _Scalar the type of the scalar.
   *
@@ -28,6 +28,7 @@ namespace Eigen {
   *
   * \note A variant that has yet to be implemented would attempt to preserve the norm of each column.
   *
+  * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient
   */
 template <typename _Scalar>
 class DiagonalPreconditioner
@@ -100,6 +101,69 @@ class DiagonalPreconditioner
     bool m_isInitialized;
 };
 
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief Jacobi preconditioner for LeastSquaresConjugateGradient
+  *
+  * This class allows to approximately solve for A' A x  = A' b problems assuming A' A is a diagonal matrix.
+  * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
+    \code
+    (A.adjoint() * A).diagonal().asDiagonal() * x = b
+    \endcode
+  *
+  * \tparam _Scalar the type of the scalar.
+  *
+  * The diagonal entries are pre-inverted and stored into a dense vector.
+  * 
+  * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner
+  */
+template <typename _Scalar>
+class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
+{
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef DiagonalPreconditioner<_Scalar> Base;
+    using Base::m_invdiag;
+  public:
+
+    LeastSquareDiagonalPreconditioner() : Base() {}
+
+    template<typename MatType>
+    explicit LeastSquareDiagonalPreconditioner(const MatType& mat) : Base()
+    {
+      compute(mat);
+    }
+
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& analyzePattern(const MatType& )
+    {
+      return *this;
+    }
+    
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& factorize(const MatType& mat)
+    {
+      // Compute the inverse squared-norm of each column of mat
+      m_invdiag.resize(mat.cols());
+      for(Index j=0; j<mat.outerSize(); ++j)
+      {
+        RealScalar sum = mat.innerVector(j).squaredNorm();
+        if(sum>0)
+          m_invdiag(j) = RealScalar(1)/sum;
+        else
+          m_invdiag(j) = RealScalar(1);
+      }
+      Base::m_isInitialized = true;
+      return *this;
+    }
+    
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& compute(const MatType& mat)
+    {
+      return factorize(mat);
+    }
+
+  protected:
+};
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A naive preconditioner which approximates any matrix as the identity matrix
diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index a799c3ef5..9e7dd1404 100644
--- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -60,29 +60,29 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   }
   
   VectorType p(n);
-  p = precond.solve(residual);      //initial search direction
+  p = precond.solve(residual);      // initial search direction
 
   VectorType z(n), tmp(n);
   RealScalar absNew = numext::real(residual.dot(p));  // the square of the absolute value of r scaled by invM
   Index i = 0;
   while(i < maxIters)
   {
-    tmp.noalias() = mat * p;              // the bottleneck of the algorithm
+    tmp.noalias() = mat * p;                    // the bottleneck of the algorithm
 
-    Scalar alpha = absNew / p.dot(tmp);   // the amount we travel on dir
-    x += alpha * p;                       // update solution
-    residual -= alpha * tmp;              // update residue
+    Scalar alpha = absNew / p.dot(tmp);         // the amount we travel on dir
+    x += alpha * p;                             // update solution
+    residual -= alpha * tmp;                    // update residual
     
     residualNorm2 = residual.squaredNorm();
     if(residualNorm2 < threshold)
       break;
     
-    z = precond.solve(residual);          // approximately solve for "A z = residual"
+    z = precond.solve(residual);                // approximately solve for "A z = residual"
 
     RealScalar absOld = absNew;
     absNew = numext::real(residual.dot(z));     // update the absolute value of r
-    RealScalar beta = absNew / absOld;            // calculate the Gram-Schmidt value used to create the new search direction
-    p = z + beta * p;                             // update search direction
+    RealScalar beta = absNew / absOld;          // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                           // update search direction
     i++;
   }
   tol_error = sqrt(residualNorm2 / rhsNorm2);
@@ -122,24 +122,24 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
   * and NumTraits<Scalar>::epsilon() for the tolerance.
   * 
   * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * ConjugateGradient<SparseMatrix<double> > cg;
-  * cg.compute(A);
-  * x = cg.solve(b);
-  * std::cout << "#iterations:     " << cg.iterations() << std::endl;
-  * std::cout << "estimated error: " << cg.error()      << std::endl;
-  * // update b, and solve again
-  * x = cg.solve(b);
-  * \endcode
+    \code
+    int n = 10000;
+    VectorXd x(n), b(n);
+    SparseMatrix<double> A(n,n);
+    // fill A and b
+    ConjugateGradient<SparseMatrix<double> > cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "#iterations:     " << cg.iterations() << std::endl;
+    std::cout << "estimated error: " << cg.error()      << std::endl;
+    // update b, and solve again
+    x = cg.solve(b);
+    \endcode
   * 
   * By default the iterations start with x=0 as an initial guess of the solution.
   * One can control the start using the solveWithGuess() method.
   * 
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, int _UpLo, typename _Preconditioner>
 class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
@@ -185,7 +185,7 @@ public:
   {
     typedef typename internal::conditional<UpLo==(Lower|Upper),
                                            Ref<const MatrixType>&,
-                                           SparseSelfAdjointView<const Ref<const MatrixType>, UpLo>
+                                           typename Ref<const MatrixType>::template ConstSelfAdjointViewReturnType<UpLo>::Type
                                           >::type MatrixWrapperType;
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
@@ -208,7 +208,7 @@ public:
   template<typename Rhs,typename Dest>
   void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
   {
-    x.setOnes();
+    x.setZero();
     _solve_with_guess_impl(b.derived(),x);
   }
 
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 6d63d45e4..b7f8debb3 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -93,21 +93,23 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
   * alternatively, on GMANE:
   *   http://comments.gmane.org/gmane.comp.lib.eigen/3302
   */
-template <typename _Scalar>
-class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar> >
+template <typename _Scalar, typename _StorageIndex = int>
+class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageIndex> >
 {
   protected:
-    typedef SparseSolverBase<IncompleteLUT<_Scalar> > Base;
+    typedef SparseSolverBase<IncompleteLUT> Base;
     using Base::m_isInitialized;
   public:
     typedef _Scalar Scalar;
+    typedef _StorageIndex StorageIndex;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef SparseMatrix<Scalar,RowMajor> FactorType;
-    typedef SparseMatrix<Scalar,ColMajor> PermutType;
-    typedef typename FactorType::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+    typedef SparseMatrix<Scalar,RowMajor,StorageIndex> FactorType;
 
   public:
+    
+    // this typedef is only to export the scalar type and compile-time dimensions to solve_retval
     typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
     
     IncompleteLUT()
@@ -151,7 +153,7 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar> >
       * 
       **/
     template<typename MatrixType>
-    IncompleteLUT<Scalar>& compute(const MatrixType& amat)
+    IncompleteLUT& compute(const MatrixType& amat)
     {
       analyzePattern(amat); 
       factorize(amat);
@@ -197,8 +199,8 @@ protected:
  * Set control parameter droptol
  *  \param droptol   Drop any element whose magnitude is less than this tolerance 
  **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setDroptol(const RealScalar& droptol)
 {
   this->m_droptol = droptol;   
 }
@@ -207,15 +209,15 @@ void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
  * Set control parameter fillfactor
  * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
  **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setFillfactor(int fillfactor)
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setFillfactor(int fillfactor)
 {
   this->m_fillfactor = fillfactor;   
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::analyzePattern(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
 {
   // Compute the Fill-reducing permutation
   SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
@@ -232,9 +234,9 @@ void IncompleteLUT<Scalar>::analyzePattern(const _MatrixType& amat)
   m_analysisIsOk = true;
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::factorize(const _MatrixType& amat)
 {
   using std::sqrt;
   using std::swap;
@@ -246,8 +248,8 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
   m_lu.resize(n,n);
   // Declare Working vectors and variables
   Vector u(n) ;     // real values of the row -- maximum size is n --
-  VectorXi ju(n);   // column position of the values in u -- maximum size  is n
-  VectorXi jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
+  VectorI ju(n);   // column position of the values in u -- maximum size  is n
+  VectorI jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
 
   // Apply the fill-reducing permutation
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
@@ -398,7 +400,7 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     sizel = len;
     len = (std::min)(sizel, nnzL);
     typename Vector::SegmentReturnType ul(u.segment(0, sizel));
-    typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel));
+    typename VectorI::SegmentReturnType jul(ju.segment(0, sizel));
     internal::QuickSplit(ul, jul, len);
 
     // store the largest m_fill elements of the L part
@@ -427,14 +429,13 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     sizeu = len + 1; // +1 to take into account the diagonal element
     len = (std::min)(sizeu, nnzU);
     typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1));
-    typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
+    typename VectorI::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
     internal::QuickSplit(uu, juu, len);
 
     // store the largest elements of the U part
     for(Index k = ii + 1; k < ii + len; k++)
       m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
   }
-
   m_lu.finalize();
   m_lu.makeCompressed();
 
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 46bc0ac78..6477b9de2 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -52,9 +52,9 @@ public:
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  template<typename SparseMatrixDerived>
-  explicit IterativeSolverBase(const SparseMatrixBase<SparseMatrixDerived>& A)
-    : mp_matrix(A)
+  template<typename MatrixDerived>
+  explicit IterativeSolverBase(const EigenBase<MatrixDerived>& A)
+    : mp_matrix(A.derived())
   {
     init();
     compute(mp_matrix);
@@ -67,8 +67,8 @@ public:
     * Currently, this function mostly calls analyzePattern on the preconditioner. In the future
     * we might, for instance, implement column reordering for faster matrix vector products.
     */
-  template<typename SparseMatrixDerived>
-  Derived& analyzePattern(const SparseMatrixBase<SparseMatrixDerived>& A)
+  template<typename MatrixDerived>
+  Derived& analyzePattern(const EigenBase<MatrixDerived>& A)
   {
     grab(A.derived());
     m_preconditioner.analyzePattern(mp_matrix);
@@ -87,8 +87,8 @@ public:
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  template<typename SparseMatrixDerived>
-  Derived& factorize(const SparseMatrixBase<SparseMatrixDerived>& A)
+  template<typename MatrixDerived>
+  Derived& factorize(const EigenBase<MatrixDerived>& A)
   {
     eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
     grab(A.derived());
@@ -108,8 +108,8 @@ public:
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  template<typename SparseMatrixDerived>
-  Derived& compute(const SparseMatrixBase<SparseMatrixDerived>& A)
+  template<typename MatrixDerived>
+  Derived& compute(const EigenBase<MatrixDerived>& A)
   {
     grab(A.derived());
     m_preconditioner.compute(mp_matrix);
@@ -223,11 +223,11 @@ protected:
     m_tolerance = NumTraits<Scalar>::epsilon();
   }
   
-  template<typename SparseMatrixDerived>
-  void grab(const SparseMatrixBase<SparseMatrixDerived> &A)
+  template<typename MatrixDerived>
+  void grab(const EigenBase<MatrixDerived> &A)
   {
     mp_matrix.~Ref<const MatrixType>();
-    ::new (&mp_matrix) Ref<const MatrixType>(A);
+    ::new (&mp_matrix) Ref<const MatrixType>(A.derived());
   }
   
   void grab(const Ref<const MatrixType> &A)
diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
new file mode 100644
index 000000000..1d819927e
--- /dev/null
+++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
@@ -0,0 +1,213 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+#define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Low-level conjugate gradient algorithm for least-square problems
+  * \param mat The matrix A
+  * \param rhs The right hand side vector b
+  * \param x On input and initial solution, on output the computed solution.
+  * \param precond A preconditioner being able to efficiently solve for an
+  *                approximation of A'Ax=b (regardless of b)
+  * \param iters On input the max number of iteration, on output the number of performed iterations.
+  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+  */
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE
+void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                                     const Preconditioner& precond, Index& iters,
+                                     typename Dest::RealScalar& tol_error)
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  
+  RealScalar tol = tol_error;
+  Index maxIters = iters;
+  
+  Index m = mat.rows(), n = mat.cols();
+
+  VectorType residual        = rhs - mat * x;
+  VectorType normal_residual = mat.adjoint() * residual;
+
+  RealScalar rhsNorm2 = (mat.adjoint()*rhs).squaredNorm();
+  if(rhsNorm2 == 0) 
+  {
+    x.setZero();
+    iters = 0;
+    tol_error = 0;
+    return;
+  }
+  RealScalar threshold = tol*tol*rhsNorm2;
+  RealScalar residualNorm2 = normal_residual.squaredNorm();
+  if (residualNorm2 < threshold)
+  {
+    iters = 0;
+    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    return;
+  }
+  
+  VectorType p(n);
+  p = precond.solve(normal_residual);                         // initial search direction
+
+  VectorType z(n), tmp(m);
+  RealScalar absNew = numext::real(normal_residual.dot(p));  // the square of the absolute value of r scaled by invM
+  Index i = 0;
+  while(i < maxIters)
+  {
+    tmp.noalias() = mat * p;
+
+    Scalar alpha = absNew / tmp.squaredNorm();      // the amount we travel on dir
+    x += alpha * p;                                 // update solution
+    residual -= alpha * tmp;                        // update residual
+    normal_residual = mat.adjoint() * residual;     // update residual of the normal equation
+    
+    residualNorm2 = normal_residual.squaredNorm();
+    if(residualNorm2 < threshold)
+      break;
+    
+    z = precond.solve(normal_residual);             // approximately solve for "A'A z = normal_residual"
+
+    RealScalar absOld = absNew;
+    absNew = numext::real(normal_residual.dot(z));  // update the absolute value of r
+    RealScalar beta = absNew / absOld;              // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                               // update search direction
+    i++;
+  }
+  tol_error = sqrt(residualNorm2 / rhsNorm2);
+  iters = i;
+}
+
+}
+
+template< typename _MatrixType,
+          typename _Preconditioner = LeastSquareDiagonalPreconditioner<typename _MatrixType::Scalar> >
+class LeastSquaresConjugateGradient;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A conjugate gradient solver for sparse (or dense) least-square problems
+  *
+  * This class allows to solve for A x = b linear problems using an iterative conjugate gradient algorithm.
+  * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guaranty stability.
+  * Otherwise, the SparseLU or SparseQR classes might be preferable.
+  * The matrix A and the vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  * 
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+    \code
+    int m=1000000, n = 10000;
+    VectorXd x(n), b(m);
+    SparseMatrix<double> A(m,n);
+    // fill A and b
+    LeastSquaresConjugateGradient<SparseMatrix<double> > lscg;
+    lscg.compute(A);
+    x = lscg.solve(b);
+    std::cout << "#iterations:     " << lscg.iterations() << std::endl;
+    std::cout << "estimated error: " << lscg.error()      << std::endl;
+    // update b, and solve again
+    x = lscg.solve(b);
+    \endcode
+  * 
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  * 
+  * \sa class ConjugateGradient, SparseLU, SparseQR
+  */
+template< typename _MatrixType, typename _Preconditioner>
+class LeastSquaresConjugateGradient : public IterativeSolverBase<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >
+{
+  typedef IterativeSolverBase<LeastSquaresConjugateGradient> Base;
+  using Base::mp_matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+public:
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+public:
+
+  /** Default constructor. */
+  LeastSquaresConjugateGradient() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  explicit LeastSquaresConjugateGradient(const MatrixType& A) : Base(A) {}
+
+  ~LeastSquaresConjugateGradient() {}
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+
+      typename Dest::ColXpr xj(x,j);
+      internal::least_square_conjugate_gradient(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
+    }
+
+    m_isInitialized = true;
+    m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+  
+  /** \internal */
+  using Base::_solve_impl;
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
+  {
+    x.setZero();
+    _solve_with_guess_impl(b.derived(),x);
+  }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 49c8b183d..75dbc16b0 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -390,6 +390,12 @@ template<typename _MatrixType> class FullPivLU
     #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     MatrixType m_lu;
     PermutationPType m_p;
     PermutationQType m_q;
@@ -434,6 +440,8 @@ FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
 template<typename MatrixType>
 FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
 {
+  check_template_parameters();
+  
   // the permutations are stored as int indices, so just to be sure:
   eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
   
@@ -459,14 +467,16 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
 
     // biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
+    Score biggest_in_corner;
     biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
-                        .cwiseAbs()
+                        .unaryExpr(Scoring())
                         .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
     col_of_biggest_in_corner += k; // need to add k to them.
 
-    if(biggest_in_corner==RealScalar(0))
+    if(biggest_in_corner==Score(0))
     {
       // before exiting, make sure to initialize the still uninitialized transpositions
       // in a sane state without destroying what we already have.
@@ -479,7 +489,8 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
       break;
     }
 
-    if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
+    RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
+    if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
 
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index e57b36bc5..019fc4230 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -209,6 +209,12 @@ template<typename _MatrixType> class PartialPivLU
     #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     MatrixType m_lu;
     PermutationType m_p;
     TranspositionType m_rowsTranspositions;
@@ -275,6 +281,8 @@ struct partial_lu_impl
     */
   static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
   {
+    typedef scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows,cols);
@@ -286,13 +294,13 @@ struct partial_lu_impl
       Index rcols = cols-k-1;
         
       Index row_of_biggest_in_col;
-      RealScalar biggest_in_corner
-        = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
+      Score biggest_in_corner
+        = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col);
       row_of_biggest_in_col += k;
 
       row_transpositions[k] = PivIndex(row_of_biggest_in_col);
 
-      if(biggest_in_corner != RealScalar(0))
+      if(biggest_in_corner != Score(0))
       {
         if(k != row_of_biggest_in_col)
         {
@@ -423,6 +431,8 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t
 template<typename MatrixType>
 PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const MatrixType& matrix)
 {
+  check_template_parameters();
+  
   // the row permutation is stored as int indices, so just to be sure:
   eigen_assert(matrix.rows()<NumTraits<int>::highest());
   
diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h
index 3d2981f0c..63d996cb4 100644
--- a/Eigen/src/OrderingMethods/Amd.h
+++ b/Eigen/src/OrderingMethods/Amd.h
@@ -137,22 +137,27 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,StorageIndex>& C, Perm
     degree[i] = len[i];                 // degree of node i
   }
   mark = internal::cs_wclear<StorageIndex>(0, 0, w, n);         /* clear w */
-  elen[n] = -2;                         /* n is a dead element */
-  Cp[n] = -1;                           /* n is a root of assembly tree */
-  w[n] = 0;                             /* n is a dead element */
   
   /* --- Initialize degree lists ------------------------------------------ */
   for(i = 0; i < n; i++)
   {
+    bool has_diag = false;
+    for(p = Cp[i]; p<Cp[i+1]; ++p)
+      if(Ci[p]==i)
+      {
+        has_diag = true;
+        break;
+      }
+   
     d = degree[i];
-    if(d == 0)                         /* node i is empty */
+    if(d == 1)                      /* node i is empty */
     {
       elen[i] = -2;                 /* element i is dead */
       nel++;
       Cp[i] = -1;                   /* i is a root of assembly tree */
       w[i] = 0;
     }
-    else if(d > dense)                 /* node i is dense */
+    else if(d > dense || !has_diag)  /* node i is dense or has no structural diagonal element */
     {
       nv[i] = 0;                    /* absorb i into element n */
       elen[i] = -1;                 /* node i is dead */
@@ -168,6 +173,10 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,StorageIndex>& C, Perm
     }
   }
   
+  elen[n] = -2;                         /* n is a dead element */
+  Cp[n] = -1;                           /* n is a root of assembly tree */
+  w[n] = 0;                             /* n is a dead element */
+  
   while (nel < n)                         /* while (selecting pivots) do */
   {
     /* --- Select node of minimum approximate degree -------------------- */
diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h
index e88e637a4..cb838d04a 100644
--- a/Eigen/src/OrderingMethods/Ordering.h
+++ b/Eigen/src/OrderingMethods/Ordering.h
@@ -90,11 +90,11 @@ class AMDOrdering
   * \note Returns an empty permutation matrix
   * \tparam  Index The type of indices of the matrix 
   */
-template <typename Index>
+template <typename StorageIndex>
 class NaturalOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
     
     /** Compute the permutation vector from a column-major sparse matrix */
     template <typename MatrixType>
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 03ff0a8f2..7b3842cbe 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -398,6 +398,12 @@ template<typename _MatrixType> class ColPivHouseholderQR
     #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     PermutationType m_colsPermutation;
@@ -436,6 +442,8 @@ typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::logAbsDetermina
 template<typename MatrixType>
 ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
 {
+  check_template_parameters();
+  
   using std::abs;
   Index rows = matrix.rows();
   Index cols = matrix.cols();
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index 7d5e58d2f..4c2c958a8 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -380,6 +380,12 @@ template<typename _MatrixType> class FullPivHouseholderQR
     #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     IntDiagSizeVectorType m_rows_transpositions;
@@ -419,6 +425,8 @@ typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::logAbsDetermin
 template<typename MatrixType>
 FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
 {
+  check_template_parameters();
+  
   using std::abs;
   Index rows = matrix.rows();
   Index cols = matrix.cols();
@@ -443,13 +451,15 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
   for (Index k = 0; k < size; ++k)
   {
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
 
-    biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
-                            .cwiseAbs()
-                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    Score score = m_qr.bottomRightCorner(rows-k, cols-k)
+                      .unaryExpr(Scoring())
+                      .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k;
     col_of_biggest_in_corner += k;
+    RealScalar biggest_in_corner = internal::abs_knowing_score<Scalar>()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score);
     if(k==0) biggest = biggest_in_corner;
 
     // if the corner is negligible, then we have less than full rank, and we can finish early
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 195bacb85..878654be5 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -196,6 +196,12 @@ template<typename _MatrixType> class HouseholderQR
     #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     RowVectorType m_temp;
@@ -348,6 +354,8 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
 template<typename MatrixType>
 HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const MatrixType& matrix)
 {
+  check_template_parameters();
+  
   Index rows = matrix.rows();
   Index cols = matrix.cols();
   Index size = (std::min)(rows,cols);
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index 8903755e7..ad191085e 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -130,9 +130,10 @@ public:
   inline Index rank() const
   {
     using std::abs;
+    using std::max;
     eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
     if(m_singularValues.size()==0) return 0;
-    RealScalar premultiplied_threshold = m_singularValues.coeff(0) * threshold();
+    RealScalar premultiplied_threshold = (max)(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
     Index i = m_nonzeroSingularValues-1;
     while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
     return i+1;
@@ -217,6 +218,12 @@ public:
   #endif
 
 protected:
+  
+  static void check_template_parameters()
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+  }
+  
   // return true if already allocated
   bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
 
@@ -240,7 +247,9 @@ protected:
       m_usePrescribedThreshold(false),
       m_computationOptions(0),
       m_rows(-1), m_cols(-1), m_diagSize(0)
-  {}
+  {
+    check_template_parameters();
+  }
 
 
 };
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h
index a0815e708..f56298e8c 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -69,6 +69,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
     typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
     typedef CholMatrixType const * ConstCholMatrixPtr;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
 
   public:
     
@@ -250,8 +251,8 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
     
     CholMatrixType m_matrix;
     VectorType m_diag;                                // the diagonal coefficients (LDLT mode)
-    VectorXi m_parent;                                // elimination tree
-    VectorXi m_nonZerosPerCol;
+    VectorI m_parent;                                 // elimination tree
+    VectorI m_nonZerosPerCol;
     PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_P;     // the permutation
     PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_Pinv;  // the inverse permutation
 
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index cae9c38b0..52c7da297 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -86,7 +86,12 @@ class CompressedStorage
     void resize(Index size, double reserveSizeFactor = 0)
     {
       if (m_allocatedSize<size)
-        reallocate(size + Index(reserveSizeFactor*double(size)));
+      {
+        Index realloc_size = (std::min<Index>)(NumTraits<StorageIndex>::highest(),  size + Index(reserveSizeFactor*double(size)));
+        if(realloc_size<size)
+          internal::throw_std_bad_alloc();
+        reallocate(realloc_size);
+      }
       m_size = size;
     }
 
@@ -214,6 +219,9 @@ class CompressedStorage
 
     inline void reallocate(Index size)
     {
+      #ifdef EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+        EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+      #endif
       eigen_internal_assert(size!=m_allocatedSize);
       internal::scoped_array<Scalar> newValues(size);
       internal::scoped_array<StorageIndex> newIndices(size);
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index acd82e926..2b31716a3 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -49,6 +49,16 @@ public:
       return nnz;
     }
     
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+    
+    inline const Scalar coeff(Index index) const
+    {
+      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
+    }
+    
     inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
     Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
     Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
@@ -204,6 +214,21 @@ public:
     }
     
     bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
+    
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+    
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+    
+    inline const Scalar coeff(Index index) const
+    {
+      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
+    }
 
     const Scalar& lastCoeff() const
     {
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 4562f3df9..4c3a47959 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -222,24 +222,18 @@ class SparseMatrix
       * The non zero coefficient must \b not already exist.
       *
       * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed
-      * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first
-      * call reserve(const SizesType &) to reserve a more appropriate number of elements per
-      * inner vector that better match your scenario.
+      * mode while reserving room for 2 x this->innerSize() non zeros if reserve(Index) has not been called earlier.
+      * In this case, the insertion procedure is optimized for a \e sequential insertion mode where elements are assumed to be
+      * inserted by increasing outer-indices.
+      * 
+      * If that's not the case, then it is strongly recommended to either use a triplet-list to assemble the matrix, or to first
+      * call reserve(const SizesType &) to reserve the appropriate number of non-zero elements per inner vector.
       *
-      * This function performs a sorted insertion in O(1) if the elements of each inner vector are
-      * inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
+      * Assuming memory has been appropriately reserved, this function performs a sorted insertion in O(1)
+      * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
       *
       */
-    Scalar& insert(Index row, Index col)
-    {
-      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
-      if(isCompressed())
-      {
-        reserve(IndexVector::Constant(outerSize(), 2));
-      }
-      return insertUncompressed(row,col);
-    }
+    Scalar& insert(Index row, Index col);
 
   public:
 
@@ -1076,6 +1070,118 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
   }
 }
 
+template<typename _Scalar, int _Options, typename _Index>
+typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insert(Index row, Index col)
+{
+  eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
+  
+  const Index outer = IsRowMajor ? row : col;
+  const Index inner = IsRowMajor ? col : row;
+  
+  if(isCompressed())
+  {
+    if(nonZeros()==0)
+    {
+      // reserve space if not already done
+      if(m_data.allocatedSize()==0)
+        m_data.reserve(2*m_innerSize);
+      
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      if(!m_innerNonZeros) internal::throw_std_bad_alloc();
+      
+      memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+      
+      // pack all inner-vectors to the end of the pre-allocated space
+      // and allocate the entire free-space to the first inner-vector
+      StorageIndex end = convert_index(m_data.allocatedSize());
+      for(Index j=1; j<=m_outerSize; ++j)
+        m_outerIndex[j] = end;
+    }
+  }
+  
+  // check whether we can do a fast "push back" insertion
+  Index data_end = m_data.allocatedSize();
+  
+  // First case: we are filling a new inner vector which is packed at the end.
+  // We assume that all remaining inner-vectors are also empty and packed to the end.
+  if(m_outerIndex[outer]==data_end)
+  {
+    eigen_internal_assert(m_innerNonZeros[outer]==0);
+    
+    // pack previous empty inner-vectors to end of the used-space
+    // and allocate the entire free-space to the current inner-vector.
+    StorageIndex p = convert_index(m_data.size());
+    Index j = outer;
+    while(j>=0 && m_innerNonZeros[j]==0)
+      m_outerIndex[j--] = p;
+    
+    // push back the new element
+    ++m_innerNonZeros[outer];
+    m_data.append(Scalar(0), inner);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index j=outer+1; j<=m_outerSize; ++j)
+        if(m_outerIndex[j]==data_end)
+          m_outerIndex[j] = new_end;
+    }
+    return m_data.value(p);
+  }
+  
+  // Second case: the next inner-vector is packed to the end
+  // and the current inner-vector end match the used-space.
+  if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size())
+  {
+    eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0);
+    
+    // add space for the new element
+    ++m_innerNonZeros[outer];
+    m_data.resize(m_data.size()+1);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index j=outer+1; j<=m_outerSize; ++j)
+        if(m_outerIndex[j]==data_end)
+          m_outerIndex[j] = new_end;
+    }
+    
+    // and insert it at the right position (sorted insertion)
+    Index startId = m_outerIndex[outer];
+    Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1;
+    while ( (p > startId) && (m_data.index(p-1) > inner) )
+    {
+      m_data.index(p) = m_data.index(p-1);
+      m_data.value(p) = m_data.value(p-1);
+      --p;
+    }
+    
+    m_data.index(p) = convert_index(inner);
+    return (m_data.value(p) = 0);
+  }
+  
+  if(m_data.size() != m_data.allocatedSize())
+  {
+    // make sure the matrix is compatible to random un-compressed insertion:
+    m_data.resize(m_data.allocatedSize());
+    this->reserveInnerVectors(Array<StorageIndex,Dynamic,1>::Constant(2*m_outerSize, convert_index(m_outerSize)));
+  }
+  
+  return insertUncompressed(row,col);
+}
+    
 template<typename _Scalar, int _Options, typename _Index>
 EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col)
 {
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index d76dfa33d..55b0ad9d2 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -97,7 +97,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
                         Transpose<const Derived>
                      >::type AdjointReturnType;
     typedef Transpose<Derived> TransposeReturnType;
-    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
     typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
 
     // FIXME storage order do not match evaluator storage order
@@ -300,9 +299,14 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
 
     template<int Mode>
     inline const TriangularView<const Derived, Mode> triangularView() const;
+    
+    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SparseSelfAdjointView<Derived, UpLo> Type; };
+    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SparseSelfAdjointView<const Derived, UpLo> Type; };
 
-    template<unsigned int UpLo> inline const SparseSelfAdjointView<const Derived, UpLo> selfadjointView() const;
-    template<unsigned int UpLo> inline SparseSelfAdjointView<Derived, UpLo> selfadjointView();
+    template<unsigned int UpLo> inline 
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo> inline
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
 
     template<typename OtherDerived> Scalar dot(const MatrixBase<OtherDerived>& other) const;
     template<typename OtherDerived> Scalar dot(const SparseMatrixBase<OtherDerived>& other) const;
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 6467d4894..3da856799 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -169,17 +169,17 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
 ***************************************************************************/
 
 template<typename Derived>
-template<unsigned int Mode>
-const SparseSelfAdjointView<const Derived, Mode> SparseMatrixBase<Derived>::selfadjointView() const
+template<unsigned int UpLo>
+typename SparseMatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView() const
 {
-  return SparseSelfAdjointView<const Derived, Mode>(derived());
+  return SparseSelfAdjointView<const Derived, UpLo>(derived());
 }
 
 template<typename Derived>
-template<unsigned int Mode>
-SparseSelfAdjointView<Derived, Mode> SparseMatrixBase<Derived>::selfadjointView()
+template<unsigned int UpLo>
+typename SparseMatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView()
 {
-  return SparseSelfAdjointView<Derived, Mode>(derived());
+  return SparseSelfAdjointView<Derived, UpLo>(derived());
 }
 
 /***************************************************************************
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp
index a603c0216..d563a1d2d 100644
--- a/bench/analyze-blocking-sizes.cpp
+++ b/bench/analyze-blocking-sizes.cpp
@@ -1,3 +1,12 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #include <iostream>
 #include <cstdint>
 #include <cstdlib>
@@ -7,23 +16,76 @@
 #include <string>
 #include <cmath>
 #include <cassert>
+#include <cstring>
+#include <memory>
+
+#include <Eigen/Core>
 
 using namespace std;
 
+const int default_precision = 4;
+
+// see --only-cubic-sizes
+bool only_cubic_sizes = false;
+
+// see --dump-tables
+bool dump_tables = false;
+
+uint8_t log2_pot(size_t x) {
+  size_t l = 0;
+  while (x >>= 1) l++;
+  return l;
+}
+
+uint16_t compact_size_triple(size_t k, size_t m, size_t n)
+{
+  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
+}
+
+// just a helper to store a triple of K,M,N sizes for matrix product
+struct size_triple_t
+{
+  uint16_t k, m, n;
+  size_triple_t() : k(0), m(0), n(0) {}
+  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
+  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
+  size_triple_t(uint16_t compact)
+  {
+    k = 1 << ((compact & 0xf00) >> 8);
+    m = 1 << ((compact & 0x0f0) >> 4);
+    n = 1 << ((compact & 0x00f) >> 0);
+  }
+  bool is_cubic() const { return k == m && m == n; }
+};
+
+ostream& operator<<(ostream& s, const size_triple_t& t)
+{
+  return s << "(" << t.k << ", " << t.m << ", " << t.n << ")";
+}
+
 struct inputfile_entry_t
 {
   uint16_t product_size;
-  uint16_t block_size;
+  uint16_t pot_block_size;
+  size_triple_t nonpot_block_size;
   float gflops;
 };
 
 struct inputfile_t
 {
+  enum class type_t {
+    unknown,
+    all_pot_sizes,
+    default_sizes
+  };
+
   string filename;
   vector<inputfile_entry_t> entries;
+  type_t type;
 
   inputfile_t(const string& fname)
     : filename(fname)
+    , type(type_t::unknown)
   {
     ifstream stream(filename);
     if (!stream.is_open()) {
@@ -31,52 +93,102 @@ struct inputfile_t
       exit(1);
     }
     string line;
-    bool is_in_measurements = false;
     while (getline(stream, line)) {
       if (line.empty()) continue;
-      if (line.find("BEGIN MEASUREMENTS") == 0) {
-        is_in_measurements = true;
+      if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
+        if (type != type_t::unknown) {
+          cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
+          exit(1);
+        }
+        type = type_t::all_pot_sizes;
         continue;
       }
-
-      if (!is_in_measurements) {
+      if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
+        if (type != type_t::unknown) {
+          cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
+          exit(1);
+        }
+        type = type_t::default_sizes;
         continue;
       }
+      
 
-      unsigned int product_size, block_size;
-      float gflops;
-      int sscanf_result =
-        sscanf(line.c_str(), "%x %x %f",
-               &product_size,
-               &block_size,
-               &gflops);
-      if (3 != sscanf_result ||
-          !product_size ||
-          product_size > 0xfff ||
-          !block_size ||
-          block_size > 0xfff ||
-          !isfinite(gflops))
-      {
-        cerr << "ill-formed input file: " << filename << endl;
-        cerr << "offending line:" << endl << line << endl;
-        exit(1);
+      if (type == type_t::unknown) {
+        continue;
+      }
+      switch(type) {
+        case type_t::all_pot_sizes: {
+          unsigned int product_size, block_size;
+          float gflops;
+          int sscanf_result =
+            sscanf(line.c_str(), "%x %x %f",
+                   &product_size,
+                   &block_size,
+                   &gflops);
+          if (3 != sscanf_result ||
+              !product_size ||
+              product_size > 0xfff ||
+              !block_size ||
+              block_size > 0xfff ||
+              !isfinite(gflops))
+          {
+            cerr << "ill-formed input file: " << filename << endl;
+            cerr << "offending line:" << endl << line << endl;
+            exit(1);
+          }
+          if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+            continue;
+          }
+          inputfile_entry_t entry;
+          entry.product_size = uint16_t(product_size);
+          entry.pot_block_size = uint16_t(block_size);
+          entry.gflops = gflops;
+          entries.push_back(entry);
+          break;
+        }
+        case type_t::default_sizes: {
+          unsigned int product_size;
+          float gflops;
+          int bk, bm, bn;
+          int sscanf_result =
+            sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
+                   &product_size,
+                   &bk, &bm, &bn,
+                   &gflops);
+          if (5 != sscanf_result ||
+              !product_size ||
+              product_size > 0xfff ||
+              !isfinite(gflops))
+          {
+            cerr << "ill-formed input file: " << filename << endl;
+            cerr << "offending line:" << endl << line << endl;
+            exit(1);
+          }
+          if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
+            continue;
+          }
+          inputfile_entry_t entry;
+          entry.product_size = uint16_t(product_size);
+          entry.pot_block_size = 0;
+          entry.nonpot_block_size = size_triple_t(bk, bm, bn);
+          entry.gflops = gflops;
+          entries.push_back(entry);
+          break;
+        }
+        
+        default:
+          break;
       }
-      inputfile_entry_t entry;
-      entry.product_size = uint16_t(product_size);
-      entry.block_size = uint16_t(block_size);
-      entry.gflops = gflops;
-      entries.push_back(entry);
     }
     stream.close();
-    if (!is_in_measurements) {
-      cerr << "Input file " << filename << " didn't contain a BEGIN MEASUREMENTS line. Wrong file?" << endl;
+    if (type == type_t::unknown) {
+      cerr << "Unrecognized input file " << filename << endl;
       exit(1);
     }
     if (entries.empty()) {
       cerr << "didn't find any measurements in input file: " << filename << endl;
       exit(1);
     }
-    //cerr << "read " << entries.size() << " measurements from " << filename << endl;
   }
 };
 
@@ -88,6 +200,11 @@ struct preprocessed_inputfile_entry_t
   float efficiency;
 };
 
+bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
+{
+  return e1.efficiency < e2.efficiency;
+}
+
 struct preprocessed_inputfile_t
 {
   string filename;
@@ -96,6 +213,9 @@ struct preprocessed_inputfile_t
   preprocessed_inputfile_t(const inputfile_t& inputfile)
     : filename(inputfile.filename)
   {
+    if (inputfile.type != inputfile_t::type_t::all_pot_sizes) {
+      abort();
+    }
     auto it = inputfile.entries.begin();
     auto it_first_with_given_product_size = it;
     while (it != inputfile.entries.end()) {
@@ -127,7 +247,7 @@ private:
     for (auto it = begin; it != end; ++it) {
       preprocessed_inputfile_entry_t entry;
       entry.product_size = it->product_size;
-      entry.block_size = it->block_size;
+      entry.block_size = it->pot_block_size;
       entry.efficiency = it->gflops / max_gflops;
       entries.push_back(entry);
     }
@@ -201,14 +321,82 @@ float efficiency_of_subset(
         efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);
       }
       efficiency = min(efficiency, efficiency_this_product_size);
-      first_entry_index_with_this_product_size = entry_index;
-      product_size = first_file.entries[entry_index].product_size;
+      if (entry_index < num_entries) {
+        first_entry_index_with_this_product_size = entry_index;
+        product_size = first_file.entries[entry_index].product_size;
+      }
     }
   }
 
   return efficiency;
 }
 
+void dump_table_for_subset(
+        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
+        const vector<size_t>& subset)
+{
+  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
+  const size_t num_entries = first_file.entries.size();
+  size_t entry_index = 0;
+  size_t first_entry_index_with_this_product_size = 0;
+  uint16_t product_size = first_file.entries[0].product_size;
+  size_t i = 0;
+  size_triple_t min_product_size(first_file.entries.front().product_size);
+  size_triple_t max_product_size(first_file.entries.back().product_size);
+  if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
+    abort();
+  }
+  if (only_cubic_sizes) {
+    cerr << "Can't generate tables with --only-cubic-sizes." << endl;
+    abort();
+  }
+  cout << "struct LookupTable {" << endl;
+  cout << "  static const size_t BaseSize = " << min_product_size.k << ";" << endl;
+  const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;
+  const size_t TableSize = NumSizes * NumSizes * NumSizes;
+  cout << "  static const size_t NumSizes = " << NumSizes << ";" << endl;
+  cout << "  static const unsigned short* Data() {" << endl;
+  cout << "    static const unsigned short data[" << TableSize << "] = {";
+  while (entry_index < num_entries) {
+    ++entry_index;
+    if (entry_index == num_entries ||
+        first_file.entries[entry_index].product_size != product_size)
+    {
+      float best_efficiency_this_product_size = 0.0f;
+      uint16_t best_block_size_this_product_size = 0;
+      for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
+        float efficiency_this_entry = 1.0f;
+        for (auto i = subset.begin(); i != subset.end(); ++i) {
+          efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
+        }
+        if (efficiency_this_entry > best_efficiency_this_product_size) {
+          best_efficiency_this_product_size = efficiency_this_entry;
+          best_block_size_this_product_size = first_file.entries[e].block_size;
+        }
+      }
+      if ((i++) % NumSizes) {
+        cout << " ";
+      } else {
+        cout << endl << "      ";
+      }
+      cout << "0x" << hex << best_block_size_this_product_size << dec;
+      if (entry_index < num_entries) {
+        cout << ",";
+        first_entry_index_with_this_product_size = entry_index;
+        product_size = first_file.entries[entry_index].product_size;
+      }
+    }
+  }
+  if (i != TableSize) {
+    cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
+    abort();
+  }
+  cout << endl << "    };" << endl;
+  cout << "    return data;" << endl;
+  cout << "  }" << endl;
+  cout << "};" << endl;
+}
+
 float efficiency_of_partition(
         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
         const vector<vector<size_t>>& partition)
@@ -390,67 +578,299 @@ void print_partition(
     for (auto file = subset->begin(); file != subset->end(); ++file) {
       cout << "    " << preprocessed_inputfiles[*file].filename << endl;
     }
+    if (dump_tables) {
+      cout << "  Table:" << endl;
+      dump_table_for_subset(preprocessed_inputfiles, *subset);
+    }
   }
   cout << endl;
 }
 
-int main(int argc, char* argv[])
+struct action_t
 {
-  if (argc == 1) {
-    cerr << "usage: " << argv[0] << " [input files]" << endl;
-    cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
-    exit(1);
-  }
-  cout.precision(3);
-  cerr.precision(3);
-  vector<string> inputfilenames;
-  for (int i = 1; i < argc; i++) {
-  	inputfilenames.emplace_back(argv[i]);
-  }
+  virtual const char* invokation_name() const { abort(); return nullptr; }
+  virtual void run(const vector<string>&) const { abort(); }
+  virtual ~action_t() {}
+};
 
-  vector<preprocessed_inputfile_t> preprocessed_inputfiles;
-  for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) {
-    preprocessed_inputfiles.emplace_back(inputfile_t(*it));
-  }
-
-  check_all_files_in_same_exact_order(preprocessed_inputfiles);
-
-  float required_efficiency_to_beat = 0.0f;
-  vector<vector<vector<size_t>>> partitions;
-  cerr << "searching for partitions...\r" << flush;
-  while (true)
+struct partition_action_t : action_t
+{
+  virtual const char* invokation_name() const override { return "partition"; }
+  virtual void run(const vector<string>& input_filenames) const override
   {
-    vector<vector<size_t>> partition;
-    find_partition_with_efficiency_higher_than(
-      preprocessed_inputfiles,
-      required_efficiency_to_beat,
-      partition);
-    float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
-    cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
-         << " subsets for " << 100.0f * actual_efficiency
-         << " % efficiency"
-         << "                  \r" << flush;
-    partitions.push_back(partition);
-    if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
-      break;
+    vector<preprocessed_inputfile_t> preprocessed_inputfiles;
+
+    if (input_filenames.empty()) {
+      cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
+      exit(1);
     }
-    required_efficiency_to_beat = actual_efficiency;
-  }
-  cerr << "                                                                  " << endl;
-  while (true) {
-    bool repeat = false;
-    for (size_t i = 0; i < partitions.size() - 1; i++) {
-      if (partitions[i].size() >= partitions[i+1].size()) {
-        partitions.erase(partitions.begin() + i);
-        repeat = true;
+
+    for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
+      inputfile_t inputfile(*it);
+      switch (inputfile.type) {
+        case inputfile_t::type_t::all_pot_sizes:
+          preprocessed_inputfiles.emplace_back(inputfile);
+          break;
+        case inputfile_t::type_t::default_sizes:
+          cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
+               << "has no use for " << *it << " which contains measurements for default sizes." << endl;
+          exit(1);
+          break;
+        default:
+          cerr << "Unrecognized input file: " << *it << endl;
+          exit(1);
+      }
+    }
+
+    check_all_files_in_same_exact_order(preprocessed_inputfiles);
+
+    float required_efficiency_to_beat = 0.0f;
+    vector<vector<vector<size_t>>> partitions;
+    cerr << "searching for partitions...\r" << flush;
+    while (true)
+    {
+      vector<vector<size_t>> partition;
+      find_partition_with_efficiency_higher_than(
+        preprocessed_inputfiles,
+        required_efficiency_to_beat,
+        partition);
+      float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
+      cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
+           << " subsets for " << 100.0f * actual_efficiency
+           << " % efficiency"
+           << "                  \r" << flush;
+      partitions.push_back(partition);
+      if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
+        break;
+      }
+      required_efficiency_to_beat = actual_efficiency;
+    }
+    cerr << "                                                                  " << endl;
+    while (true) {
+      bool repeat = false;
+      for (size_t i = 0; i < partitions.size() - 1; i++) {
+        if (partitions[i].size() >= partitions[i+1].size()) {
+          partitions.erase(partitions.begin() + i);
+          repeat = true;
+          break;
+        }
+      }
+      if (!repeat) {
         break;
       }
     }
-    if (!repeat) {
-      break;
+    for (auto it = partitions.begin(); it != partitions.end(); ++it) {
+      print_partition(preprocessed_inputfiles, *it);
     }
   }
-  for (auto it = partitions.begin(); it != partitions.end(); ++it) {
-    print_partition(preprocessed_inputfiles, *it);
+};
+
+struct evaluate_defaults_action_t : action_t
+{
+  struct results_entry_t {
+    uint16_t product_size;
+    size_triple_t default_block_size;
+    uint16_t best_pot_block_size;
+    float default_gflops;
+    float best_pot_gflops;
+    float default_efficiency;
+  };
+  friend ostream& operator<<(ostream& s, const results_entry_t& entry)
+  {
+    return s
+      << "Product size " << size_triple_t(entry.product_size)
+      << ": default block size " << entry.default_block_size
+      << " -> " << entry.default_gflops
+      << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
+      << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
+      << " -> " << entry.best_pot_gflops
+      << " GFlop/s" << dec;
   }
+  static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
+    return e1.default_efficiency < e2.default_efficiency;
+  }
+  virtual const char* invokation_name() const override { return "evaluate-defaults"; }
+  void show_usage_and_exit() const
+  {
+    cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
+    cerr << "checks how well the performance with default sizes compares to the best "
+         << "performance measured over all POT sizes." << endl;
+    exit(1);
+  }
+  virtual void run(const vector<string>& input_filenames) const override
+  {
+    if (input_filenames.size() != 2) {
+      show_usage_and_exit();
+    }
+    inputfile_t inputfile_default_sizes(input_filenames[0]);
+    inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
+    if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
+      cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
+      show_usage_and_exit();
+    }
+    if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
+      cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
+      show_usage_and_exit();
+    }
+    vector<results_entry_t> results;
+    vector<results_entry_t> cubic_results;
+    
+    uint16_t product_size = 0;
+    auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
+    for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
+         it_default_sizes != inputfile_default_sizes.entries.end();
+         ++it_default_sizes)
+    {
+      if (it_default_sizes->product_size == product_size) {
+        continue;
+      }
+      product_size = it_default_sizes->product_size;
+      while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
+             it_all_pot_sizes->product_size != product_size)
+      {
+        ++it_all_pot_sizes;
+      }
+      if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
+        break;
+      }
+      uint16_t best_pot_block_size = 0;
+      float best_pot_gflops = 0;
+      for (auto it = it_all_pot_sizes;
+           it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
+           ++it)
+      {
+        if (it->gflops > best_pot_gflops) {
+          best_pot_gflops = it->gflops;
+          best_pot_block_size = it->pot_block_size;
+        }
+      }
+      results_entry_t entry;
+      entry.product_size = product_size;
+      entry.default_block_size = it_default_sizes->nonpot_block_size;
+      entry.best_pot_block_size = best_pot_block_size;
+      entry.default_gflops = it_default_sizes->gflops;
+      entry.best_pot_gflops = best_pot_gflops;
+      entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
+      results.push_back(entry);
+
+      size_triple_t t(product_size);
+      if (t.k == t.m && t.m == t.n) {
+        cubic_results.push_back(entry);
+      }
+    }
+
+    cout << "All results:" << endl;
+    for (auto it = results.begin(); it != results.end(); ++it) {
+      cout << *it << endl;
+    }
+    cout << endl;
+
+    sort(results.begin(), results.end(), lower_efficiency);
+    
+    const size_t n = min<size_t>(20, results.size());
+    cout << n << " worst results:" << endl;
+    for (size_t i = 0; i < n; i++) {
+      cout << results[i] << endl;
+    }
+    cout << endl;
+
+    cout << "cubic results:" << endl;
+    for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
+      cout << *it << endl;
+    }
+    cout << endl;
+
+    sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
+    
+    cout.precision(2);
+    vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
+    for (auto it = a.begin(); it != a.end(); ++it) {
+      size_t n = min(results.size() - 1, size_t(*it * results.size()));
+      cout << (100.0f * n / (results.size() - 1))
+           << " % of product sizes have default efficiency <= "
+           << 100.0f * results[n].default_efficiency << " %" << endl;
+    }
+    cout.precision(default_precision);
+  }
+};
+
+
+void show_usage_and_exit(int argc, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
+  cerr << "available actions:" << endl;
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    cerr << "  " << (*it)->invokation_name() << endl;
+  } 
+  cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
+  exit(1);
+}
+
+int main(int argc, char* argv[])
+{
+  cout.precision(default_precision);
+  cerr.precision(default_precision);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new partition_action_t);
+  available_actions.emplace_back(new evaluate_defaults_action_t);
+
+  vector<string> input_filenames;
+
+  action_t* action = nullptr;
+
+  if (argc < 2) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (int i = 1; i < argc; i++) {
+    bool arg_handled = false;
+    // Step 1. Try to match action invokation names.
+    for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+      if (!strcmp(argv[i], (*it)->invokation_name())) {
+        if (!action) {
+          action = it->get();
+          arg_handled = true;
+          break;
+        } else {
+          cerr << "can't specify more than one action!" << endl;
+          show_usage_and_exit(argc, argv, available_actions);
+        }
+      }
+    }
+    if (arg_handled) {
+      continue;
+    }
+    // Step 2. Try to match option names.
+    if (argv[i][0] == '-') {
+      if (!strcmp(argv[i], "--only-cubic-sizes")) {
+        only_cubic_sizes = true;
+        arg_handled = true;
+      }
+      if (!strcmp(argv[i], "--dump-tables")) {
+        dump_tables = true;
+        arg_handled = true;
+      }
+      if (!arg_handled) {
+        cerr << "Unrecognized option: " << argv[i] << endl;
+        show_usage_and_exit(argc, argv, available_actions);
+      }
+    }
+    if (arg_handled) {
+      continue;
+    }
+    // Step 3. Default to interpreting args as input filenames.
+    input_filenames.emplace_back(argv[i]);
+  }
+
+  if (dump_tables && only_cubic_sizes) {
+    cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  if (!action) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  action->run(input_filenames);
 }
diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp
index 04244575a..827be2880 100644
--- a/bench/benchmark-blocking-sizes.cpp
+++ b/bench/benchmark-blocking-sizes.cpp
@@ -1,11 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #include <iostream>
 #include <cstdint>
 #include <cstdlib>
 #include <vector>
 #include <fstream>
+#include <memory>
+#include <cstdio>
 
+bool eigen_use_specific_block_size;
 int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
-#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
@@ -31,11 +43,15 @@ const float min_accurate_time = 1e-2f;
 // See --min-working-set-size command line parameter.
 size_t min_working_set_size = 0;
 
+float max_clock_speed = 0.0f;
+
 // range of sizes that we will benchmark (in all 3 K,M,N dimensions)
 const size_t maxsize = 2048;
 const size_t minsize = 16;
 
 typedef MatrixXf MatrixType;
+typedef MatrixType::Scalar Scalar;
+typedef internal::packet_traits<Scalar>::type Packet;
 
 static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
 static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
@@ -82,16 +98,26 @@ struct benchmark_t
 {
   uint16_t compact_product_size;
   uint16_t compact_block_size;
+  bool use_default_block_size;
   float gflops;
   benchmark_t()
     : compact_product_size(0)
     , compact_block_size(0)
+    , use_default_block_size(false)
     , gflops(0)
-  {}
+  {
+  }
   benchmark_t(size_t pk, size_t pm, size_t pn,
               size_t bk, size_t bm, size_t bn)
     : compact_product_size(compact_size_triple(pk, pm, pn))
     , compact_block_size(compact_size_triple(bk, bm, bn))
+    , use_default_block_size(false)
+    , gflops(0)
+  {}
+  benchmark_t(size_t pk, size_t pm, size_t pn)
+    : compact_product_size(compact_size_triple(pk, pm, pn))
+    , compact_block_size(0)
+    , use_default_block_size(true)
     , gflops(0)
   {}
 
@@ -100,10 +126,15 @@ struct benchmark_t
 
 ostream& operator<<(ostream& s, const benchmark_t& b)
 {
-  s << hex;
-  s << b.compact_product_size
-    << " " << b.compact_block_size;
-  s << dec;
+  s << hex << b.compact_product_size << dec;
+  if (b.use_default_block_size) {
+    size_triple_t t(b.compact_product_size);
+    Index k = t.k, m = t.m, n = t.n;
+    internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
+    s << " default(" << k << ", " << m << ", " << n << ")";
+  } else {
+    s << " " << hex << b.compact_block_size << dec;
+  }
   s << " " << b.gflops;
   return s;
 }
@@ -121,19 +152,23 @@ bool operator<(const benchmark_t& b1, const benchmark_t& b2)
 
 void benchmark_t::run()
 {
-  // expand our compact benchmark params into proper triples
   size_triple_t productsizes(compact_product_size);
-  size_triple_t blocksizes(compact_block_size);
-  
-  // feed eigen with our custom blocking params
-  eigen_block_size_k = blocksizes.k;
-  eigen_block_size_m = blocksizes.m;
-  eigen_block_size_n = blocksizes.n;
+
+  if (use_default_block_size) {
+    eigen_use_specific_block_size = false;
+  } else {
+    // feed eigen with our custom blocking params
+    eigen_use_specific_block_size = true;
+    size_triple_t blocksizes(compact_block_size);
+    eigen_block_size_k = blocksizes.k;
+    eigen_block_size_m = blocksizes.m;
+    eigen_block_size_n = blocksizes.n;
+  }
 
   // set up the matrix pool
 
   const size_t combined_three_matrices_sizes =
-    sizeof(MatrixType::Scalar) *
+    sizeof(Scalar) *
       (productsizes.k * productsizes.m +
        productsizes.k * productsizes.n +
        productsizes.m * productsizes.n);
@@ -168,7 +203,7 @@ void benchmark_t::run()
 
     double starttime = timer.getCpuTime();
     for (int i = 0; i < iters_at_a_time; i++) {
-      dst[matrix_index] = lhs[matrix_index] * rhs[matrix_index];
+      dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
       matrix_index++;
       if (matrix_index == matrix_pool_size) {
         matrix_index = 0;
@@ -231,9 +266,23 @@ string type_name<double>()
   return "double";
 }
 
-void show_usage_and_exit(const char *progname)
+struct action_t
 {
-  cerr << "usage: " << progname << " [--min-working-set-size=N]" << endl << endl;
+  virtual const char* invokation_name() const { abort(); return nullptr; }
+  virtual void run() const { abort(); }
+  virtual ~action_t() {}
+};
+
+void show_usage_and_exit(int /*argc*/, char* argv[],
+                         const vector<unique_ptr<action_t>>& available_actions)
+{
+  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
+  cerr << "available actions:" << endl << endl;
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    cerr << "  " << (*it)->invokation_name() << endl;
+  }
+  cerr << endl;
+  cerr << "options:" << endl << endl;
   cerr << "  --min-working-set-size=N:" << endl;
   cerr << "       Set the minimum working set size to N bytes." << endl;
   cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
@@ -244,93 +293,254 @@ void show_usage_and_exit(const char *progname)
   cerr << "       avoid warm caches." << endl;
   exit(1);
 }
-
-int main(int argc, char* argv[])
+     
+float measure_clock_speed()
 {
-  for (int i = 1; i < argc; i++) {
-    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
-      const char* equals_sign = strchr(argv[i], '=');
-      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
-    } else {
-      cerr << "unrecognized option: " << argv[i] << endl << endl;
-      show_usage_and_exit(argv[0]);
-    }
+  cerr << "Measuring clock speed...                              \r" << flush;
+          
+  vector<float> all_gflops;
+  for (int i = 0; i < 8; i++) {
+    benchmark_t b(1024, 1024, 1024);
+    b.run();
+    all_gflops.push_back(b.gflops);
   }
 
-  cout.precision(4);
+  sort(all_gflops.begin(), all_gflops.end());
+  float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
 
-  print_cpuinfo();
+  // multiply by an arbitrary constant to discourage trying doing anything with the
+  // returned values besides just comparing them with each other.
+  float result = stable_estimate * 123.456f;
 
-  cout << "benchmark parameters:" << endl;
-  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
-  cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
-  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
-  cout << "minsize = " << minsize << endl;
-  cout << "maxsize = " << maxsize << endl;
-  cout << "measurement_repetitions = " << measurement_repetitions << endl;
-  cout << "min_accurate_time = " << min_accurate_time << endl;
-  cout << "min_working_set_size = " << min_working_set_size;
-  if (min_working_set_size == 0) {
-    cout << " (try to outsize caches)";
+  return result;
+}
+
+struct human_duration_t
+{
+  int seconds;
+  human_duration_t(int s) : seconds(s) {}
+};
+
+ostream& operator<<(ostream& s, const human_duration_t& d)
+{
+  int remainder = d.seconds;
+  if (remainder > 3600) {
+    int hours = remainder / 3600;
+    s << hours << " h ";
+    remainder -= hours * 3600;
   }
-  cout << endl << endl;
+  if (remainder > 60) {
+    int minutes = remainder / 60;
+    s << minutes << " min ";
+    remainder -= minutes * 60;
+  }
+  if (d.seconds < 600) {
+    s << remainder << " s";
+  }
+  return s;
+}
 
+const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
 
-  // assemble the array of benchmarks without running them at first
-  vector<benchmark_t> benchmarks;
-  for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
-    for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
-      for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
-        for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
-          for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
-            for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
-              for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
-                benchmark_t b(ksize, msize, nsize, kblock, mblock, nblock);
-                benchmarks.push_back(b);
-              }
-            }
+void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
+{
+  FILE* file = fopen(filename, "w");
+  if (!file) {
+    cerr << "Could not open file " << filename << " for writing." << endl;
+    cerr << "Do you have write permissions on the current working directory?" << endl;
+    exit(1);
+  }
+  size_t benchmarks_vector_size = benchmarks.size();
+  fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
+  fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
+  fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
+  fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
+  fclose(file);
+}
+
+bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
+{
+  FILE* file = fopen(filename, "r");
+  if (!file) {
+    return false;
+  }
+  if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
+    return false;
+  }
+  size_t benchmarks_vector_size = 0;
+  if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
+    return false;
+  }
+  if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
+    return false;
+  }
+  benchmarks.resize(benchmarks_vector_size);
+  if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
+    return false;
+  }
+  unlink(filename);
+  return true;
+}
+
+void try_run_some_benchmarks(
+  vector<benchmark_t>& benchmarks,
+  double time_start,
+  size_t& first_benchmark_to_run)
+{
+  if (first_benchmark_to_run == benchmarks.size()) {
+    return;
+  }
+
+  double time_last_progress_update = 0;
+  double time_last_clock_speed_measurement = 0;
+  double time_now = 0;
+
+  size_t benchmark_index = first_benchmark_to_run;
+
+  while (true) {
+    float ratio_done = float(benchmark_index) / benchmarks.size();
+    time_now = timer.getRealTime();
+
+    // We check clock speed every minute and at the end.
+    if (benchmark_index == benchmarks.size() ||
+        time_now > time_last_clock_speed_measurement + 60.0f)
+    {
+      time_last_clock_speed_measurement = time_now;
+
+      // Ensure that clock speed is as expected
+      float current_clock_speed = measure_clock_speed();
+
+      // The tolerance needs to be smaller than the relative difference between
+      // clock speeds that a device could operate under.
+      // It seems unlikely that a device would be throttling clock speeds by
+      // amounts smaller than 2%.
+      // With a value of 1%, I was getting within noise on a Sandy Bridge.
+      const float clock_speed_tolerance = 0.02f;
+
+      if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
+        // Clock speed is now higher than we previously measured.
+        // Either our initial measurement was inaccurate, which won't happen
+        // too many times as we are keeping the best clock speed value and
+        // and allowing some tolerance; or something really weird happened,
+        // which invalidates all benchmark results collected so far.
+        // Either way, we better restart all over again now.
+        if (benchmark_index) {
+          cerr << "Restarting at " << 100.0f * ratio_done
+               << " % because clock speed increased.          " << endl;
+        }
+        max_clock_speed = current_clock_speed;
+        first_benchmark_to_run = 0;
+        return;
+      }
+
+      bool rerun_last_tests = false;
+
+      if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+        cerr << "Measurements completed so far: "
+             << 100.0f * ratio_done
+             << " %                             " << endl;
+        cerr << "Clock speed seems to be only "
+             << current_clock_speed/max_clock_speed
+             << " times what it used to be." << endl;
+
+        unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
+
+        while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
+          if (seconds_to_sleep_if_lower_clock_speed > 32) {
+            cerr << "Sleeping longer probably won't make a difference." << endl;
+            cerr << "Serializing benchmarks to " << session_filename << endl;
+            serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
+            cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
+            exit(2);
           }
+          rerun_last_tests = true;
+          cerr << "Sleeping "
+               << seconds_to_sleep_if_lower_clock_speed
+               << " s...                                   \r" << endl;
+          sleep(seconds_to_sleep_if_lower_clock_speed);
+          current_clock_speed = measure_clock_speed();
+          seconds_to_sleep_if_lower_clock_speed *= 2;
         }
       }
+
+      if (rerun_last_tests) {
+        cerr << "Redoing the last "
+             << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
+             << " % because clock speed had been low.   " << endl;
+        return;
+      }
+
+      // nothing wrong with the clock speed so far, so there won't be a need to rerun
+      // benchmarks run so far in case we later encounter a lower clock speed.
+      first_benchmark_to_run = benchmark_index;
     }
-  }
 
-  // randomly shuffling benchmarks allows us to get accurate enough progress info,
-  // as now the cheap/expensive benchmarks are randomly mixed so they average out.
-  random_shuffle(benchmarks.begin(), benchmarks.end());
+    if (benchmark_index == benchmarks.size()) {
+      // We're done!
+      first_benchmark_to_run = benchmarks.size();
+      // Erase progress info
+      cerr << "                                                            " << endl;
+      return;
+    }
 
-  // timings here are only used to display progress info.
-  // Whence the use of real time.
-  double time_start = timer.getRealTime();
-  double time_last_progress_update = time_start;
-  for (size_t i = 0; i < benchmarks.size(); i++) {
     // Display progress info on stderr
-    double time_now = timer.getRealTime();
     if (time_now > time_last_progress_update + 1.0f) {
       time_last_progress_update = time_now;
-      float ratio_done = float(i) / benchmarks.size();
-      cerr.precision(3);
       cerr << "Measurements... " << 100.0f * ratio_done
-           << " %";
-
-      if (i > 10) {
-        cerr << ", ETA ";
-        float eta = float(time_now - time_start) * (1.0f - ratio_done) / ratio_done;
-        if (eta > 3600)
-          cerr << eta/3600 << " hours";
-        else if (eta > 60)
-          cerr << eta/60 << " minutes";
-        else cerr << eta << " seconds";
-      }
-      cerr << "                                              \r" << flush;
+           << " %, ETA "
+           << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
+           << "                          \r" << flush;
     }
 
     // This is where we actually run a benchmark!
-    benchmarks[i].run();
+    benchmarks[benchmark_index].run();
+    benchmark_index++;
+  }
+}
+
+void run_benchmarks(vector<benchmark_t>& benchmarks)
+{
+  size_t first_benchmark_to_run;
+  vector<benchmark_t> deserialized_benchmarks;
+  bool use_deserialized_benchmarks = false;
+  if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
+    cerr << "Found serialized session with "
+         << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
+         << " % already done" << endl;
+    if (deserialized_benchmarks.size() == benchmarks.size() &&
+        first_benchmark_to_run > 0 &&
+        first_benchmark_to_run < benchmarks.size())
+    {
+      use_deserialized_benchmarks = true;
+    }
   }
 
-  // Erase progress info
-  cerr << "                                                            " << endl;
+  if (use_deserialized_benchmarks) {
+    benchmarks = deserialized_benchmarks;
+  } else {
+    // not using deserialized benchmarks, starting from scratch
+    first_benchmark_to_run = 0;
+
+    // Randomly shuffling benchmarks allows us to get accurate enough progress info,
+    // as now the cheap/expensive benchmarks are randomly mixed so they average out.
+    // It also means that if data is corrupted for some time span, the odds are that
+    // not all repetitions of a given benchmark will be corrupted.
+    random_shuffle(benchmarks.begin(), benchmarks.end());
+  }
+
+  for (int i = 0; i < 4; i++) {
+    max_clock_speed = max(max_clock_speed, measure_clock_speed());
+  }
+  
+  double time_start = 0.0;
+  while (first_benchmark_to_run < benchmarks.size()) {
+    if (first_benchmark_to_run == 0) {
+      time_start = timer.getRealTime();
+    }
+    try_run_some_benchmarks(benchmarks,
+                            time_start,
+                            first_benchmark_to_run);
+  }
 
   // Sort timings by increasing benchmark parameters, and decreasing gflops.
   // The latter is very important. It means that we can ignore all but the first
@@ -348,9 +558,120 @@ int main(int argc, char* argv[])
     }
   }
 
-  // Output data.
-  cout << "BEGIN MEASUREMENTS" << endl;
-  for (auto it = best_benchmarks.begin(); it != best_benchmarks.end(); ++it) {
-    cout << *it << endl;
-  }
+  // keep and return only the best benchmarks
+  benchmarks = best_benchmarks;
+}
+
+struct measure_all_pot_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "all-pot-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
+              for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
+                for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
+                  benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+struct measure_default_sizes_action_t : action_t
+{
+  virtual const char* invokation_name() const { return "default-sizes"; }
+  virtual void run() const
+  {
+    vector<benchmark_t> benchmarks;
+    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
+      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
+        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
+          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
+            benchmarks.emplace_back(ksize, msize, nsize);
+          }
+        }
+      }
+    }
+
+    run_benchmarks(benchmarks);
+
+    cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
+    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
+      cout << *it << endl;
+    }
+  }
+};
+
+int main(int argc, char* argv[])
+{
+  double time_start = timer.getRealTime();
+  cout.precision(4);
+  cerr.precision(4);
+
+  vector<unique_ptr<action_t>> available_actions;
+  available_actions.emplace_back(new measure_all_pot_sizes_action_t);
+  available_actions.emplace_back(new measure_default_sizes_action_t);
+
+  auto action = available_actions.end();
+
+  if (argc <= 1) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
+    if (!strcmp(argv[1], (*it)->invokation_name())) {
+      action = it;
+      break;
+    }
+  }
+
+  if (action == available_actions.end()) {
+    show_usage_and_exit(argc, argv, available_actions);
+  }
+
+  for (int i = 2; i < argc; i++) {
+    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
+      const char* equals_sign = strchr(argv[i], '=');
+      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
+    } else {
+      cerr << "unrecognized option: " << argv[i] << endl << endl;
+      show_usage_and_exit(argc, argv, available_actions);
+    }
+  }
+
+  print_cpuinfo();
+
+  cout << "benchmark parameters:" << endl;
+  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
+  cout << "scalar type: " << type_name<Scalar>() << endl;
+  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
+  cout << "minsize = " << minsize << endl;
+  cout << "maxsize = " << maxsize << endl;
+  cout << "measurement_repetitions = " << measurement_repetitions << endl;
+  cout << "min_accurate_time = " << min_accurate_time << endl;
+  cout << "min_working_set_size = " << min_working_set_size;
+  if (min_working_set_size == 0) {
+    cout << " (try to outsize caches)";
+  }
+  cout << endl << endl;
+
+  (*action)->run();
+
+  double time_end = timer.getRealTime();
+  cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
 }
diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt
index f19b4287d..40a71c781 100644
--- a/bench/perf_monitoring/gemm/changesets.txt
+++ b/bench/perf_monitoring/gemm/changesets.txt
@@ -1,39 +1,45 @@
-3.0.1
-3.1.1
-3.2.0
+#3.0.1
+#3.1.1
+#3.2.0
 3.2.4
-5745:37f59e65eb6c
-5891:d8652709345d
-5893:24b4dc92c6d3
-5895:997c2ef9fc8b
-5904:e1eafd14eaa1
-5908:f8ee3c721251
-5921:ca808bb456b0
-5927:8b1001f9e3ac
-5937:5a4ca1ad8c53
-5949:f3488f4e45b2
-5969:e09031dccfd9
-5992:4a429f5e0483
+#5745:37f59e65eb6c
+5891:d8652709345d  # introduce AVX
+#5893:24b4dc92c6d3  # merge
+5895:997c2ef9fc8b  # introduce FMA
+#5904:e1eafd14eaa1  # complex and AVX
+5908:f8ee3c721251  # improve packing with ptranspose
+#5921:ca808bb456b0  # merge
+#5927:8b1001f9e3ac
+5937:5a4ca1ad8c53  # New gebp kernel handling up to 3 packets x 4 register-level blocks
+#5949:f3488f4e45b2  # merge
+#5969:e09031dccfd9  # Disable 3pX4 kernel on Altivec
+#5992:4a429f5e0483  # merge
 before-evaluators
-6334:f6a45e5b8b7c
-6639:c9121c60b5c7
-6655:06f163b5221f
-6677:700e023044e7   # FMA has been wrongly disabled
-6681:11d31dafb0e3
-6699:5e6e8e10aad1   # merge default to tensors
-6726:ff2d2388e7b9   # merge default to tensors
-6742:0cbd6195e829   # merge default to tensors
-6747:853d2bafeb8f   # Generalized the gebp apis
+#6334:f6a45e5b8b7c  # Implement evaluator for sparse outer products
+#6639:c9121c60b5c7
+#6655:06f163b5221f  # Properly detect FMA support on ARM
+#6677:700e023044e7   # FMA has been wrongly disabled
+#6681:11d31dafb0e3
+#6699:5e6e8e10aad1   # merge default to tensors
+#6726:ff2d2388e7b9   # merge default to tensors
+#6742:0cbd6195e829   # merge default to tensors
+#6747:853d2bafeb8f   # Generalized the gebp apis
 6765:71584fd55762   # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
-6781:9cc5a931b2c6   # generalized gemv
-6792:f6e1daab600a   # ensured that contractions that can be reduced to a matrix vector product
-6844:039efd86b75c   # merge tensor
+#6781:9cc5a931b2c6   # generalized gemv
+#6792:f6e1daab600a   # ensured that contractions that can be reduced to a matrix vector product
+#6844:039efd86b75c   # merge tensor
 6845:7333ed40c6ef   # change prefetching in gebp
-6856:b5be5e10eb7f   # merge index conversion
-6893:c3a64aba7c70   # clean blocking size computation
-6898:6fb31ebe6492   # rotating kernel for ARM
+#6856:b5be5e10eb7f   # merge index conversion
+#6893:c3a64aba7c70   # clean blocking size computation
+#6898:6fb31ebe6492   # rotating kernel for ARM
 6899:877facace746   # rotating kernel for ARM only
-6904:c250623ae9fa   # result_of
+#6904:c250623ae9fa   # result_of
 6921:915f1b1fc158   # fix prefetching change for ARM
 6923:9ff25f6dacc6   # prefetching
-6933:52572e60b5d3   # blocking size strategy
\ No newline at end of file
+6933:52572e60b5d3   # blocking size strategy
+6937:c8c042f286b2   # avoid redundant pack_rhs
+6981:7e5d6f78da59   # dynamic loop swapping
+6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
+6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
+7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+
diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh
index d3a9fadc9..3fa6a3661 100755
--- a/bench/perf_monitoring/gemm/run_gemm.sh
+++ b/bench/perf_monitoring/gemm/run_gemm.sh
@@ -6,6 +6,7 @@
 
 # Options:
 #   -up : enforce the recomputation of existing data, and keep best results as a merging strategy
+#   -s  : recompute selected changesets only and keep bests
 
 
 if echo "$*" | grep '\-up' > /dev/null; then
@@ -14,14 +15,30 @@ else
   update=false
 fi
 
-if [ $update == true ]; then
+if echo "$*" | grep '\-s' > /dev/null; then
+  selected=true
+else
+  selected=false
+fi
+
+global_args="$*"
+
+if [ $selected == true ]; then
+ echo "Recompute selected changesets only and keep bests"
+elif [ $update == true ]; then
  echo "(Re-)Compute all changesets and keep bests"
 else
  echo "Skip previously computed changesets"
 fi
 
+
+
 if [ ! -d "eigen_src" ]; then
   hg clone https://bitbucket.org/eigen/eigen eigen_src
+else
+  cd eigen_src
+  hg pull -u
+  cd ..
 fi
 
 if [ ! -z '$CXX' ]; then
@@ -61,17 +78,31 @@ function test_current
   scalar=$2
   name=$3
   
-  prev=`grep $rev "$name.backup" | cut -c 14-`
+  prev=""
+  if [ -e "$name.backup" ]; then
+    prev=`grep $rev "$name.backup" | cut -c 14-`
+  fi
   res=$prev
   count_rev=`echo $prev |  wc -w`
   count_ref=`cat "settings.txt" |  wc -l`
-  if [ $update == true ] || [ $count_rev != $count_ref ]; then
+  if echo "$global_args" | grep "$rev" > /dev/null; then
+    rev_found=true
+  else
+    rev_found=false
+  fi
+#  echo $update et $selected et $rev_found because $rev et "$global_args"
+#  echo $count_rev et $count_ref
+  if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] &&  [ $rev_found == true ]); then
     if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then
       curr=`./$name`
-      echo merge $prev
-      echo with  $curr
+      if [ $count_rev == $count_ref ]; then
+        echo "merge previous $prev"
+        echo "with new       $curr"
+      else
+        echo "got            $curr"
+      fi
       res=`merge "$curr" "$prev"`
-      echo $res
+#       echo $res
       echo "$rev $res" >> $name.out
     else
       echo "Compilation failed, skip rev $rev"
@@ -86,12 +117,12 @@ make_backup $PREFIX"sgemm"
 make_backup $PREFIX"dgemm"
 make_backup $PREFIX"cgemm"
 
-cut -f1 -d"#" < changesets.txt | while read rev
+cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev
 do
   if [ ! -z '$rev' ]; then
     echo "Testing rev $rev"
     cd eigen_src
-    hg up -C $rev
+    hg up -C $rev > /dev/null
     actual_rev=`hg identify | cut -f1 -d' '`
     cd ..
     
diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/settings.txt
index 6ef690708..5c43e1c7d 100644
--- a/bench/perf_monitoring/gemm/settings.txt
+++ b/bench/perf_monitoring/gemm/settings.txt
@@ -1,5 +1,6 @@
 8 8 8
 9 9 9
+24 24 24
 239 239 239
 240 240 240
 2400 24 24
@@ -8,4 +9,7 @@
 24 2400 2400
 2400 24 2400
 2400 2400 24
+2400 2400 64
+4800 23 160
+23 4800 160
 2400 2400 2400
diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp
index 8775b88cd..c373e8699 100644
--- a/blas/xerbla.cpp
+++ b/blas/xerbla.cpp
@@ -1,7 +1,7 @@
 
 #include <stdio.h>
 
-#if (defined __GNUC__) && (!defined __MINGW32__)
+#if (defined __GNUC__) && (!defined __MINGW32__) && (!defined __CYGWIN__)
 #define EIGEN_WEAK_LINKING __attribute__ ((weak))
 #else
 #define EIGEN_WEAK_LINKING
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index b4ab95dbc..f8aec777d 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -502,6 +502,10 @@ macro(ei_set_build_string)
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit)
   endif()
   
+  if(EIGEN_TEST_CXX11)
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11)
+  endif()
+  
   if(EIGEN_BUILD_STRING_SUFFIX)
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX})
   endif()
diff --git a/doc/CustomizingEigen.dox b/doc/CustomizingEigen.dox
index 0850863aa..cb25f4ec9 100644
--- a/doc/CustomizingEigen.dox
+++ b/doc/CustomizingEigen.dox
@@ -157,6 +157,67 @@ inline adouble abs2(const adouble& x)  { return x*x; }
 #endif // ADOLCSUPPORT_H
 \endcode
 
+This other example adds support for the \c mpq_class type from <a href="https://gmplib.org/">GMP</a>. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types.
+
+\code
+#include <gmpxx.h>
+#include <Eigen/Core>
+#include <boost/operators.hpp>
+
+namespace Eigen {
+  template<class> struct NumTraits;
+  template<> struct NumTraits<mpq_class>
+  {
+    typedef mpq_class Real;
+    typedef mpq_class NonInteger;
+    typedef mpq_class Nested;
+
+    static inline Real epsilon() { return 0; }
+    static inline Real dummy_precision() { return 0; }
+
+    enum {
+      IsInteger = 0,
+      IsSigned = 1,
+      IsComplex = 0,
+      RequireInitialization = 1,
+      ReadCost = 6,
+      AddCost = 150,
+      MulCost = 100
+    };
+  };
+
+  namespace internal {
+    template<>
+      struct significant_decimals_impl<mpq_class>
+      {
+	// Infinite precision when printing
+	static inline int run() { return 0; }
+      };
+
+    template<> struct scalar_score_coeff_op<mpq_class> {
+      struct result_type : boost::totally_ordered1<result_type> {
+	std::size_t len;
+	result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
+	result_type(mpq_class const& q) :
+	  len(mpz_size(q.get_num_mpz_t())+
+	      mpz_size(q.get_den_mpz_t())-1) {}
+	friend bool operator<(result_type x, result_type y) {
+	  // 0 is the worst possible pivot
+	  if (x.len == 0) return y.len > 0;
+	  if (y.len == 0) return false;
+	  // Prefer a pivot with a small representation
+	  return x.len > y.len;
+	}
+	friend bool operator==(result_type x, result_type y) {
+	  // Only used to test if the score is 0
+	  return x.len == y.len;
+	}
+      };
+      result_type operator()(mpq_class const& x) const { return x; }
+    };
+  }
+}
+\endcode
 
 \sa \ref TopicPreprocessorDirectives
 
diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox
index 147b55376..13741280a 100644
--- a/doc/SparseLinearSystems.dox
+++ b/doc/SparseLinearSystems.dox
@@ -21,6 +21,9 @@ They are summarized in the following table:
 <tr><td>ConjugateGradient</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Classic iterative CG</td><td>SPD</td><td>Preconditionning</td>
     <td>built-in, MPL2</td>
     <td>Recommended for large symmetric problems (e.g., 3D Poisson eq.)</td></tr>
+<tr><td>LSCG</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>CG for rectangular least-square problem</td><td>Rectangular</td><td>Preconditionning</td>
+    <td>built-in, MPL2</td>
+    <td>Solve for min |A'Ax-b|^2 without forming A'A</td></tr>
 <tr><td>BiCGSTAB</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Iterative stabilized bi-conjugate gradient</td><td>Square</td><td>Preconditionning</td>
     <td>built-in, MPL2</td>
     <td>To speedup the convergence, try it with the \ref IncompleteLUT preconditioner.</td></tr>
diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt
index c8795a344..d3e82ecd9 100644
--- a/failtest/CMakeLists.txt
+++ b/failtest/CMakeLists.txt
@@ -47,6 +47,18 @@ ei_add_failtest("sparse_ref_3")
 ei_add_failtest("sparse_ref_4")
 ei_add_failtest("sparse_ref_5")
 
+ei_add_failtest("partialpivlu_int")
+ei_add_failtest("fullpivlu_int")
+ei_add_failtest("llt_int")
+ei_add_failtest("ldlt_int")
+ei_add_failtest("qr_int")
+ei_add_failtest("colpivqr_int")
+ei_add_failtest("fullpivqr_int")
+ei_add_failtest("jacobisvd_int")
+ei_add_failtest("bdcsvd_int")
+ei_add_failtest("eigensolver_int")
+ei_add_failtest("eigensolver_cplx")
+
 if (EIGEN_FAILTEST_FAILURE_COUNT)
   message(FATAL_ERROR
           "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED. "
diff --git a/failtest/bdcsvd_int.cpp b/failtest/bdcsvd_int.cpp
new file mode 100644
index 000000000..670752cf5
--- /dev/null
+++ b/failtest/bdcsvd_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/SVD"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  BDCSVD<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/colpivqr_int.cpp b/failtest/colpivqr_int.cpp
new file mode 100644
index 000000000..db11910d4
--- /dev/null
+++ b/failtest/colpivqr_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/QR"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  ColPivHouseholderQR<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/eigensolver_cplx.cpp b/failtest/eigensolver_cplx.cpp
new file mode 100644
index 000000000..c2e21e189
--- /dev/null
+++ b/failtest/eigensolver_cplx.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/Eigenvalues"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR std::complex<double>
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  EigenSolver<Matrix<SCALAR,Dynamic,Dynamic> > eig(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/eigensolver_int.cpp b/failtest/eigensolver_int.cpp
new file mode 100644
index 000000000..eda8dc20b
--- /dev/null
+++ b/failtest/eigensolver_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/Eigenvalues"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  EigenSolver<Matrix<SCALAR,Dynamic,Dynamic> > eig(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/fullpivlu_int.cpp b/failtest/fullpivlu_int.cpp
new file mode 100644
index 000000000..e9d2c6eb3
--- /dev/null
+++ b/failtest/fullpivlu_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/LU"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  FullPivLU<Matrix<SCALAR,Dynamic,Dynamic> > lu(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/fullpivqr_int.cpp b/failtest/fullpivqr_int.cpp
new file mode 100644
index 000000000..d182a7b6b
--- /dev/null
+++ b/failtest/fullpivqr_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/QR"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  FullPivHouseholderQR<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/jacobisvd_int.cpp b/failtest/jacobisvd_int.cpp
new file mode 100644
index 000000000..12790aef1
--- /dev/null
+++ b/failtest/jacobisvd_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/SVD"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  JacobiSVD<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/ldlt_int.cpp b/failtest/ldlt_int.cpp
new file mode 100644
index 000000000..243e45746
--- /dev/null
+++ b/failtest/ldlt_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/Cholesky"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  LDLT<Matrix<SCALAR,Dynamic,Dynamic> > ldlt(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/llt_int.cpp b/failtest/llt_int.cpp
new file mode 100644
index 000000000..cb020650d
--- /dev/null
+++ b/failtest/llt_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/Cholesky"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  LLT<Matrix<SCALAR,Dynamic,Dynamic> > llt(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/partialpivlu_int.cpp b/failtest/partialpivlu_int.cpp
new file mode 100644
index 000000000..98ef282ea
--- /dev/null
+++ b/failtest/partialpivlu_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/LU"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  PartialPivLU<Matrix<SCALAR,Dynamic,Dynamic> > lu(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/failtest/qr_int.cpp b/failtest/qr_int.cpp
new file mode 100644
index 000000000..ce200e818
--- /dev/null
+++ b/failtest/qr_int.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/QR"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define SCALAR int
+#else
+#define SCALAR float
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  HouseholderQR<Matrix<SCALAR,Dynamic,Dynamic> > qr(Matrix<SCALAR,Dynamic,Dynamic>::Random(10,10));
+}
diff --git a/scripts/buildtests.in b/scripts/buildtests.in
index 7026373cf..d2fd10276 100755
--- a/scripts/buildtests.in
+++ b/scripts/buildtests.in
@@ -14,9 +14,9 @@ targets_to_make=`echo "$TESTSLIST" | egrep "$1" | xargs echo`
 
 if [ -n "${EIGEN_MAKE_ARGS:+x}" ]
 then
-  make $targets_to_make ${EIGEN_MAKE_ARGS}
+  @CMAKE_MAKE_PROGRAM@ $targets_to_make ${EIGEN_MAKE_ARGS}
 else
-  make $targets_to_make
+  @CMAKE_MAKE_PROGRAM@ $targets_to_make @EIGEN_TEST_BUILD_FLAGS@
 fi
 exit $?
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 168749634..393c35b57 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -139,6 +139,7 @@ endif(TEST_LIB)
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official")
 add_custom_target(BuildOfficial)
 
+ei_add_test(rand)
 ei_add_test(meta)
 ei_add_test(sizeof)
 ei_add_test(dynalloc)
@@ -226,6 +227,7 @@ ei_add_test(stdvector_overload)
 ei_add_test(stdlist)
 ei_add_test(stddeque)
 ei_add_test(sparse_basic)
+ei_add_test(sparse_block)
 ei_add_test(sparse_vector)
 ei_add_test(sparse_product)
 ei_add_test(sparse_ref)
@@ -234,6 +236,7 @@ ei_add_test(sparse_permutations)
 ei_add_test(simplicial_cholesky)
 ei_add_test(conjugate_gradient)
 ei_add_test(bicgstab)
+ei_add_test(lscg)
 ei_add_test(sparselu)
 ei_add_test(sparseqr)
 ei_add_test(umeyama)
diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp
index f327e2fac..7a9a11330 100644
--- a/test/bicgstab.cpp
+++ b/test/bicgstab.cpp
@@ -10,11 +10,11 @@
 #include "sparse_solver.h"
 #include <Eigen/IterativeLinearSolvers>
 
-template<typename T> void test_bicgstab_T()
+template<typename T, typename I> void test_bicgstab_T()
 {
-  BiCGSTAB<SparseMatrix<T>, DiagonalPreconditioner<T> > bicgstab_colmajor_diag;
-  BiCGSTAB<SparseMatrix<T>, IdentityPreconditioner    > bicgstab_colmajor_I;
-  BiCGSTAB<SparseMatrix<T>, IncompleteLUT<T> >           bicgstab_colmajor_ilut;
+  BiCGSTAB<SparseMatrix<T,0,I>, DiagonalPreconditioner<T> >     bicgstab_colmajor_diag;
+  BiCGSTAB<SparseMatrix<T,0,I>, IdentityPreconditioner    >     bicgstab_colmajor_I;
+  BiCGSTAB<SparseMatrix<T,0,I>, IncompleteLUT<T,I> >              bicgstab_colmajor_ilut;
   //BiCGSTAB<SparseMatrix<T>, SSORPreconditioner<T> >     bicgstab_colmajor_ssor;
 
   CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_diag)  );
@@ -25,6 +25,7 @@ template<typename T> void test_bicgstab_T()
 
 void test_bicgstab()
 {
-  CALL_SUBTEST_1(test_bicgstab_T<double>());
-  CALL_SUBTEST_2(test_bicgstab_T<std::complex<double> >());
+  CALL_SUBTEST_1((test_bicgstab_T<double,int>()) );
+  CALL_SUBTEST_2((test_bicgstab_T<std::complex<double>, int>()));
+  CALL_SUBTEST_3((test_bicgstab_T<double,long int>()));
 }
diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp
index 019cc4d64..9622fd86d 100644
--- a/test/conjugate_gradient.cpp
+++ b/test/conjugate_gradient.cpp
@@ -10,13 +10,14 @@
 #include "sparse_solver.h"
 #include <Eigen/IterativeLinearSolvers>
 
-template<typename T> void test_conjugate_gradient_T()
+template<typename T, typename I> void test_conjugate_gradient_T()
 {
-  ConjugateGradient<SparseMatrix<T>, Lower      > cg_colmajor_lower_diag;
-  ConjugateGradient<SparseMatrix<T>, Upper      > cg_colmajor_upper_diag;
-  ConjugateGradient<SparseMatrix<T>, Lower|Upper> cg_colmajor_loup_diag;
-  ConjugateGradient<SparseMatrix<T>, Lower, IdentityPreconditioner> cg_colmajor_lower_I;
-  ConjugateGradient<SparseMatrix<T>, Upper, IdentityPreconditioner> cg_colmajor_upper_I;
+  typedef SparseMatrix<T,0,I> SparseMatrixType;
+  ConjugateGradient<SparseMatrixType, Lower      > cg_colmajor_lower_diag;
+  ConjugateGradient<SparseMatrixType, Upper      > cg_colmajor_upper_diag;
+  ConjugateGradient<SparseMatrixType, Lower|Upper> cg_colmajor_loup_diag;
+  ConjugateGradient<SparseMatrixType, Lower, IdentityPreconditioner> cg_colmajor_lower_I;
+  ConjugateGradient<SparseMatrixType, Upper, IdentityPreconditioner> cg_colmajor_upper_I;
 
   CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag)  );
   CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag)  );
@@ -27,6 +28,7 @@ template<typename T> void test_conjugate_gradient_T()
 
 void test_conjugate_gradient()
 {
-  CALL_SUBTEST_1(test_conjugate_gradient_T<double>());
-  CALL_SUBTEST_2(test_conjugate_gradient_T<std::complex<double> >());
+  CALL_SUBTEST_1(( test_conjugate_gradient_T<double,int>() ));
+  CALL_SUBTEST_2(( test_conjugate_gradient_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_conjugate_gradient_T<double,long int>() ));
 }
diff --git a/test/lscg.cpp b/test/lscg.cpp
new file mode 100644
index 000000000..daa62a954
--- /dev/null
+++ b/test/lscg.cpp
@@ -0,0 +1,29 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse_solver.h"
+#include <Eigen/IterativeLinearSolvers>
+
+template<typename T> void test_lscg_T()
+{
+  LeastSquaresConjugateGradient<SparseMatrix<T> > lscg_colmajor_diag;
+  LeastSquaresConjugateGradient<SparseMatrix<T>, IdentityPreconditioner> lscg_colmajor_I;
+
+  CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I)     );
+  
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_I)     );
+}
+
+void test_lscg()
+{
+  CALL_SUBTEST_1(test_lscg_T<double>());
+  CALL_SUBTEST_2(test_lscg_T<std::complex<double> >());
+}
diff --git a/test/main.h b/test/main.h
index 1f937690c..ecf0c6924 100644
--- a/test/main.h
+++ b/test/main.h
@@ -42,13 +42,19 @@
 #include <deque>
 #include <queue>
 #include <list>
+#if __cplusplus >= 201103L
+#include <random>
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+#endif
 
 // To test that all calls from Eigen code to std::min() and std::max() are
 // protected by parenthesis against macro expansion, the min()/max() macros
 // are defined here and any not-parenthesized min/max call will cause a
 // compiler error.
-//#define min(A,B) please_protect_your_min_with_parentheses
-//#define max(A,B) please_protect_your_max_with_parentheses
+#define min(A,B) please_protect_your_min_with_parentheses
+#define max(A,B) please_protect_your_max_with_parentheses
 
 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index 744a1ef7f..1b4c6c33c 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -109,8 +109,33 @@ void mat_mat_scalar_scalar_product()
   double det = 6.0, wt = 0.5;
   VERIFY_IS_APPROX(dNdxy.transpose()*dNdxy*det*wt, det*wt*dNdxy.transpose()*dNdxy);
 }
+
+template <typename MatrixType> 
+void zero_sized_objects(const MatrixType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
   
-void zero_sized_objects()
+  {
+    MatrixType res, a(rows,0), b(0,cols);
+    VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(rows,cols) );
+    VERIFY_IS_APPROX( (res=a*a.transpose()), MatrixType::Zero(rows,rows) );
+    VERIFY_IS_APPROX( (res=b.transpose()*b), MatrixType::Zero(cols,cols) );
+    VERIFY_IS_APPROX( (res=b.transpose()*a.transpose()), MatrixType::Zero(cols,rows) );
+  }
+  
+  {
+    MatrixType res, a(rows,cols), b(cols,0);
+    res = a*b;
+    VERIFY(res.rows()==rows && res.cols()==0);
+    b.resize(0,rows);
+    res = b*a;
+    VERIFY(res.rows()==0 && res.cols()==cols);
+  }
+}
+
+
+void bug_127()
 {
   // Bug 127
   //
@@ -171,7 +196,8 @@ void test_product_extra()
     CALL_SUBTEST_2( mat_mat_scalar_scalar_product() );
     CALL_SUBTEST_3( product_extra(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
-  CALL_SUBTEST_5( zero_sized_objects() );
+  CALL_SUBTEST_5( bug_127() );
   CALL_SUBTEST_6( unaligned_objects() );
 }
diff --git a/test/rand.cpp b/test/rand.cpp
new file mode 100644
index 000000000..7c8068a3b
--- /dev/null
+++ b/test/rand.cpp
@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename Scalar> Scalar check_in_range(Scalar x, Scalar y)
+{
+  Scalar r = internal::random<Scalar>(x,y);
+  VERIFY(r>=x);
+  if(y>=x)
+  {
+    VERIFY(r<=y);
+  }
+  return r;
+}
+
+template<typename Scalar> void check_all_in_range(Scalar x, Scalar y)
+{
+  Array<int,1,Dynamic> mask(y-x+1);
+  mask.fill(0);
+  long n = (y-x+1)*32;
+  for(long k=0; k<n; ++k)
+  {
+    mask( check_in_range(x,y)-x )++;
+  }
+  VERIFY( (mask>0).all() );
+}
+
+void test_rand()
+{
+  long long_ref = NumTraits<long>::highest()/10;
+  char char_offset = (std::min)(g_repeat,64);
+  char short_offset = (std::min)(g_repeat,16000);
+  
+  for(int i = 0; i < g_repeat*10; i++) {
+    CALL_SUBTEST(check_in_range<float>(10,11));
+    CALL_SUBTEST(check_in_range<float>(1.24234523,1.24234523));
+    CALL_SUBTEST(check_in_range<float>(-1,1));
+    CALL_SUBTEST(check_in_range<float>(-1432.2352,-1432.2352));
+    
+    CALL_SUBTEST(check_in_range<double>(10,11));
+    CALL_SUBTEST(check_in_range<double>(1.24234523,1.24234523));
+    CALL_SUBTEST(check_in_range<double>(-1,1));
+    CALL_SUBTEST(check_in_range<double>(-1432.2352,-1432.2352));
+    
+    CALL_SUBTEST(check_in_range<int>(0,-1));
+    CALL_SUBTEST(check_in_range<short>(0,-1));
+    CALL_SUBTEST(check_in_range<long>(0,-1));
+    CALL_SUBTEST(check_in_range<int>(-673456,673456));
+    CALL_SUBTEST(check_in_range<short>(-24345,24345));
+    CALL_SUBTEST(check_in_range<long>(-long_ref,long_ref));
+  }
+  
+  CALL_SUBTEST(check_all_in_range<char>(11,11));
+  CALL_SUBTEST(check_all_in_range<char>(11,11+char_offset));
+  CALL_SUBTEST(check_all_in_range<char>(-5,5));
+  CALL_SUBTEST(check_all_in_range<char>(-11-char_offset,-11));
+  CALL_SUBTEST(check_all_in_range<char>(-126,-126+char_offset));
+  CALL_SUBTEST(check_all_in_range<char>(126-char_offset,126));
+  CALL_SUBTEST(check_all_in_range<char>(-126,126));
+  
+  CALL_SUBTEST(check_all_in_range<short>(11,11));
+  CALL_SUBTEST(check_all_in_range<short>(11,11+short_offset));
+  CALL_SUBTEST(check_all_in_range<short>(-5,5));
+  CALL_SUBTEST(check_all_in_range<short>(-11-short_offset,-11));
+  CALL_SUBTEST(check_all_in_range<short>(-24345,-24345+short_offset));
+  CALL_SUBTEST(check_all_in_range<short>(24345,24345+short_offset));
+  
+  CALL_SUBTEST(check_all_in_range<int>(11,11));
+  CALL_SUBTEST(check_all_in_range<int>(11,11+g_repeat));
+  CALL_SUBTEST(check_all_in_range<int>(-5,5));
+  CALL_SUBTEST(check_all_in_range<int>(-11-g_repeat,-11));
+  CALL_SUBTEST(check_all_in_range<int>(-673456,-673456+g_repeat));
+  CALL_SUBTEST(check_all_in_range<int>(673456,673456+g_repeat));
+  
+  CALL_SUBTEST(check_all_in_range<long>(11,11));
+  CALL_SUBTEST(check_all_in_range<long>(11,11+g_repeat));
+  CALL_SUBTEST(check_all_in_range<long>(-5,5));
+  CALL_SUBTEST(check_all_in_range<long>(-11-g_repeat,-11));
+  CALL_SUBTEST(check_all_in_range<long>(-long_ref,-long_ref+g_repeat));
+  CALL_SUBTEST(check_all_in_range<long>( long_ref, long_ref+g_repeat));
+}
diff --git a/test/ref.cpp b/test/ref.cpp
index b9470213c..fbe2c450f 100644
--- a/test/ref.cpp
+++ b/test/ref.cpp
@@ -228,6 +228,28 @@ void call_ref()
   VERIFY_EVALUATION_COUNT( call_ref_7(c,c), 0);
 }
 
+typedef Matrix<double,Dynamic,Dynamic,RowMajor> RowMatrixXd;
+int test_ref_overload_fun1(Ref<MatrixXd> )       { return 1; }
+int test_ref_overload_fun1(Ref<RowMatrixXd> )    { return 2; }
+int test_ref_overload_fun1(Ref<MatrixXf> )       { return 3; }
+
+int test_ref_overload_fun2(Ref<const MatrixXd> ) { return 4; }
+int test_ref_overload_fun2(Ref<const MatrixXf> ) { return 5; }
+
+// See also bug 969
+void test_ref_overloads()
+{
+  MatrixXd Ad, Bd;
+  RowMatrixXd rAd, rBd;
+  VERIFY( test_ref_overload_fun1(Ad)==1 );
+  VERIFY( test_ref_overload_fun1(rAd)==2 );
+  
+  MatrixXf Af, Bf;
+  VERIFY( test_ref_overload_fun2(Ad)==4 );
+  VERIFY( test_ref_overload_fun2(Ad+Bd)==4 );
+  VERIFY( test_ref_overload_fun2(Af+Bf)==5 );
+}
+
 void test_ref()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -248,4 +270,6 @@ void test_ref()
     CALL_SUBTEST_5( ref_matrix(MatrixXi(internal::random<int>(1,10),internal::random<int>(1,10))) );
     CALL_SUBTEST_6( call_ref() );
   }
+  
+  CALL_SUBTEST_7( test_ref_overloads() );
 }
diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp
index 786468421..b7cc2d351 100644
--- a/test/simplicial_cholesky.cpp
+++ b/test/simplicial_cholesky.cpp
@@ -9,16 +9,17 @@
 
 #include "sparse_solver.h"
 
-template<typename T> void test_simplicial_cholesky_T()
+template<typename T, typename I> void test_simplicial_cholesky_T()
 {
-  SimplicialCholesky<SparseMatrix<T>, Lower> chol_colmajor_lower_amd;
-  SimplicialCholesky<SparseMatrix<T>, Upper> chol_colmajor_upper_amd;
-  SimplicialLLT<SparseMatrix<T>, Lower> llt_colmajor_lower_amd;
-  SimplicialLLT<SparseMatrix<T>, Upper> llt_colmajor_upper_amd;
-  SimplicialLDLT<SparseMatrix<T>, Lower> ldlt_colmajor_lower_amd;
-  SimplicialLDLT<SparseMatrix<T>, Upper> ldlt_colmajor_upper_amd;
-  SimplicialLDLT<SparseMatrix<T>, Lower, NaturalOrdering<int> > ldlt_colmajor_lower_nat;
-  SimplicialLDLT<SparseMatrix<T>, Upper, NaturalOrdering<int> > ldlt_colmajor_upper_nat;
+  typedef SparseMatrix<T,0,I> SparseMatrixType;
+  SimplicialCholesky<SparseMatrixType, Lower> chol_colmajor_lower_amd;
+  SimplicialCholesky<SparseMatrixType, Upper> chol_colmajor_upper_amd;
+  SimplicialLLT<     SparseMatrixType, Lower> llt_colmajor_lower_amd;
+  SimplicialLLT<     SparseMatrixType, Upper> llt_colmajor_upper_amd;
+  SimplicialLDLT<    SparseMatrixType, Lower> ldlt_colmajor_lower_amd;
+  SimplicialLDLT<    SparseMatrixType, Upper> ldlt_colmajor_upper_amd;
+  SimplicialLDLT<    SparseMatrixType, Lower, NaturalOrdering<I> > ldlt_colmajor_lower_nat;
+  SimplicialLDLT<    SparseMatrixType, Upper, NaturalOrdering<I> > ldlt_colmajor_upper_nat;
 
   check_sparse_spd_solving(chol_colmajor_lower_amd);
   check_sparse_spd_solving(chol_colmajor_upper_amd);
@@ -40,6 +41,7 @@ template<typename T> void test_simplicial_cholesky_T()
 
 void test_simplicial_cholesky()
 {
-  CALL_SUBTEST_1(test_simplicial_cholesky_T<double>());
-  CALL_SUBTEST_2(test_simplicial_cholesky_T<std::complex<double> >());
+  CALL_SUBTEST_1(( test_simplicial_cholesky_T<double,int>() ));
+  CALL_SUBTEST_2(( test_simplicial_cholesky_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_simplicial_cholesky_T<double,long int>() ));
 }
diff --git a/test/sparse.h b/test/sparse.h
index 81ab9e873..3c3a0c9be 100644
--- a/test/sparse.h
+++ b/test/sparse.h
@@ -53,15 +53,15 @@ enum {
  * \param zeroCoords and nonzeroCoords allows to get the coordinate lists of the non zero,
  *        and zero coefficients respectively.
  */
-template<typename Scalar,int Opt1,int Opt2,typename Index> void
+template<typename Scalar,int Opt1,int Opt2,typename StorageIndex> void
 initSparse(double density,
            Matrix<Scalar,Dynamic,Dynamic,Opt1>& refMat,
-           SparseMatrix<Scalar,Opt2,Index>& sparseMat,
+           SparseMatrix<Scalar,Opt2,StorageIndex>& sparseMat,
            int flags = 0,
-           std::vector<Matrix<Index,2,1> >* zeroCoords = 0,
-           std::vector<Matrix<Index,2,1> >* nonzeroCoords = 0)
+           std::vector<Matrix<StorageIndex,2,1> >* zeroCoords = 0,
+           std::vector<Matrix<StorageIndex,2,1> >* nonzeroCoords = 0)
 {
-  enum { IsRowMajor = SparseMatrix<Scalar,Opt2,Index>::IsRowMajor };
+  enum { IsRowMajor = SparseMatrix<Scalar,Opt2,StorageIndex>::IsRowMajor };
   sparseMat.setZero();
   //sparseMat.reserve(int(refMat.rows()*refMat.cols()*density));
   sparseMat.reserve(VectorXi::Constant(IsRowMajor ? refMat.rows() : refMat.cols(), int((1.5*density)*(IsRowMajor?refMat.cols():refMat.rows()))));
@@ -93,11 +93,11 @@ initSparse(double density,
         //sparseMat.insertBackByOuterInner(j,i) = v;
         sparseMat.insertByOuterInner(j,i) = v;
         if (nonzeroCoords)
-          nonzeroCoords->push_back(Matrix<Index,2,1> (ai,aj));
+          nonzeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
       }
       else if (zeroCoords)
       {
-        zeroCoords->push_back(Matrix<Index,2,1> (ai,aj));
+        zeroCoords->push_back(Matrix<StorageIndex,2,1> (ai,aj));
       }
       refMat(ai,aj) = v;
     }
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 8021f4db6..75f29a2b4 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -9,6 +9,9 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+static long g_realloc_count = 0;
+#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
+
 #include "sparse.h"
 
 template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& ref)
@@ -55,48 +58,6 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
 
     VERIFY_IS_APPROX(m, refMat);
 
-      // test InnerIterators and Block expressions
-      for (Index t=0; t<10; ++t)
-      {
-        Index j = internal::random<Index>(0,cols-1);
-        Index i = internal::random<Index>(0,rows-1);
-        Index w = internal::random<Index>(1,cols-j-1);
-        Index h = internal::random<Index>(1,rows-i-1);
-
-         VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w));
-        for(Index c=0; c<w; c++)
-        {
-          VERIFY_IS_APPROX(m.block(i,j,h,w).col(c), refMat.block(i,j,h,w).col(c));
-          for(Index r=0; r<h; r++)
-          {
-            // FIXME col().coeff() not implemented yet
-//             VERIFY_IS_APPROX(m.block(i,j,h,w).col(c).coeff(r), refMat.block(i,j,h,w).col(c).coeff(r));
-          }
-        }
-         for(Index r=0; r<h; r++)
-         {
-           VERIFY_IS_APPROX(m.block(i,j,h,w).row(r), refMat.block(i,j,h,w).row(r));
-           for(Index c=0; c<w; c++)
-           {
-             // FIXME row().coeff() not implemented yet
-//             VERIFY_IS_APPROX(m.block(i,j,h,w).row(r).coeff(c), refMat.block(i,j,h,w).row(r).coeff(c));
-           }
-         }
-      }
-
-      for(Index c=0; c<cols; c++)
-      {
-        VERIFY_IS_APPROX(m.col(c) + m.col(c), (m + m).col(c));
-        VERIFY_IS_APPROX(m.col(c) + m.col(c), refMat.col(c) + refMat.col(c));
-      }
-
-      for(Index r=0; r<rows; r++)
-      {
-        VERIFY_IS_APPROX(m.row(r) + m.row(r), (m + m).row(r));
-        VERIFY_IS_APPROX(m.row(r) + m.row(r), refMat.row(r) + refMat.row(r));
-      }
-      
-
       // test assertion
       VERIFY_RAISES_ASSERT( m.coeffRef(-1,1) = 0 );
       VERIFY_RAISES_ASSERT( m.coeffRef(0,m.cols()) = 0 );
@@ -107,17 +68,31 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       DenseMatrix m1(rows,cols);
       m1.setZero();
       SparseMatrixType m2(rows,cols);
-      if(internal::random<int>()%2)
-        m2.reserve(VectorXi::Constant(m2.outerSize(), 2));
+      bool call_reserve = internal::random<int>()%2;
+      Index nnz = internal::random<int>(1,int(rows)/2);
+      if(call_reserve)
+      {
+        if(internal::random<int>()%2)
+          m2.reserve(VectorXi::Constant(m2.outerSize(), int(nnz)));
+        else
+          m2.reserve(m2.outerSize() * nnz);
+      }
+      g_realloc_count = 0;
       for (Index j=0; j<cols; ++j)
       {
-        for (Index k=0; k<rows/2; ++k)
+        for (Index k=0; k<nnz; ++k)
         {
           Index i = internal::random<Index>(0,rows-1);
           if (m1.coeff(i,j)==Scalar(0))
             m2.insert(i,j) = m1(i,j) = internal::random<Scalar>();
         }
       }
+      
+      if(call_reserve && !SparseMatrixType::IsRowMajor)
+      {
+        VERIFY(g_realloc_count==0);
+      }
+      
       m2.finalize();
       VERIFY_IS_APPROX(m2,m1);
     }
@@ -167,82 +142,6 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       VERIFY_IS_APPROX(m2,m1);
     }
 
-  // test innerVector()
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
-    SparseMatrixType m2(rows, cols);
-    initSparse<Scalar>(density, refMat2, m2);
-    Index j0 = internal::random<Index>(0,outer-1);
-    Index j1 = internal::random<Index>(0,outer-1);
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0));
-
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1));
-
-    SparseMatrixType m3(rows,cols);
-    m3.reserve(VectorXi::Constant(outer,int(inner/2)));
-    for(Index j=0; j<outer; ++j)
-      for(Index k=0; k<(std::min)(j,inner); ++k)
-        m3.insertByOuterInner(j,k) = k+1;
-    for(Index j=0; j<(std::min)(outer, inner); ++j)
-    {
-      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
-      if(j>0)
-        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
-    }
-    m3.makeCompressed();
-    for(Index j=0; j<(std::min)(outer, inner); ++j)
-    {
-      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
-      if(j>0)
-        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
-    }
-
-    VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros());
-
-//     m2.innerVector(j0) = 2*m2.innerVector(j1);
-//     refMat2.col(j0) = 2*refMat2.col(j1);
-//     VERIFY_IS_APPROX(m2, refMat2);
-  }
-
-  // test innerVectors()
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
-    SparseMatrixType m2(rows, cols);
-    initSparse<Scalar>(density, refMat2, m2);
-    if(internal::random<float>(0,1)>0.5) m2.makeCompressed();
-    Index j0 = internal::random<Index>(0,outer-2);
-    Index j1 = internal::random<Index>(0,outer-2);
-    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols));
-    else
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0));
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
-                       refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0));
-    else
-      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
-                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
-    
-    VERIFY_IS_APPROX(m2, refMat2);
-    
-    VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros());
-    
-    m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0);
-    if(SparseMatrixType::IsRowMajor)
-      refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval();
-    else
-      refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval();
-    
-    VERIFY_IS_APPROX(m2, refMat2);
-  }
-
   // test basic computations
   {
     DenseMatrix refM1 = DenseMatrix::Zero(rows, cols);
@@ -313,40 +212,6 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     VERIFY(m2.isApprox(m3));
   }
 
-  
-  
-  // test generic blocks
-  {
-    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
-    SparseMatrixType m2(rows, cols);
-    initSparse<Scalar>(density, refMat2, m2);
-    Index j0 = internal::random<Index>(0,outer-2);
-    Index j1 = internal::random<Index>(0,outer-2);
-    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols));
-    else
-      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0));
-    
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols),
-                      refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols));
-    else
-      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0),
-                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
-      
-    Index i = internal::random<Index>(0,m2.outerSize()-1);
-    if(SparseMatrixType::IsRowMajor) {
-      m2.innerVector(i) = m2.innerVector(i) * s1;
-      refMat2.row(i) = refMat2.row(i) * s1;
-      VERIFY_IS_APPROX(m2,refMat2);
-    } else {
-      m2.innerVector(i) = m2.innerVector(i) * s1;
-      refMat2.col(i) = refMat2.col(i) * s1;
-      VERIFY_IS_APPROX(m2,refMat2);
-    }
-  }
-
   // test prune
   {
     SparseMatrixType m2(rows, cols);
@@ -575,7 +440,7 @@ void big_sparse_triplet(Index rows, Index cols, double density) {
 void test_sparse_basic()
 {
   for(int i = 0; i < g_repeat; i++) {
-    int r = Eigen::internal::random<int>(1,100), c = Eigen::internal::random<int>(1,100);
+    int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
     if(Eigen::internal::random<int>(0,4) == 0) {
       r = c; // check square matrices in 25% of tries
     }
@@ -585,11 +450,17 @@ void test_sparse_basic()
     CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, ColMajor>(r, c)) ));
     CALL_SUBTEST_2(( sparse_basic(SparseMatrix<std::complex<double>, RowMajor>(r, c)) ));
     CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double>(r, c)) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,ColMajor,long int>(r, c)) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,RowMajor,long int>(r, c)) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,ColMajor,long int>(r, c)) ));
+    CALL_SUBTEST_5(( sparse_basic(SparseMatrix<double,RowMajor,long int>(r, c)) ));
     
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
-    CALL_SUBTEST_1(( sparse_basic(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+    r = Eigen::internal::random<int>(1,100);
+    c = Eigen::internal::random<int>(1,100);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    
+    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_6(( sparse_basic(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
   }
 
   // Regression test for bug 900: (manually insert higher values here, if you have enough RAM):
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
new file mode 100644
index 000000000..8a6e0687c
--- /dev/null
+++ b/test/sparse_block.cpp
@@ -0,0 +1,254 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "sparse.h"
+
+template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
+{
+  const Index rows = ref.rows();
+  const Index cols = ref.cols();
+  const Index inner = ref.innerSize();
+  const Index outer = ref.outerSize();
+
+  typedef typename SparseMatrixType::Scalar Scalar;
+
+  double density = (std::max)(8./(rows*cols), 0.01);
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
+
+  Scalar s1 = internal::random<Scalar>();
+  {
+    SparseMatrixType m(rows, cols);
+    DenseMatrix refMat = DenseMatrix::Zero(rows, cols);
+    initSparse<Scalar>(density, refMat, m);
+
+    VERIFY_IS_APPROX(m, refMat);
+
+    // test InnerIterators and Block expressions
+    for (int t=0; t<10; ++t)
+    {
+      Index j = internal::random<Index>(0,cols-2);
+      Index i = internal::random<Index>(0,rows-2);
+      Index w = internal::random<Index>(1,cols-j);
+      Index h = internal::random<Index>(1,rows-i);
+
+      VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w));
+      for(Index c=0; c<w; c++)
+      {
+        VERIFY_IS_APPROX(m.block(i,j,h,w).col(c), refMat.block(i,j,h,w).col(c));
+        for(Index r=0; r<h; r++)
+        {
+          VERIFY_IS_APPROX(m.block(i,j,h,w).col(c).coeff(r), refMat.block(i,j,h,w).col(c).coeff(r));
+          VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
+        }
+      }
+      for(Index r=0; r<h; r++)
+      {
+        VERIFY_IS_APPROX(m.block(i,j,h,w).row(r), refMat.block(i,j,h,w).row(r));
+        for(Index c=0; c<w; c++)
+        {
+          VERIFY_IS_APPROX(m.block(i,j,h,w).row(r).coeff(c), refMat.block(i,j,h,w).row(r).coeff(c));
+          VERIFY_IS_APPROX(m.block(i,j,h,w).coeff(r,c), refMat.block(i,j,h,w).coeff(r,c));
+        }
+      }
+      
+      VERIFY_IS_APPROX(m.middleCols(j,w), refMat.middleCols(j,w));
+      VERIFY_IS_APPROX(m.middleRows(i,h), refMat.middleRows(i,h));
+      for(Index r=0; r<h; r++)
+      {
+        VERIFY_IS_APPROX(m.middleCols(j,w).row(r), refMat.middleCols(j,w).row(r));
+        VERIFY_IS_APPROX(m.middleRows(i,h).row(r), refMat.middleRows(i,h).row(r));
+        for(Index c=0; c<w; c++)
+        {
+          VERIFY_IS_APPROX(m.col(c).coeff(r), refMat.col(c).coeff(r));
+          VERIFY_IS_APPROX(m.row(r).coeff(c), refMat.row(r).coeff(c));
+          
+          VERIFY_IS_APPROX(m.middleCols(j,w).coeff(r,c), refMat.middleCols(j,w).coeff(r,c));
+          VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
+          if(m.middleCols(j,w).coeff(r,c) != Scalar(0))
+          {
+            VERIFY_IS_APPROX(m.middleCols(j,w).coeffRef(r,c), refMat.middleCols(j,w).coeff(r,c));
+          }
+          if(m.middleRows(i,h).coeff(r,c) != Scalar(0))
+          {
+            VERIFY_IS_APPROX(m.middleRows(i,h).coeff(r,c), refMat.middleRows(i,h).coeff(r,c));
+          }
+        }
+      }
+      for(Index c=0; c<w; c++)
+      {
+        VERIFY_IS_APPROX(m.middleCols(j,w).col(c), refMat.middleCols(j,w).col(c));
+        VERIFY_IS_APPROX(m.middleRows(i,h).col(c), refMat.middleRows(i,h).col(c));
+      }
+    }
+
+    for(Index c=0; c<cols; c++)
+    {
+      VERIFY_IS_APPROX(m.col(c) + m.col(c), (m + m).col(c));
+      VERIFY_IS_APPROX(m.col(c) + m.col(c), refMat.col(c) + refMat.col(c));
+    }
+
+    for(Index r=0; r<rows; r++)
+    {
+      VERIFY_IS_APPROX(m.row(r) + m.row(r), (m + m).row(r));
+      VERIFY_IS_APPROX(m.row(r) + m.row(r), refMat.row(r) + refMat.row(r));
+    }
+  }
+
+  // test innerVector()
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    Index j0 = internal::random<Index>(0,outer-1);
+    Index j1 = internal::random<Index>(0,outer-1);
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0));
+    else
+      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0));
+
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1));
+    else
+      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1));
+
+    SparseMatrixType m3(rows,cols);
+    m3.reserve(VectorXi::Constant(outer,int(inner/2)));
+    for(Index j=0; j<outer; ++j)
+      for(Index k=0; k<(std::min)(j,inner); ++k)
+        m3.insertByOuterInner(j,k) = k+1;
+    for(Index j=0; j<(std::min)(outer, inner); ++j)
+    {
+      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
+      if(j>0)
+        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+    }
+    m3.makeCompressed();
+    for(Index j=0; j<(std::min)(outer, inner); ++j)
+    {
+      VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
+      if(j>0)
+        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+    }
+
+    VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros());
+
+//     m2.innerVector(j0) = 2*m2.innerVector(j1);
+//     refMat2.col(j0) = 2*refMat2.col(j1);
+//     VERIFY_IS_APPROX(m2, refMat2);
+  }
+
+  // test innerVectors()
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    if(internal::random<float>(0,1)>0.5) m2.makeCompressed();
+    Index j0 = internal::random<Index>(0,outer-2);
+    Index j1 = internal::random<Index>(0,outer-2);
+    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
+                       refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0));
+    else
+      VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0),
+                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
+    
+    VERIFY_IS_APPROX(m2, refMat2);
+    
+    VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros());
+    
+    m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0);
+    if(SparseMatrixType::IsRowMajor)
+      refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval();
+    else
+      refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval();
+    
+    VERIFY_IS_APPROX(m2, refMat2);
+  }
+
+  // test generic blocks
+  {
+    DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+    SparseMatrixType m2(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    Index j0 = internal::random<Index>(0,outer-2);
+    Index j1 = internal::random<Index>(0,outer-2);
+    Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0));
+    
+    if(SparseMatrixType::IsRowMajor)
+      VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols),
+                      refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols));
+    else
+      VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0),
+                      refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0));
+      
+    Index i = internal::random<Index>(0,m2.outerSize()-1);
+    if(SparseMatrixType::IsRowMajor) {
+      m2.innerVector(i) = m2.innerVector(i) * s1;
+      refMat2.row(i) = refMat2.row(i) * s1;
+      VERIFY_IS_APPROX(m2,refMat2);
+    } else {
+      m2.innerVector(i) = m2.innerVector(i) * s1;
+      refMat2.col(i) = refMat2.col(i) * s1;
+      VERIFY_IS_APPROX(m2,refMat2);
+    }
+    
+    Index r0 = internal::random<Index>(0,rows-2);
+    Index c0 = internal::random<Index>(0,cols-2);
+    Index r1 = internal::random<Index>(1,rows-r0);
+    Index c1 = internal::random<Index>(1,cols-c0);
+    
+    VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0));
+    VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0));
+    
+    VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0));
+    VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0));
+
+    VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1));
+    VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1));
+  }
+}
+
+void test_sparse_block()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    EIGEN_UNUSED_VARIABLE(r+c);
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(1, 1)) ));
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(8, 8)) ));
+    CALL_SUBTEST_1(( sparse_block(SparseMatrix<double>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_block(SparseMatrix<std::complex<double>, ColMajor>(r, c)) ));
+    CALL_SUBTEST_2(( sparse_block(SparseMatrix<std::complex<double>, RowMajor>(r, c)) ));
+    
+    CALL_SUBTEST_3(( sparse_block(SparseMatrix<double,ColMajor,long int>(r, c)) ));
+    CALL_SUBTEST_3(( sparse_block(SparseMatrix<double,RowMajor,long int>(r, c)) ));
+    
+    r = Eigen::internal::random<int>(1,100);
+    c = Eigen::internal::random<int>(1,100);
+    if(Eigen::internal::random<int>(0,4) == 0) {
+      r = c; // check square matrices in 25% of tries
+    }
+    
+    CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
+    CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+  }
+}
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index f0a4691e3..a078851c3 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -17,9 +17,9 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
   typedef typename Mat::Scalar Scalar;
   typedef typename Mat::StorageIndex StorageIndex;
 
-  DenseRhs refX = dA.lu().solve(db);
+  DenseRhs refX = dA.householderQr().solve(db);
   {
-    Rhs x(b.rows(), b.cols());
+    Rhs x(A.cols(), b.cols());
     Rhs oldb = b;
 
     solver.compute(A);
@@ -94,7 +94,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
   
   // test dense Block as the result and rhs:
   {
-    DenseRhs x(db.rows(), db.cols());
+    DenseRhs x(refX.rows(), refX.cols());
     DenseRhs oldb(db);
     x.setZero();
     x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
@@ -119,7 +119,7 @@ void check_sparse_solving_real_cases(Solver& solver, const typename Solver::Matr
   typedef typename Mat::Scalar Scalar;
   typedef typename Mat::RealScalar RealScalar;
   
-  Rhs x(b.rows(), b.cols());
+  Rhs x(A.cols(), b.cols());
   
   solver.compute(A);
   if (solver.info() != Success)
@@ -230,7 +230,7 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
-  typedef SparseMatrix<Scalar,ColMajor> SpMat;
+  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
 
@@ -304,12 +304,12 @@ template<typename Solver> void check_sparse_spd_determinant(Solver& solver)
 }
 
 template<typename Solver, typename DenseMat>
-int generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag)
+Index generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
 
-  int size = internal::random<int>(1,maxSize);
+  Index size = internal::random<int>(1,maxSize);
   double density = (std::max)(8./(size*size), 0.01);
   
   A.resize(size,size);
@@ -324,7 +324,7 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
-  typedef SparseMatrix<Scalar,ColMajor> SpMat;
+  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
 
@@ -333,7 +333,7 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
   Mat A;
   DenseMatrix dA;
   for (int i = 0; i < g_repeat; i++) {
-    int size = generate_sparse_square_problem(solver, A, dA);
+    Index size = generate_sparse_square_problem(solver, A, dA);
 
     A.makeCompressed();
     DenseVector b = DenseVector::Random(size);
@@ -410,3 +410,53 @@ template<typename Solver> void check_sparse_square_abs_determinant(Solver& solve
   }
 }
 
+template<typename Solver, typename DenseMat>
+void generate_sparse_leastsquare_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+
+  int rows = internal::random<int>(1,maxSize);
+  int cols = internal::random<int>(1,rows);
+  double density = (std::max)(8./(rows*cols), 0.01);
+  
+  A.resize(rows,cols);
+  dA.resize(rows,cols);
+
+  initSparse<Scalar>(density, dA, A, options);
+}
+
+template<typename Solver> void check_sparse_leastsquare_solving(Solver& solver)
+{
+  typedef typename Solver::MatrixType Mat;
+  typedef typename Mat::Scalar Scalar;
+  typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+
+  int rhsCols = internal::random<int>(1,16);
+
+  Mat A;
+  DenseMatrix dA;
+  for (int i = 0; i < g_repeat; i++) {
+    generate_sparse_leastsquare_problem(solver, A, dA);
+
+    A.makeCompressed();
+    DenseVector b = DenseVector::Random(A.rows());
+    DenseMatrix dB(A.rows(),rhsCols);
+    SpMat B(A.rows(),rhsCols);
+    double density = (std::max)(8./(A.rows()*rhsCols), 0.1);
+    initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
+    B.makeCompressed();
+    check_sparse_solving(solver, A, b,  dA, b);
+    check_sparse_solving(solver, A, dB, dA, dB);
+    check_sparse_solving(solver, A, B,  dA, dB);
+    
+    // check only once
+    if(i==0)
+    {
+      b = DenseVector::Zero(A.rows());
+      check_sparse_solving(solver, A, b, dA, b);
+    }
+  }
+}
diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp
index d8815263a..6f7b72167 100644
--- a/test/unalignedassert.cpp
+++ b/test/unalignedassert.cpp
@@ -81,7 +81,7 @@ void construct_at_boundary(int boundary)
 
 void unalignedassert()
 {
-  #if EIGEN_ALIGN_STATICALLY
+#if EIGEN_ALIGN_STATICALLY
   construct_at_boundary<Vector2f>(4);
   construct_at_boundary<Vector3f>(4);
   construct_at_boundary<Vector4f>(16);
@@ -100,7 +100,7 @@ void unalignedassert()
   construct_at_boundary<Vector3cf>(4);
   construct_at_boundary<Vector2cd>(EIGEN_ALIGN_BYTES);
   construct_at_boundary<Vector3cd>(16);
-  #endif
+#endif
 
   check_unalignedassert_good<TestNew1>();
   check_unalignedassert_good<TestNew2>();
@@ -112,11 +112,12 @@ void unalignedassert()
   check_unalignedassert_good<Depends<true> >();
 
 #if EIGEN_ALIGN_STATICALLY
-  if(EIGEN_ALIGN_BYTES==16)
+  if(EIGEN_ALIGN_BYTES>=16)
   {
     VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8));
     VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8));
     VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8));
+    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4i>(8));
   }
   for(int b=8; b<EIGEN_ALIGN_BYTES; b+=8)
   {
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index 2f839cf51..97477072a 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -214,7 +214,7 @@ template<typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vectori
             >(DefaultTraversal,CompleteUnrolling)));
 
     VERIFY((test_assign(Matrix11(), Matrix<Scalar,PacketSize,EIGEN_PLAIN_ENUM_MIN(2,PacketSize)>()*Matrix<Scalar,EIGEN_PLAIN_ENUM_MIN(2,PacketSize),PacketSize>(),
-                        PacketSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD?DefaultTraversal:InnerVectorizedTraversal, CompleteUnrolling)));
+                        InnerVectorizedTraversal, CompleteUnrolling)));
     #endif
 
     VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3),
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index e06f1238b..6faf4585d 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -9,3 +9,4 @@ install(FILES
   )
 
 add_subdirectory(src)
+add_subdirectory(CXX11)
\ No newline at end of file
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
new file mode 100644
index 000000000..f1d9f0482
--- /dev/null
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry)
+
+install(FILES
+  ${Eigen_CXX11_HEADERS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
+  )
+
+add_subdirectory(src)
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
new file mode 100644
index 000000000..d90ee1b0f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(Core)
+add_subdirectory(Tensor)
+add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt
new file mode 100644
index 000000000..28571dcb9
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt
new file mode 100644
index 000000000..1e3b14712
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_CXX11_Core_util_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel
+  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
new file mode 100644
index 000000000..6d4b3ea0d
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_CXX11_Tensor_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
+  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index b2b28826a..ca6681a1f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -526,48 +526,101 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    TensorLayoutSwapOp<Derived>
+    const TensorLayoutSwapOp<const Derived>
     swap_layout() const {
+      return TensorLayoutSwapOp<const Derived>(derived());
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorLayoutSwapOp<Derived>
+    swap_layout() {
       return TensorLayoutSwapOp<Derived>(derived());
     }
+
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConcatenationOp<const Axis, const Derived, const OtherDerived>
+    concatenate(const OtherDerived& other, const Axis& axis) const {
+      return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis);
+    }
     template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorConcatenationOp<const Axis, Derived, OtherDerived>
-    concatenate(const OtherDerived& other, const Axis& axis) const {
-      return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other.derived(), axis);
+    concatenate(const OtherDerived& other, const Axis& axis) {
+      return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis);
+    }
+
+    template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReshapingOp<const NewDimensions, const Derived>
+    reshape(const NewDimensions& newDimensions) const {
+      return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
     }
     template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorReshapingOp<const NewDimensions, Derived>
-    reshape(const NewDimensions& newDimensions) const {
+    reshape(const NewDimensions& newDimensions) {
       return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions);
     }
+
+    template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+    slice(const StartIndices& startIndices, const Sizes& sizes) const {
+      return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+    }
     template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorSlicingOp<const StartIndices, const Sizes, Derived>
-    slice(const StartIndices& startIndices, const Sizes& sizes) const {
+    slice(const StartIndices& startIndices, const Sizes& sizes) {
       return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
     }
+
     template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    TensorChippingOp<DimId, Derived>
+    const TensorChippingOp<DimId, const Derived>
     chip(const Index offset) const {
+      return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+    }
+    template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorChippingOp<DimId, Derived>
+    chip(const Index offset) {
       return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
     }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<Dynamic, const Derived>
+    chip(const Index offset, const Index dim) const {
+      return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+    }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorChippingOp<Dynamic, Derived>
-    chip(const Index offset, const Index dim) const {
+    chip(const Index offset, const Index dim) {
       return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
     }
+
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReverseOp<const ReverseDimensions, const Derived>
+    reverse(const ReverseDimensions& rev) const {
+      return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+    }
     template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorReverseOp<const ReverseDimensions, Derived>
-    reverse(const ReverseDimensions& rev) const {
+    reverse(const ReverseDimensions& rev) {
       return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev);
     }
+
+    template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorShufflingOp<const Shuffle, const Derived>
+    shuffle(const Shuffle& shuffle) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorShufflingOp<const Shuffle, Derived>
-    shuffle(const Shuffle& shuffle) const {
+    shuffle(const Shuffle& shuffle) {
       return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
     }
+
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingOp<const Strides, const Derived>
+    stride(const Strides& strides) const {
+      return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+    }
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorStridingOp<const Strides, Derived>
-    stride(const Strides& strides) const {
+    stride(const Strides& strides) {
       return TensorStridingOp<const Strides, Derived>(derived(), strides);
     }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 649bdb308..7a67c56b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -21,8 +21,7 @@ namespace Eigen {
   * Example:
   *    C.device(EIGEN_GPU) = A + B;
   *
-  * Todo: thread pools.
-  * Todo: operator +=, -=, *= and so on.
+  * Todo: operator *= and /=.
   */
 
 template <typename ExpressionType, typename DeviceType> class TensorDevice {
@@ -50,6 +49,18 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
+      Difference difference(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Difference> Assign;
+      Assign assign(m_expression, difference);
+      static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
+      internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const DeviceType& m_device;
     ExpressionType& m_expression;
@@ -82,6 +93,18 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
+      Difference difference(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Difference> Assign;
+      Assign assign(m_expression, difference);
+      static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
+      internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const ThreadPoolDevice& m_device;
     ExpressionType& m_expression;
@@ -114,6 +137,18 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
+      Difference difference(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Difference> Assign;
+      Assign assign(m_expression, difference);
+      static const bool Vectorize = TensorEvaluator<const Assign, GpuDevice>::PacketAccess;
+      internal::TensorExecutor<const Assign, GpuDevice, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const GpuDevice& m_device;
     ExpressionType m_expression;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 38586d067..25f085a59 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -77,7 +77,7 @@ template <typename T> struct MeanReducer
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits<Packet>::size);
+    return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
   }
 
   protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index a9d0f6c39..bdc6ddb87 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -30,14 +30,14 @@ std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccesso
   typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
   typedef typename T::Index Index;
   typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
-  const Index total_size = internal::array_prod(tensor.dimensions());
+  const Index total_size = tensor.dimensions().TotalSize();
 
   // Print the tensor as a 1d vector or a 2d matrix.
   if (internal::array_size<Dimensions>::value == 1) {
     Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
     os << array;
   } else {
-    const Index first_dim = tensor.dimensions()[0];
+    const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
     static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
     Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
     os << matrix;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 9a35b044d..0745b1742 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -65,7 +65,7 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_> >
   static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
   enum {
     Options = Options_,
-    Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit,
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit),
   };
 };
 
@@ -80,7 +80,7 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> >
   static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
   enum {
     Options = Options_,
-    Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit,
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit),
   };
 };
 
@@ -97,7 +97,7 @@ struct traits<TensorMap<PlainObjectType, Options_> >
   static const int Layout = BaseTraits::Layout;
   enum {
     Options = Options_,
-    Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+    Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
   };
 };
 
@@ -113,7 +113,7 @@ struct traits<TensorRef<PlainObjectType> >
   static const int Layout = BaseTraits::Layout;
   enum {
     Options = BaseTraits::Options,
-    Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+    Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
   };
 };
 
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
new file mode 100644
index 000000000..6e871a8da
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
@@ -0,0 +1,8 @@
+FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_CXX11_TensorSymmetry_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel
+  )
+
+add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
new file mode 100644
index 000000000..dc9fc78ec
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_CXX11_TensorSymmetry_util_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel
+  )
diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
index 8e42965a3..89036886b 100644
--- a/unsupported/Eigen/MPRealSupport
+++ b/unsupported/Eigen/MPRealSupport
@@ -141,20 +141,32 @@ int main()
     public:
       typedef mpfr::mpreal ResScalar;
       enum {
+        Vectorizable = false,
+        LhsPacketSize = 1,
+        RhsPacketSize = 1,
+        ResPacketSize = 1,
+        NumberOfRegisters = 1,
         nr = 1,
         mr = 1,
         LhsProgress = 1,
         RhsProgress = 1
       };
+      typedef ResScalar LhsPacket;
+      typedef ResScalar RhsPacket;
+      typedef ResScalar ResPacket;
+      
     };
 
-    template<typename Index, bool ConjugateLhs, bool ConjugateRhs>
-    struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,1,1,ConjugateLhs,ConjugateRhs>
+
+
+    template<typename Index, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs>
+    struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,DataMapper,1,1,ConjugateLhs,ConjugateRhs>
     {
       typedef mpfr::mpreal mpreal;
 
       EIGEN_DONT_INLINE
-      void operator()(mpreal* res, Index resStride, const mpreal* blockA, const mpreal* blockB, Index rows, Index depth, Index cols, mpreal alpha,
+      void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB, 
+                      Index rows, Index depth, Index cols, const mpreal& alpha,
                       Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
       {
         if(rows==0 || cols==0 || depth==0)
@@ -170,8 +182,6 @@ int main()
         {
           for(Index j=0; j<cols; ++j)
           {
-            mpreal *C1 = res + j*resStride;
-            
             const mpreal *A = blockA + i*strideA + offsetA;
             const mpreal *B = blockB + j*strideB + offsetB;
             
@@ -183,7 +193,7 @@ int main()
             }
             
             mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_srcptr(), alpha.mpfr_srcptr(), mpreal::get_default_rnd());
-            mpfr_add(C1[i].mpfr_ptr(), C1[i].mpfr_srcptr(), acc1.mpfr_srcptr(),  mpreal::get_default_rnd());
+            mpfr_add(res(i,j).mpfr_ptr(), res(i,j).mpfr_srcptr(), acc1.mpfr_srcptr(),  mpreal::get_default_rnd());
           }
         }
       }
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 25ff4228d..100e617b2 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -18,7 +18,7 @@ namespace Eigen {
 namespace internal 
 {
   template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, Scalar& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value)
   {
     line >> i >> j >> value;
     i--;
@@ -31,7 +31,7 @@ namespace internal
       return false;
   }
   template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, std::complex<Scalar>& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex<Scalar>& value)
   {
     Scalar valR, valI;
     line >> i >> j >> valR >> valI;
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 806ea77b5..f952af752 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -50,7 +50,7 @@ if(MPFR_FOUND)
   include_directories(${MPFR_INCLUDES} ./mpreal)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
   set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-#  ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+ ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
index 186f56ac3..b1ff8aecb 100644
--- a/unsupported/test/cxx11_tensor_comparisons.cpp
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -54,7 +54,7 @@ static void test_equality()
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
-        if (random() < 0.5) {
+        if (internal::random<bool>()) {
           mat2(i,j,k) = mat1(i,j,k);
         }
       }
diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp
index 0ffb02afd..ad9c9da39 100644
--- a/unsupported/test/cxx11_tensor_const.cpp
+++ b/unsupported/test/cxx11_tensor_const.cpp
@@ -13,8 +13,6 @@
 using Eigen::Tensor;
 
 
-
-
 static void test_simple_assign()
 {
   Tensor<int, 3> random(2,3,7);
@@ -33,7 +31,32 @@ static void test_simple_assign()
   }
 }
 
+
+static void test_assign_of_const_tensor()
+{
+  Tensor<int, 3> random(2,3,7);
+  random.setRandom();
+
+  TensorMap<Tensor<const int, 3> > constant1(random.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3> > constant2(random.data(), 2, 3, 7);
+  const TensorMap<Tensor<int, 3> > constant3(random.data(), 2, 3, 7);
+
+  Tensor<int, 2> result1 = constant1.chip(0, 2);
+  Tensor<int, 2> result2 = constant2.chip(0, 2);
+  Tensor<int, 2> result3 = constant3.chip(0, 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL((result1(i,j)), random(i,j,0));
+      VERIFY_IS_EQUAL((result2(i,j)), random(i,j,0));
+      VERIFY_IS_EQUAL((result3(i,j)), random(i,j,0));
+    }
+  }
+}
+
+
 void test_cxx11_tensor_const()
 {
   CALL_SUBTEST(test_simple_assign());
+  CALL_SUBTEST(test_assign_of_const_tensor());
 }
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 695565e9b..8389e9840 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -260,7 +260,7 @@ static void test_type_casting()
   mat1.setRandom();
   mat2.setRandom();
 
-  mat3 = mat1.template cast<double>();
+  mat3 = mat1.cast<double>();
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
@@ -269,7 +269,7 @@ static void test_type_casting()
     }
   }
 
-  mat3 = mat2.template cast<double>();
+  mat3 = mat2.cast<double>();
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 7; ++k) {
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
index aa369f278..c8f105e3d 100644
--- a/unsupported/test/cxx11_tensor_ref.cpp
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -196,6 +196,45 @@ static void test_coeff_ref()
 }
 
 
+static void test_nested_ops_with_ref()
+{
+  Tensor<float, 4> t(2, 3, 5, 7);
+  t.setRandom();
+  TensorMap<Tensor<const float, 4> > m(t.data(), 2, 3, 5, 7);
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+  DSizes<Eigen::DenseIndex, 4> shuffle_dims(0, 1, 2, 3);
+  TensorRef<Tensor<const float, 4> > ref(m.pad(paddings));
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> trivial;
+  trivial[0] = std::make_pair(0, 0);
+  trivial[1] = std::make_pair(0, 0);
+  trivial[2] = std::make_pair(0, 0);
+  trivial[3] = std::make_pair(0, 0);
+  Tensor<float, 4> padded = ref.shuffle(shuffle_dims).pad(trivial);
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), t(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+
 void test_cxx11_tensor_ref()
 {
   CALL_SUBTEST(test_simple_lvalue_ref());
@@ -205,4 +244,5 @@ void test_cxx11_tensor_ref()
   CALL_SUBTEST(test_ref_of_ref());
   CALL_SUBTEST(test_ref_in_expr());
   CALL_SUBTEST(test_coeff_ref());
+  CALL_SUBTEST(test_nested_ops_with_ref());
 }
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
index dddda7dd9..7d6f4e79f 100644
--- a/unsupported/test/mpreal/mpreal.h
+++ b/unsupported/test/mpreal/mpreal.h
@@ -57,7 +57,8 @@
 #include <limits>
 
 // Options
-#define MPREAL_HAVE_INT64_SUPPORT               // Enable int64_t support if possible. Available only for MSVC 2010 & GCC. 
+// FIXME HAVE_INT64_SUPPORT leads to clashes with long int and int64_t on some systems.
+//#define MPREAL_HAVE_INT64_SUPPORT               // Enable int64_t support if possible. Available only for MSVC 2010 & GCC.
 #define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
 #define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
                                                 // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.