From 4084dce038ea8b1011be6068b68e3aa10523ac07 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Thu, 26 Feb 2015 16:50:09 -0500 Subject: [PATCH 01/83] Added CMake support for Tensor module. CMake now installs CXX11 Tensor module like the rest of the unsupported modules --- unsupported/Eigen/CMakeLists.txt | 1 + unsupported/Eigen/CXX11/CMakeLists.txt | 8 ++++++++ unsupported/Eigen/CXX11/src/CMakeLists.txt | 3 +++ unsupported/Eigen/CXX11/src/Core/CMakeLists.txt | 1 + unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt | 6 ++++++ unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt | 8 ++++++++ .../Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt | 6 ++++++ 8 files changed, 39 insertions(+) create mode 100644 unsupported/Eigen/CXX11/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/Core/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt index e06f1238b..6faf4585d 100644 --- a/unsupported/Eigen/CMakeLists.txt +++ b/unsupported/Eigen/CMakeLists.txt @@ -9,3 +9,4 @@ install(FILES ) add_subdirectory(src) +add_subdirectory(CXX11) \ No newline at end of file diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt new file mode 100644 index 000000000..f1d9f0482 --- /dev/null +++ b/unsupported/Eigen/CXX11/CMakeLists.txt @@ -0,0 +1,8 @@ +set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry) + +install(FILES + ${Eigen_CXX11_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel + ) + +add_subdirectory(src) diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt new file mode 100644 index 000000000..d90ee1b0f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(Core) +add_subdirectory(Tensor) +add_subdirectory(TensorSymmetry) diff --git a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt new file mode 100644 index 000000000..28571dcb9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt new file mode 100644 index 000000000..1e3b14712 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_Core_util_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt new file mode 100644 index 000000000..6d4b3ea0d --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_Tensor_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt 
new file mode 100644 index 000000000..6e871a8da --- /dev/null +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt @@ -0,0 +1,8 @@ +FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_TensorSymmetry_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel + ) + +add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt new file mode 100644 index 000000000..dc9fc78ec --- /dev/null +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_TensorSymmetry_util_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel + ) From b10cd3afd2ba85d0e8bca68bf0e716391e185837 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 27 Feb 2015 22:38:00 +0100 Subject: [PATCH 02/83] Re-enbale detection of min/max parentheses protection, and re-enable mpreal_support unit test. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- test/main.h | 10 ++++++++-- unsupported/test/CMakeLists.txt | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9061fd936..1f57b6de0 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -164,7 +164,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. - if(std::max(k,std::max(m,n))<48) + if((std::max)(k,(std::max)(m,n))<48) return; typedef typename Traits::ResScalar ResScalar; diff --git a/test/main.h b/test/main.h index 1f937690c..ecf0c6924 100644 --- a/test/main.h +++ b/test/main.h @@ -42,13 +42,19 @@ #include #include #include +#if __cplusplus >= 201103L +#include +#ifdef EIGEN_USE_THREADS +#include +#endif +#endif // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
-//#define min(A,B) please_protect_your_min_with_parentheses -//#define max(A,B) please_protect_your_max_with_parentheses +#define min(A,B) please_protect_your_min_with_parentheses +#define max(A,B) please_protect_your_max_with_parentheses #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 8542dd853..9eabfb620 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -50,7 +50,7 @@ if(MPFR_FOUND) include_directories(${MPFR_INCLUDES} ./mpreal) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) -# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) + ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() From 9aee1e300ad523d6bdc45a7592416ce96df9a82f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 27 Feb 2015 22:55:12 +0100 Subject: [PATCH 03/83] Increase unit-test L1 cache size to ensure we are doing at least 2 peeled loop within product kernel. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1f57b6de0..0fc7289be 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -155,7 +155,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // In unit tests we do not want to use extra large matrices, // so we reduce the cache size to check the blocking strategy is not flawed #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS - l1 = 4*1024; + l1 = 9*1024; l2 = 32*1024; l3 = 512*1024; #endif From 0f82a1d7b7a17302e8dceb370864ba42a6a13102 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 14:52:26 +0100 Subject: [PATCH 04/83] bug #967: Automatically add cxx11 suffix when building in C++11 mode --- cmake/EigenTesting.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index b4ab95dbc..f8aec777d 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -502,6 +502,10 @@ macro(ei_set_build_string) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit) endif() + if(EIGEN_TEST_CXX11) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11) + endif() + if(EIGEN_BUILD_STRING_SUFFIX) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX}) endif() From 33f40b2883630b5ddec3bab5730691b40120faa6 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 14:53:11 +0100 Subject: [PATCH 05/83] Cygwin does not like weak linking either. 
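
Aside on the mechanism this patch touches: blas/xerbla.cpp provides Eigen's default BLAS error handler, and on GCC-compatible toolchains it is emitted as a weak symbol so that a user-supplied, strongly linked definition overrides it at link time. The hunk below merely adds __CYGWIN__ to the platforms where the attribute has to be dropped. A minimal sketch of the pattern follows; the handler name and signature here are illustrative only, not the verbatim contents of blas/xerbla.cpp:

  #if (defined __GNUC__) && (!defined __MINGW32__) && (!defined __CYGWIN__)
  #define EIGEN_WEAK_LINKING __attribute__ ((weak))
  #else
  #define EIGEN_WEAK_LINKING
  #endif

  extern "C" {
  // Weak default: a strong definition provided by the user wins at link time.
  // Where weak symbols are unsupported (MinGW, and with this patch Cygwin),
  // this becomes an ordinary definition, and a duplicate user definition
  // would instead be a multiple-definition link error.
  EIGEN_WEAK_LINKING void example_blas_error_handler(const char* /*msg*/)
  {
    // default behaviour: ignore the error
  }
  }  // extern "C"
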
--- blas/xerbla.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index 8775b88cd..c373e8699 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -1,7 +1,7 @@ #include -#if (defined __GNUC__) && (!defined __MINGW32__) +#if (defined __GNUC__) && (!defined __MINGW32__) && (!defined __CYGWIN__) #define EIGEN_WEAK_LINKING __attribute__ ((weak)) #else #define EIGEN_WEAK_LINKING From 682196e9fcdef7fb329fde833e30e91bb3f89077 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 16:41:00 +0100 Subject: [PATCH 06/83] Fixed MPRealSupport --- unsupported/Eigen/MPRealSupport | 22 ++++++++++++++++------ unsupported/test/mpreal/mpreal.h | 3 ++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport index 8e42965a3..89036886b 100644 --- a/unsupported/Eigen/MPRealSupport +++ b/unsupported/Eigen/MPRealSupport @@ -141,20 +141,32 @@ int main() public: typedef mpfr::mpreal ResScalar; enum { + Vectorizable = false, + LhsPacketSize = 1, + RhsPacketSize = 1, + ResPacketSize = 1, + NumberOfRegisters = 1, nr = 1, mr = 1, LhsProgress = 1, RhsProgress = 1 }; + typedef ResScalar LhsPacket; + typedef ResScalar RhsPacket; + typedef ResScalar ResPacket; + }; - template - struct gebp_kernel + + + template + struct gebp_kernel { typedef mpfr::mpreal mpreal; EIGEN_DONT_INLINE - void operator()(mpreal* res, Index resStride, const mpreal* blockA, const mpreal* blockB, Index rows, Index depth, Index cols, mpreal alpha, + void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB, + Index rows, Index depth, Index cols, const mpreal& alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0) { if(rows==0 || cols==0 || depth==0) @@ -170,8 +182,6 @@ int main() { for(Index j=0; j // Options -#define MPREAL_HAVE_INT64_SUPPORT // Enable int64_t support if possible. Available only for MSVC 2010 & GCC. +// FIXME HAVE_INT64_SUPPORT leads to clashes with long int and int64_t on some systems. +//#define MPREAL_HAVE_INT64_SUPPORT // Enable int64_t support if possible. Available only for MSVC 2010 & GCC. #define MPREAL_HAVE_MSVC_DEBUGVIEW // Enable Debugger Visualizer for "Debug" builds in MSVC. #define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS // Enable extended std::numeric_limits specialization. // Meaning that "digits", "round_style" and similar members are defined as functions, not constants. From 73dd95e7b03b4451ddf5cee17f4b44f2a7d6d91d Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 16:51:53 +0100 Subject: [PATCH 07/83] Use @CMAKE_MAKE_PROGRAM@ instead of make in buildtests.sh --- scripts/buildtests.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/buildtests.in b/scripts/buildtests.in index 7026373cf..d2fd10276 100755 --- a/scripts/buildtests.in +++ b/scripts/buildtests.in @@ -14,9 +14,9 @@ targets_to_make=`echo "$TESTSLIST" | egrep "$1" | xargs echo` if [ -n "${EIGEN_MAKE_ARGS:+x}" ] then - make $targets_to_make ${EIGEN_MAKE_ARGS} + @CMAKE_MAKE_PROGRAM@ $targets_to_make ${EIGEN_MAKE_ARGS} else - make $targets_to_make + @CMAKE_MAKE_PROGRAM@ $targets_to_make @EIGEN_TEST_BUILD_FLAGS@ fi exit $? 
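
Aside on the convention exercised earlier in this series: patch 02 re-enables the min()/max() macro traps in test/main.h, and its companion hunk in GeneralBlockPanelKernel.h writes (std::max)(k,(std::max)(m,n)) instead of std::max(k,std::max(m,n)). A short self-contained illustration of why the extra parentheses matter, assuming a platform-style function-like max macro (as windows.h defines without NOMINMAX):

  #include <algorithm>

  // Simulate a platform header (or test/main.h after patch 02) that defines a
  // function-like max macro.
  #define max(A,B) please_protect_your_max_with_parentheses

  int largest_dimension(int k, int m, int n)
  {
    // return std::max(k, std::max(m, n));  // the macro expands here -> compile error
    // Wrapping the name in parentheses means 'max' is not followed by '(',
    // so the function-like macro is not invoked and std::max is called:
    return (std::max)(k, (std::max)(m, n));
  }
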
From 31e2ffe82ce2851521db6f8280ff2e91a159d35e Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 18:39:37 +0100 Subject: [PATCH 08/83] Replaced POSIX random() by internal::random --- unsupported/test/cxx11_tensor_comparisons.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp index 186f56ac3..b1ff8aecb 100644 --- a/unsupported/test/cxx11_tensor_comparisons.cpp +++ b/unsupported/test/cxx11_tensor_comparisons.cpp @@ -54,7 +54,7 @@ static void test_equality() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - if (random() < 0.5) { + if (internal::random()) { mat2(i,j,k) = mat1(i,j,k); } } From ef09ce4552daa4bee80e910f6a1d151e84240a2f Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Sat, 28 Feb 2015 21:30:31 -0500 Subject: [PATCH 09/83] Fix for TensorIO for Fixed sized Tensors. The following code snippet was failing to compile: TensorFixedSize > t_4x3; cout << 4x3; --- unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h index a9d0f6c39..bdc6ddb87 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -30,14 +30,14 @@ std::ostream& operator << (std::ostream& os, const TensorBase::type Scalar; typedef typename T::Index Index; typedef typename TensorEvaluator, DefaultDevice>::Dimensions Dimensions; - const Index total_size = internal::array_prod(tensor.dimensions()); + const Index total_size = tensor.dimensions().TotalSize(); // Print the tensor as a 1d vector or a 2d matrix. if (internal::array_size::value == 1) { Map > array(const_cast(tensor.data()), total_size); os << array; } else { - const Index first_dim = tensor.dimensions()[0]; + const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); static const int layout = TensorEvaluator, DefaultDevice>::Layout; Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); os << matrix; From 3109f0e74e9d4d4b16409cbd6de6dc297a6b9e05 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 2 Mar 2015 20:09:33 +0100 Subject: [PATCH 10/83] Add SSE vectorization of Quaternion::conjugate. 
Significant speed-up when combined with products like q1*q2.conjugate() --- Eigen/src/Geometry/Quaternion.h | 17 ++++++++-- Eigen/src/Geometry/arch/Geometry_SSE.h | 46 ++++++++++++++++++++------ 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index e1ad803bb..8c44df699 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -441,7 +441,7 @@ QuaternionBase::operator* (const QuaternionBase& other) c YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) return internal::quat_product::Scalar, - internal::traits::IsAligned && internal::traits::IsAligned>::run(*this, other); + (internal::traits::IsAligned && internal::traits::IsAligned)?Aligned:Unaligned>::run(*this, other); } /** \sa operator*(Quaternion) */ @@ -646,6 +646,16 @@ inline Quaternion::Scalar> QuaternionBase struct quat_conj +{ + static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& q){ + return Quaternion(q.w(),-q.x(),-q.y(),-q.z()); + } +}; +} + /** \returns the conjugate of the \c *this which is equal to the multiplicative inverse * if the quaternion is normalized. * The conjugate of a quaternion represents the opposite rotation. @@ -656,7 +666,10 @@ template inline Quaternion::Scalar> QuaternionBase::conjugate() const { - return Quaternion(this->w(),-this->x(),-this->y(),-this->z()); + return internal::quat_conj::Scalar, + internal::traits::IsAligned?Aligned:Unaligned>::run(*this); + } /** \returns the angle (in radian) between two rotations diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h index 3d8284f2d..e59c32c56 100644 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -20,23 +20,35 @@ struct quat_product { static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000)); Quaternion res; + const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); __m128 a = _a.coeffs().template packet(0); __m128 b = _b.coeffs().template packet(0); - __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2), - vec4f_swizzle1(b,2,0,1,2)),mask); - __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1), - vec4f_swizzle1(b,0,1,2,1)),mask); + __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); pstore(&res.x(), _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), vec4f_swizzle1(b,1,2,0,0))), - _mm_add_ps(flip1,flip2))); + _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); + return res; } }; +template +struct quat_conj +{ + static inline Quaternion run(const QuaternionBase& q) + { + Quaternion res; + const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); + pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet(0))); + return res; + } +}; + + template struct cross3_impl { @@ -56,8 +68,8 @@ struct cross3_impl -template -struct quat_product +template +struct quat_product { static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { @@ -66,8 +78,8 @@ struct quat_product Quaternion res; const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); + Packet2d b_xy = _b.coeffs().template packet(0); + Packet2d b_zw = _b.coeffs().template packet(2); Packet2d 
a_xx = pset1(a[0]); Packet2d a_yy = pset1(a[1]); Packet2d a_zz = pset1(a[2]); @@ -108,6 +120,20 @@ struct quat_product } }; +template +struct quat_conj +{ + static inline Quaternion run(const QuaternionBase& q) + { + Quaternion res; + const __m128d mask0 = _mm_setr_pd(-0.,-0.); + const __m128d mask2 = _mm_setr_pd(-0.,0.); + pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet(0))); + pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet(2))); + return res; + } +}; + } // end namespace internal } // end namespace Eigen From 1ec0f4fadf1b8d95fb1506e87112d0c7888afd95 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 2 Mar 2015 16:08:54 -0500 Subject: [PATCH 11/83] HalfPacket also needed to be disabled for double, on ARMv8. --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 8dd1e1370..d0544bdf1 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -541,12 +541,12 @@ typedef float64x1_t Packet1d; template<> struct packet_traits : default_packet_traits { typedef Packet2d type; - typedef Packet1d half; + typedef Packet2d half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket=1, + HasHalfPacket=0, HasDiv = 1, // FIXME check the Has* From 9930e9583b336edda281a4490cfe69c53082318e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 2 Mar 2015 18:08:38 -0500 Subject: [PATCH 12/83] Improve analyze-blocking-sizes, and in particular give it a evaluate-defaults tool that shows the efficiency of Eigen's default blocking sizes choices, using a previously computed table from benchmark-blocking-sizes. --- bench/analyze-blocking-sizes.cpp | 232 +++++++++++++++++++++++++------ 1 file changed, 186 insertions(+), 46 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index a603c0216..f507eab38 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -7,6 +7,10 @@ #include #include #include +#include +#include + +#include using namespace std; @@ -88,6 +92,11 @@ struct preprocessed_inputfile_entry_t float efficiency; }; +bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2) +{ + return e1.efficiency < e2.efficiency; +} + struct preprocessed_inputfile_t { string filename; @@ -394,17 +403,187 @@ void print_partition( cout << endl; } +struct action_t +{ + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run(const vector& preprocessed_inputfiles) const { abort(); } + virtual ~action_t() {} +}; + +struct partition_action_t : action_t +{ + virtual const char* invokation_name() const { return "partition"; } + virtual void run(const vector& preprocessed_inputfiles) const + { + check_all_files_in_same_exact_order(preprocessed_inputfiles); + + float required_efficiency_to_beat = 0.0f; + vector>> partitions; + cerr << "searching for partitions...\r" << flush; + while (true) + { + vector> partition; + find_partition_with_efficiency_higher_than( + preprocessed_inputfiles, + required_efficiency_to_beat, + partition); + float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); + cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size() + << " subsets for " << 100.0f * actual_efficiency + << " % efficiency" + << " \r" << flush; + partitions.push_back(partition); + if 
(partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) { + break; + } + required_efficiency_to_beat = actual_efficiency; + } + cerr << " " << endl; + while (true) { + bool repeat = false; + for (size_t i = 0; i < partitions.size() - 1; i++) { + if (partitions[i].size() >= partitions[i+1].size()) { + partitions.erase(partitions.begin() + i); + repeat = true; + break; + } + } + if (!repeat) { + break; + } + } + for (auto it = partitions.begin(); it != partitions.end(); ++it) { + print_partition(preprocessed_inputfiles, *it); + } + } +}; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + size_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } + bool is_cubic() const { return k == m && m == n; } +}; + +struct evaluate_defaults_action_t : action_t +{ + virtual const char* invokation_name() const { return "evaluate-defaults"; } + virtual void run(const vector& preprocessed_inputfiles) const + { + if (preprocessed_inputfiles.size() > 1) { + cerr << invokation_name() << " only works with one input file." << endl; + exit(1); + } + + const preprocessed_inputfile_t& preprocessed_inputfile = preprocessed_inputfiles.front(); + + uint16_t product_size = 0; + uint16_t default_block_size = 0; + vector results, cubic_results; + for (auto it = preprocessed_inputfile.entries.begin(); it != preprocessed_inputfile.entries.end(); ++it) { + if (it->product_size != product_size) { + product_size = it->product_size; + size_triple_t product_size_triple(product_size); + Eigen::Index k = product_size_triple.k, + m = product_size_triple.m, + n = product_size_triple.n; + Eigen::internal::computeProductBlockingSizes(k, m, n); + default_block_size = compact_size_triple(k, m, n); + } + if (it->block_size == default_block_size) { + results.push_back(*it); + if (size_triple_t(product_size).is_cubic()) { + cubic_results.push_back(*it); + } + } + } + + cerr << "Below are all results - first column is product size tripe, " << endl + << "second column is block size triple, in hex kmn form where" << endl + << "k, m, n are the log2 of the actual values, i.e. a89 means" << endl + << "k=1024, m=256, n=512." 
<< endl << endl; + for (auto it = results.begin(); it != results.end(); ++it) { + cerr << hex << it->product_size << " " << it->block_size + << " efficiency " << std::dec << 100.0f * it->efficiency << " %" << endl; + } + cerr << endl; + sort(results.begin(), results.end(), lower_efficiency); + sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); + cerr << "Efficiency summary: min = " + << 100.0f * results.front().efficiency << " %, max = " + << 100.0f * results.back().efficiency << " %, median = " + << 100.0f * results[results.size() / 2].efficiency << " %" << endl; + cerr << "20% of product sizes have efficiency <= " << 100.0f * results[results.size() * 20 / 100].efficiency << " %" << endl; + cerr << "10% of product sizes have efficiency <= " << 100.0f * results[results.size() * 10 / 100].efficiency << " %" << endl; + cerr << "5% of product sizes have efficiency <= " << 100.0f * results[results.size() * 5 / 100].efficiency << " %" << endl; + cerr << "Cubic sizes efficiency summary: min = " + << 100.0f * cubic_results.front().efficiency << " %, max = " + << 100.0f * cubic_results.back().efficiency << " %, median = " + << 100.0f * cubic_results[cubic_results.size() / 2].efficiency << " %" << endl; + + } +}; + + +void show_usage_and_exit(int argc, char* argv[], + const vector>& available_actions) +{ + cerr << "usage: " << argv[0] << " " << endl; + cerr << "available actions:" << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl; + exit(1); +} + int main(int argc, char* argv[]) { - if (argc == 1) { - cerr << "usage: " << argv[0] << " [input files]" << endl; - cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl; - exit(1); - } cout.precision(3); cerr.precision(3); + + vector> available_actions; + available_actions.emplace_back(new partition_action_t); + available_actions.emplace_back(new evaluate_defaults_action_t); + + auto action = available_actions.end(); + + if (argc <= 2) { + show_usage_and_exit(argc, argv, available_actions); + } + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[1], (*it)->invokation_name())) { + action = it; + break; + } + } + + if (action == available_actions.end()) { + show_usage_and_exit(argc, argv, available_actions); + } + vector inputfilenames; - for (int i = 1; i < argc; i++) { + for (int i = 2; i < argc; i++) { inputfilenames.emplace_back(argv[i]); } @@ -413,44 +592,5 @@ int main(int argc, char* argv[]) preprocessed_inputfiles.emplace_back(inputfile_t(*it)); } - check_all_files_in_same_exact_order(preprocessed_inputfiles); - - float required_efficiency_to_beat = 0.0f; - vector>> partitions; - cerr << "searching for partitions...\r" << flush; - while (true) - { - vector> partition; - find_partition_with_efficiency_higher_than( - preprocessed_inputfiles, - required_efficiency_to_beat, - partition); - float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); - cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size() - << " subsets for " << 100.0f * actual_efficiency - << " % efficiency" - << " \r" << flush; - partitions.push_back(partition); - if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) { - break; - } - required_efficiency_to_beat = actual_efficiency; - } - cerr << " " << 
endl; - while (true) { - bool repeat = false; - for (size_t i = 0; i < partitions.size() - 1; i++) { - if (partitions[i].size() >= partitions[i+1].size()) { - partitions.erase(partitions.begin() + i); - repeat = true; - break; - } - } - if (!repeat) { - break; - } - } - for (auto it = partitions.begin(); it != partitions.end(); ++it) { - print_partition(preprocessed_inputfiles, *it); - } + (*action)->run(preprocessed_inputfiles); } From f8390995127f9f73f2376c43f93eaa27bbad3675 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 09:35:22 -0500 Subject: [PATCH 13/83] Work around an ICE in Clang 3.5 in the iOS toolchain with double NEON intrinsics. --- Eigen/src/Core/arch/NEON/PacketMath.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d0544bdf1..ce0abfd80 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -518,7 +518,19 @@ ptranspose(PacketBlock& kernel) { } //---------- double ---------- -#if EIGEN_ARCH_ARM64 + +// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double. +// Confirmed at least with __apple_build_version__ = 6000054. +#ifdef __apple_build_version__ +// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed. +// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with +// major toolchain updates. +#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000) +#else +#define EIGEN_APPLE_DOUBLE_NEON_BUG 0 +#endif + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG #if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) // Bug 907: workaround missing declarations of the following two functions in the ADK From ccc1277a42f254b184b750292a7cd672a58bbc63 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 10:17:05 -0500 Subject: [PATCH 14/83] must also disable complex when disabling double vectorization --- Eigen/src/Core/arch/NEON/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 154daa7a7..c7fb12fe8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -272,7 +272,7 @@ ptranspose(PacketBlock& kernel) { } //---------- double ---------- -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000); From 37a93c4263324011242941cff87d444e9c465422 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 3 Mar 2015 17:08:28 +0100 Subject: [PATCH 15/83] New scoring functor to select the pivot. This is can be useful for non-floating point scalars, where choosing the biggest element is generally not the best choice. 
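
Aside: the hunks below introduce two internal hooks, scalar_score_coeff_op (the "score" of a candidate pivot, its absolute value by default) and abs_knowing_score (which recovers |a| cheaply once the winning score is already known). A minimal, hypothetical helper mirroring how FullPivLU consumes them after this patch (the helper itself is a sketch, not part of the patch):

  #include <Eigen/Dense>

  // Rank the entries of a bottom-right corner block by their score, then
  // convert only the winner's score back into an absolute value.
  template <typename MatrixType>
  typename MatrixType::RealScalar best_pivot_abs(const MatrixType& corner)
  {
    typedef typename MatrixType::Scalar Scalar;
    typedef Eigen::internal::scalar_score_coeff_op<Scalar> Scoring;
    typedef typename Scoring::result_type Score;
    Eigen::Index row, col;
    Score score = corner.unaryExpr(Scoring()).maxCoeff(&row, &col);
    return Eigen::internal::abs_knowing_score<Scalar>()(corner(row, col), score);
  }
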
--- Eigen/src/Core/functors/UnaryFunctors.h | 28 ++++++++++++ Eigen/src/LU/FullPivLU.h | 11 +++-- Eigen/src/LU/PartialPivLU.h | 8 ++-- Eigen/src/QR/FullPivHouseholderQR.h | 10 ++-- doc/CustomizingEigen.dox | 61 +++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index ec42e6850..f32f0f113 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -55,6 +55,34 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the score of a scalar, to chose a pivot + * + * \sa class CwiseUnaryOp + */ +template struct scalar_score_coeff_op : scalar_abs_op +{ + typedef void Score_is_abs; +}; +template +struct functor_traits > : functor_traits > {}; + +/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor. */ +template struct abs_knowing_score +{ + EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) + typedef typename NumTraits::Real result_type; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); } +}; +template struct abs_knowing_score::Score_is_abs> +{ + EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) + typedef typename NumTraits::Real result_type; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; } +}; + /** \internal * \brief Template functor to compute the squared absolute value of a scalar * diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 49c8b183d..d1a260a37 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -459,14 +459,16 @@ FullPivLU& FullPivLU::compute(const MatrixType& matrix) // biggest coefficient in the remaining bottom-right corner (starting at row k, col k) Index row_of_biggest_in_corner, col_of_biggest_in_corner; - RealScalar biggest_in_corner; + typedef internal::scalar_score_coeff_op Scoring; + typedef typename Scoring::result_type Score; + Score biggest_in_corner; biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k) - .cwiseAbs() + .unaryExpr(Scoring()) .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner); row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner, col_of_biggest_in_corner += k; // need to add k to them. - if(biggest_in_corner==RealScalar(0)) + if(biggest_in_corner==Score(0)) { // before exiting, make sure to initialize the still uninitialized transpositions // in a sane state without destroying what we already have. @@ -479,7 +481,8 @@ FullPivLU& FullPivLU::compute(const MatrixType& matrix) break; } - if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner; + RealScalar abs_pivot = internal::abs_knowing_score()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner); + if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot; // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). 
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index e57b36bc5..3d8825a97 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -275,6 +275,8 @@ struct partial_lu_impl */ static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { + typedef scalar_score_coeff_op Scoring; + typedef typename Scoring::result_type Score; const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); @@ -286,13 +288,13 @@ struct partial_lu_impl Index rcols = cols-k-1; Index row_of_biggest_in_col; - RealScalar biggest_in_corner - = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col); + Score biggest_in_corner + = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col); row_of_biggest_in_col += k; row_transpositions[k] = PivIndex(row_of_biggest_in_col); - if(biggest_in_corner != RealScalar(0)) + if(biggest_in_corner != Score(0)) { if(k != row_of_biggest_in_col) { diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index 7d5e58d2f..4952fbb46 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -443,13 +443,15 @@ FullPivHouseholderQR& FullPivHouseholderQR::compute(cons for (Index k = 0; k < size; ++k) { Index row_of_biggest_in_corner, col_of_biggest_in_corner; - RealScalar biggest_in_corner; + typedef internal::scalar_score_coeff_op Scoring; + typedef typename Scoring::result_type Score; - biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k) - .cwiseAbs() - .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner); + Score score = m_qr.bottomRightCorner(rows-k, cols-k) + .unaryExpr(Scoring()) + .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner); row_of_biggest_in_corner += k; col_of_biggest_in_corner += k; + RealScalar biggest_in_corner = internal::abs_knowing_score()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score); if(k==0) biggest = biggest_in_corner; // if the corner is negligible, then we have less than full rank, and we can finish early diff --git a/doc/CustomizingEigen.dox b/doc/CustomizingEigen.dox index 0850863aa..cb25f4ec9 100644 --- a/doc/CustomizingEigen.dox +++ b/doc/CustomizingEigen.dox @@ -157,6 +157,67 @@ inline adouble abs2(const adouble& x) { return x*x; } #endif // ADOLCSUPPORT_H \endcode +This other example adds support for the \c mpq_class type from GMP. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types. 
+ +\code +#include +#include +#include + +namespace Eigen { + template struct NumTraits; + template<> struct NumTraits + { + typedef mpq_class Real; + typedef mpq_class NonInteger; + typedef mpq_class Nested; + + static inline Real epsilon() { return 0; } + static inline Real dummy_precision() { return 0; } + + enum { + IsInteger = 0, + IsSigned = 1, + IsComplex = 0, + RequireInitialization = 1, + ReadCost = 6, + AddCost = 150, + MulCost = 100 + }; + }; + + namespace internal { + template<> + struct significant_decimals_impl + { + // Infinite precision when printing + static inline int run() { return 0; } + }; + + template<> struct scalar_score_coeff_op { + struct result_type : boost::totally_ordered1 { + std::size_t len; + result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score() + result_type(mpq_class const& q) : + len(mpz_size(q.get_num_mpz_t())+ + mpz_size(q.get_den_mpz_t())-1) {} + friend bool operator<(result_type x, result_type y) { + // 0 is the worst possible pivot + if (x.len == 0) return y.len > 0; + if (y.len == 0) return false; + // Prefer a pivot with a small representation + return x.len > y.len; + } + friend bool operator==(result_type x, result_type y) { + // Only used to test if the score is 0 + return x.len == y.len; + } + }; + result_type operator()(mpq_class const& x) const { return x; } + }; + } +} +\endcode \sa \ref TopicPreprocessorDirectives From eae8e27b7d92c31fe07d18f0b6d0feded67f7096 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 11:41:21 -0500 Subject: [PATCH 16/83] Add a benchmark-default-sizes action to benchmark-blocking-sizes.cpp --- .../Core/products/GeneralBlockPanelKernel.h | 28 +- bench/analyze-blocking-sizes.cpp | 4 +- bench/benchmark-blocking-sizes.cpp | 249 ++++++++++++------ 3 files changed, 191 insertions(+), 90 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 0fc7289be..fe4ead384 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -86,19 +86,21 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads typedef gebp_traits Traits; #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - EIGEN_UNUSED_VARIABLE(num_threads); - enum { - kr = 8, - mr = Traits::mr, - nr = Traits::nr - }; - k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - if (k > kr) k -= k % kr; - m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - if (m > mr) m -= m % mr; - n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); - if (n > nr) n -= n % nr; - return; + if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { + EIGEN_UNUSED_VARIABLE(num_threads); + enum { + kr = 8, + mr = Traits::mr, + nr = Traits::nr + }; + k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + if (k > kr) k -= k % kr; + m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + if (m > mr) m -= m % mr; + n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + if (n > nr) n -= n % nr; + return; + } #endif // Explanations: diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index f507eab38..d25d72d2f 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -559,8 +559,8 @@ void show_usage_and_exit(int argc, char* argv[], int main(int argc, char* argv[]) { - cout.precision(3); - cerr.precision(3); + cout.precision(4); + cerr.precision(4); vector> available_actions; available_actions.emplace_back(new partition_action_t); diff --git 
a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 04244575a..6d688c0ac 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -3,9 +3,11 @@ #include #include #include +#include +bool eigen_use_specific_block_size; int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; -#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n @@ -82,16 +84,25 @@ struct benchmark_t { uint16_t compact_product_size; uint16_t compact_block_size; + bool use_default_block_size; float gflops; benchmark_t() : compact_product_size(0) , compact_block_size(0) , gflops(0) + , use_default_block_size(false) {} benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(compact_size_triple(bk, bm, bn)) + , use_default_block_size(false) + , gflops(0) + {} + benchmark_t(size_t pk, size_t pm, size_t pn) + : compact_product_size(compact_size_triple(pk, pm, pn)) + , compact_block_size(0) + , use_default_block_size(true) , gflops(0) {} @@ -100,10 +111,12 @@ struct benchmark_t ostream& operator<<(ostream& s, const benchmark_t& b) { - s << hex; - s << b.compact_product_size - << " " << b.compact_block_size; - s << dec; + s << hex << b.compact_product_size << dec; + if (b.use_default_block_size) { + s << " default"; + } else { + s << " " << hex << b.compact_block_size << dec; + } s << " " << b.gflops; return s; } @@ -121,14 +134,18 @@ bool operator<(const benchmark_t& b1, const benchmark_t& b2) void benchmark_t::run() { - // expand our compact benchmark params into proper triples size_triple_t productsizes(compact_product_size); - size_triple_t blocksizes(compact_block_size); - - // feed eigen with our custom blocking params - eigen_block_size_k = blocksizes.k; - eigen_block_size_m = blocksizes.m; - eigen_block_size_n = blocksizes.n; + + if (use_default_block_size) { + eigen_use_specific_block_size = false; + } else { + // feed eigen with our custom blocking params + eigen_use_specific_block_size = true; + size_triple_t blocksizes(compact_block_size); + eigen_block_size_k = blocksizes.k; + eigen_block_size_m = blocksizes.m; + eigen_block_size_n = blocksizes.n; + } // set up the matrix pool @@ -231,9 +248,23 @@ string type_name() return "double"; } -void show_usage_and_exit(const char *progname) +struct action_t { - cerr << "usage: " << progname << " [--min-working-set-size=N]" << endl << endl; + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run() const { abort(); } + virtual ~action_t() {} +}; + +void show_usage_and_exit(int argc, char* argv[], + const vector>& available_actions) +{ + cerr << "usage: " << argv[0] << " [options...]" << endl << endl; + cerr << "available actions:" << endl << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << endl; + cerr << "options:" << endl << endl; cerr << " --min-working-set-size=N:" << endl; cerr << " Set the minimum working set size to N bytes." << endl; cerr << " This is rounded up as needed to a multiple of matrix size." 
<< endl; @@ -245,56 +276,8 @@ void show_usage_and_exit(const char *progname) exit(1); } -int main(int argc, char* argv[]) +void run_benchmarks(vector& benchmarks) { - for (int i = 1; i < argc; i++) { - if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { - const char* equals_sign = strchr(argv[i], '='); - min_working_set_size = strtoul(equals_sign+1, nullptr, 10); - } else { - cerr << "unrecognized option: " << argv[i] << endl << endl; - show_usage_and_exit(argv[0]); - } - } - - cout.precision(4); - - print_cpuinfo(); - - cout << "benchmark parameters:" << endl; - cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; - cout << "scalar type: " << type_name() << endl; - cout << "packet size: " << internal::packet_traits::size << endl; - cout << "minsize = " << minsize << endl; - cout << "maxsize = " << maxsize << endl; - cout << "measurement_repetitions = " << measurement_repetitions << endl; - cout << "min_accurate_time = " << min_accurate_time << endl; - cout << "min_working_set_size = " << min_working_set_size; - if (min_working_set_size == 0) { - cout << " (try to outsize caches)"; - } - cout << endl << endl; - - - // assemble the array of benchmarks without running them at first - vector benchmarks; - for (int repetition = 0; repetition < measurement_repetitions; repetition++) { - for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { - for (size_t msize = minsize; msize <= maxsize; msize *= 2) { - for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { - for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) { - for (size_t mblock = minsize; mblock <= msize; mblock *= 2) { - for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) { - benchmark_t b(ksize, msize, nsize, kblock, mblock, nblock); - benchmarks.push_back(b); - } - } - } - } - } - } - } - // randomly shuffling benchmarks allows us to get accurate enough progress info, // as now the cheap/expensive benchmarks are randomly mixed so they average out. random_shuffle(benchmarks.begin(), benchmarks.end()); @@ -315,14 +298,23 @@ int main(int argc, char* argv[]) if (i > 10) { cerr << ", ETA "; - float eta = float(time_now - time_start) * (1.0f - ratio_done) / ratio_done; - if (eta > 3600) - cerr << eta/3600 << " hours"; - else if (eta > 60) - cerr << eta/60 << " minutes"; - else cerr << eta << " seconds"; + int eta = int(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done); + int eta_remainder = eta; + if (eta_remainder > 3600) { + int hours = eta_remainder / 3600; + cerr << hours << " h "; + eta_remainder -= hours * 3600; + } + if (eta_remainder > 60) { + int minutes = eta_remainder / 60; + cerr << minutes << " min "; + eta_remainder -= minutes * 60; + } + if (eta < 600 && eta_remainder) { + cerr << eta_remainder << " s"; + } } - cerr << " \r" << flush; + cerr << " \r" << flush; } // This is where we actually run a benchmark! @@ -348,9 +340,116 @@ int main(int argc, char* argv[]) } } - // Output data. 
- cout << "BEGIN MEASUREMENTS" << endl; - for (auto it = best_benchmarks.begin(); it != best_benchmarks.end(); ++it) { - cout << *it << endl; - } + // keep and return only the best benchmarks + benchmarks = best_benchmarks; +} + +struct measure_all_pot_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "measure-all-pot-sizes"; } + virtual void run() const + { + vector benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) { + for (size_t mblock = minsize; mblock <= msize; mblock *= 2) { + for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) { + benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock); + } + } + } + } + } + } + } + + run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +struct measure_default_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "measure-default-sizes"; } + virtual void run() const + { + vector benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + benchmarks.emplace_back(ksize, msize, nsize); + } + } + } + } + + run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +int main(int argc, char* argv[]) +{ + cout.precision(4); + cerr.precision(4); + + vector> available_actions; + available_actions.emplace_back(new measure_all_pot_sizes_action_t); + available_actions.emplace_back(new measure_default_sizes_action_t); + + auto action = available_actions.end(); + + if (argc <= 1) { + show_usage_and_exit(argc, argv, available_actions); + } + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[1], (*it)->invokation_name())) { + action = it; + break; + } + } + + if (action == available_actions.end()) { + show_usage_and_exit(argc, argv, available_actions); + } + + for (int i = 2; i < argc; i++) { + if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { + const char* equals_sign = strchr(argv[i], '='); + min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + } else { + cerr << "unrecognized option: " << argv[i] << endl << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + + print_cpuinfo(); + + cout << "benchmark parameters:" << endl; + cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; + cout << "scalar type: " << type_name() << endl; + cout << "packet size: " << internal::packet_traits::size << endl; + cout << "minsize = " << minsize << endl; + cout << "maxsize = " << maxsize << endl; + cout << "measurement_repetitions = " << measurement_repetitions << endl; + cout << "min_accurate_time = " << min_accurate_time << endl; + cout << "min_working_set_size = " << min_working_set_size; + if (min_working_set_size == 0) { + cout << " (try to outsize caches)"; + } + cout << endl << endl; + + (*action)->run(); } From 
f64b4480afba333dca8ed628bd73f9a1cddf3dd1 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 11:43:56 -0500 Subject: [PATCH 17/83] Add missing copyright notices --- bench/analyze-blocking-sizes.cpp | 9 +++++++++ bench/benchmark-blocking-sizes.cpp | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index d25d72d2f..3316f8cbf 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -1,3 +1,12 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + #include #include #include diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 6d688c0ac..f1fef1ede 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -1,3 +1,12 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + #include #include #include From 5d2fd64a1a060a388e6681c3556bb4dc9482943e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 3 Mar 2015 08:56:49 -0800 Subject: [PATCH 18/83] Fixed compilation error when compiling with gcc4.7 --- unsupported/test/cxx11_tensor_expr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 695565e9b..8389e9840 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -260,7 +260,7 @@ static void test_type_casting() mat1.setRandom(); mat2.setRandom(); - mat3 = mat1.template cast(); + mat3 = mat1.cast(); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -269,7 +269,7 @@ static void test_type_casting() } } - mat3 = mat2.template cast(); + mat3 = mat2.cast(); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { From 2aa09e6b4ecd28b871c0e48cc7d6dd800bf5de47 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 13:44:00 -0500 Subject: [PATCH 19/83] Fix asm comments in 1px1 kernel --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fe4ead384..df9f66995 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1481,17 +1481,17 @@ void gebp_kernel Date: Wed, 4 Mar 2015 09:34:27 +0100 Subject: [PATCH 20/83] Add a CG-based solver for rectangular least-square problems (bug #975). 
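
Background for the new solver (a sketch of the underlying math, not text from the patch): for a rectangular A, the least-square CG iteration amounts to running preconditioned CG on the normal equations, and the LeastSquareDiagonalPreconditioner added below approximates the diagonal of A'A by the squared column norms, storing their inverses:

  A^* A \, x = A^* b, \qquad (A^* A)_{jj} = \|A_{:,j}\|^2
  \quad\Longrightarrow\quad (M^{-1})_{jj} = 1 / \|A_{:,j}\|^2
  \ \ (\text{or } 1 \text{ when column } j \text{ is identically zero}).
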
--- Eigen/IterativeLinearSolvers | 10 +- .../BasicPreconditioners.h | 70 +++++- .../ConjugateGradient.h | 44 ++-- .../LeastSquareConjugateGradient.h | 213 ++++++++++++++++++ test/CMakeLists.txt | 1 + test/lscg.cpp | 29 +++ test/sparse_solver.h | 58 ++++- 7 files changed, 392 insertions(+), 33 deletions(-) create mode 100644 Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h create mode 100644 test/lscg.cpp diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers index c06668bd2..0594feb41 100644 --- a/Eigen/IterativeLinearSolvers +++ b/Eigen/IterativeLinearSolvers @@ -12,24 +12,26 @@ * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse. * Those solvers are accessible via the following classes: * - ConjugateGradient for selfadjoint (hermitian) matrices, + * - LSCG for rectangular least-square problems, * - BiCGSTAB for general square matrices. * * These iterative solvers are associated with some preconditioners: * - IdentityPreconditioner - not really useful * - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices. - * - IncompleteILUT - incomplete LU factorization with dual thresholding + * - IncompleteLUT - incomplete LU factorization with dual thresholding * * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport. * - * \code - * #include - * \endcode + \code + #include + \endcode */ #include "src/IterativeLinearSolvers/SolveWithGuess.h" #include "src/IterativeLinearSolvers/IterativeSolverBase.h" #include "src/IterativeLinearSolvers/BasicPreconditioners.h" #include "src/IterativeLinearSolvers/ConjugateGradient.h" +#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h" #include "src/IterativeLinearSolvers/BiCGSTAB.h" #include "src/IterativeLinearSolvers/IncompleteLUT.h" diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index a09f81225..6da423cf6 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -17,9 +17,9 @@ namespace Eigen { * * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix. * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for: - * \code - * A.diagonal().asDiagonal() . x = b - * \endcode + \code + A.diagonal().asDiagonal() . x = b + \endcode * * \tparam _Scalar the type of the scalar. * @@ -28,6 +28,7 @@ namespace Eigen { * * \note A variant that has yet to be implemented would attempt to preserve the norm of each column. * + * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient */ template class DiagonalPreconditioner @@ -100,6 +101,69 @@ class DiagonalPreconditioner bool m_isInitialized; }; +/** \ingroup IterativeLinearSolvers_Module + * \brief Jacobi preconditioner for LSCG + * + * This class allows to approximately solve for A' A x = A' b problems assuming A' A is a diagonal matrix. + * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for: + \code + (A.adjoint() * A).diagonal().asDiagonal() * x = b + \endcode + * + * \tparam _Scalar the type of the scalar. + * + * The diagonal entries are pre-inverted and stored into a dense vector. 
+ * + * \sa class LSCG, class DiagonalPreconditioner + */ +template +class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> +{ + typedef _Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef DiagonalPreconditioner<_Scalar> Base; + using Base::m_invdiag; + public: + + LeastSquareDiagonalPreconditioner() : Base() {} + + template + explicit LeastSquareDiagonalPreconditioner(const MatType& mat) : Base() + { + compute(mat); + } + + template + LeastSquareDiagonalPreconditioner& analyzePattern(const MatType& ) + { + return *this; + } + + template + LeastSquareDiagonalPreconditioner& factorize(const MatType& mat) + { + // Compute the inverse squared-norm of each column of mat + m_invdiag.resize(mat.cols()); + for(Index j=0; j0) + m_invdiag(j) = RealScalar(1)/sum; + else + m_invdiag(j) = RealScalar(1); + } + Base::m_isInitialized = true; + return *this; + } + + template + LeastSquareDiagonalPreconditioner& compute(const MatType& mat) + { + return factorize(mat); + } + + protected: +}; /** \ingroup IterativeLinearSolvers_Module * \brief A naive preconditioner which approximates any matrix as the identity matrix diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index a799c3ef5..10cd94783 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -60,29 +60,29 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, } VectorType p(n); - p = precond.solve(residual); //initial search direction + p = precond.solve(residual); // initial search direction VectorType z(n), tmp(n); RealScalar absNew = numext::real(residual.dot(p)); // the square of the absolute value of r scaled by invM Index i = 0; while(i < maxIters) { - tmp.noalias() = mat * p; // the bottleneck of the algorithm + tmp.noalias() = mat * p; // the bottleneck of the algorithm - Scalar alpha = absNew / p.dot(tmp); // the amount we travel on dir - x += alpha * p; // update solution - residual -= alpha * tmp; // update residue + Scalar alpha = absNew / p.dot(tmp); // the amount we travel on dir + x += alpha * p; // update solution + residual -= alpha * tmp; // update residual residualNorm2 = residual.squaredNorm(); if(residualNorm2 < threshold) break; - z = precond.solve(residual); // approximately solve for "A z = residual" + z = precond.solve(residual); // approximately solve for "A z = residual" RealScalar absOld = absNew; absNew = numext::real(residual.dot(z)); // update the absolute value of r - RealScalar beta = absNew / absOld; // calculate the Gram-Schmidt value used to create the new search direction - p = z + beta * p; // update search direction + RealScalar beta = absNew / absOld; // calculate the Gram-Schmidt value used to create the new search direction + p = z + beta * p; // update search direction i++; } tol_error = sqrt(residualNorm2 / rhsNorm2); @@ -122,24 +122,24 @@ struct traits > * and NumTraits::epsilon() for the tolerance. * * This class can be used as the direct solver classes. 
Here is a typical usage example: - * \code - * int n = 10000; - * VectorXd x(n), b(n); - * SparseMatrix A(n,n); - * // fill A and b - * ConjugateGradient > cg; - * cg.compute(A); - * x = cg.solve(b); - * std::cout << "#iterations: " << cg.iterations() << std::endl; - * std::cout << "estimated error: " << cg.error() << std::endl; - * // update b, and solve again - * x = cg.solve(b); - * \endcode + \code + int n = 10000; + VectorXd x(n), b(n); + SparseMatrix A(n,n); + // fill A and b + ConjugateGradient > cg; + cg.compute(A); + x = cg.solve(b); + std::cout << "#iterations: " << cg.iterations() << std::endl; + std::cout << "estimated error: " << cg.error() << std::endl; + // update b, and solve again + x = cg.solve(b); + \endcode * * By default the iterations start with x=0 as an initial guess of the solution. * One can control the start using the solveWithGuess() method. * - * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner + * \sa class LSCG, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, int _UpLo, typename _Preconditioner> class ConjugateGradient : public IterativeSolverBase > diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h new file mode 100644 index 000000000..beaf5c307 --- /dev/null +++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -0,0 +1,213 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H +#define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H + +namespace Eigen { + +namespace internal { + +/** \internal Low-level conjugate gradient algorithm for least-square problems + * \param mat The matrix A + * \param rhs The right hand side vector b + * \param x On input and initial solution, on output the computed solution. + * \param precond A preconditioner being able to efficiently solve for an + * approximation of A'Ax=b (regardless of b) + * \param iters On input the max number of iteration, on output the number of performed iterations. + * \param tol_error On input the tolerance error, on output an estimation of the relative error. 
+ */ +template +EIGEN_DONT_INLINE +void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, + const Preconditioner& precond, Index& iters, + typename Dest::RealScalar& tol_error) +{ + using std::sqrt; + using std::abs; + typedef typename Dest::RealScalar RealScalar; + typedef typename Dest::Scalar Scalar; + typedef Matrix VectorType; + + RealScalar tol = tol_error; + Index maxIters = iters; + + Index m = mat.rows(), n = mat.cols(); + + VectorType residual = rhs - mat * x; + VectorType normal_residual = mat.adjoint() * residual; + + RealScalar rhsNorm2 = (mat.adjoint()*rhs).squaredNorm(); + if(rhsNorm2 == 0) + { + x.setZero(); + iters = 0; + tol_error = 0; + return; + } + RealScalar threshold = tol*tol*rhsNorm2; + RealScalar residualNorm2 = normal_residual.squaredNorm(); + if (residualNorm2 < threshold) + { + iters = 0; + tol_error = sqrt(residualNorm2 / rhsNorm2); + return; + } + + VectorType p(n); + p = precond.solve(normal_residual); // initial search direction + + VectorType z(n), tmp(m); + RealScalar absNew = numext::real(normal_residual.dot(p)); // the square of the absolute value of r scaled by invM + Index i = 0; + while(i < maxIters) + { + tmp.noalias() = mat * p; + + Scalar alpha = absNew / tmp.squaredNorm(); // the amount we travel on dir + x += alpha * p; // update solution + residual -= alpha * tmp; // update residual + normal_residual = mat.adjoint() * residual; // update residual of the normal equation + + residualNorm2 = normal_residual.squaredNorm(); + if(residualNorm2 < threshold) + break; + + z = precond.solve(normal_residual); // approximately solve for "A'A z = normal_residual" + + RealScalar absOld = absNew; + absNew = numext::real(normal_residual.dot(z)); // update the absolute value of r + RealScalar beta = absNew / absOld; // calculate the Gram-Schmidt value used to create the new search direction + p = z + beta * p; // update search direction + i++; + } + tol_error = sqrt(residualNorm2 / rhsNorm2); + iters = i; +} + +} + +template< typename _MatrixType, + typename _Preconditioner = LeastSquareDiagonalPreconditioner > +class LSCG; + +namespace internal { + +template< typename _MatrixType, typename _Preconditioner> +struct traits > +{ + typedef _MatrixType MatrixType; + typedef _Preconditioner Preconditioner; +}; + +} + +/** \ingroup IterativeLinearSolvers_Module + * \brief A conjugate gradient solver for sparse (or dense) least-square problems + * + * This class allows to solve for A x = b linear problems using an iterative conjugate gradient algorithm. + * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guaranty stability. + * Otherwise, the SparseLU or SparseQR classes might be preferable. + * The matrix A and the vectors x and b can be either dense or sparse. + * + * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. + * \tparam _Preconditioner the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner + * + * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() + * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations + * and NumTraits::epsilon() for the tolerance. + * + * This class can be used as the direct solver classes. 
Here is a typical usage example: + \code + int m=1000000, n = 10000; + VectorXd x(n), b(m); + SparseMatrix A(m,n); + // fill A and b + LSCG > lscg; + lscg.compute(A); + x = lscg.solve(b); + std::cout << "#iterations: " << lscg.iterations() << std::endl; + std::cout << "estimated error: " << lscg.error() << std::endl; + // update b, and solve again + x = lscg.solve(b); + \endcode + * + * By default the iterations start with x=0 as an initial guess of the solution. + * One can control the start using the solveWithGuess() method. + * + * \sa class ConjugateGradient, SparseLU, SparseQR + */ +template< typename _MatrixType, typename _Preconditioner> +class LSCG : public IterativeSolverBase > +{ + typedef IterativeSolverBase Base; + using Base::mp_matrix; + using Base::m_error; + using Base::m_iterations; + using Base::m_info; + using Base::m_isInitialized; +public: + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef _Preconditioner Preconditioner; + +public: + + /** Default constructor. */ + LSCG() : Base() {} + + /** Initialize the solver with matrix \a A for further \c Ax=b solving. + * + * This constructor is a shortcut for the default constructor followed + * by a call to compute(). + * + * \warning this class stores a reference to the matrix A as well as some + * precomputed values that depend on it. Therefore, if \a A is changed + * this class becomes invalid. Call compute() to update it with the new + * matrix A, or modify a copy of A. + */ + explicit LSCG(const MatrixType& A) : Base(A) {} + + ~LSCG() {} + + /** \internal */ + template + void _solve_with_guess_impl(const Rhs& b, Dest& x) const + { + m_iterations = Base::maxIterations(); + m_error = Base::m_tolerance; + + for(Index j=0; j + void _solve_impl(const MatrixBase& b, Dest& x) const + { + x.setZero(); + _solve_with_guess_impl(b.derived(),x); + } + +}; + +} // end namespace Eigen + +#endif // EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 168749634..1712b8718 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -234,6 +234,7 @@ ei_add_test(sparse_permutations) ei_add_test(simplicial_cholesky) ei_add_test(conjugate_gradient) ei_add_test(bicgstab) +ei_add_test(lscg) ei_add_test(sparselu) ei_add_test(sparseqr) ei_add_test(umeyama) diff --git a/test/lscg.cpp b/test/lscg.cpp new file mode 100644 index 000000000..599ed5619 --- /dev/null +++ b/test/lscg.cpp @@ -0,0 +1,29 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "sparse_solver.h" +#include + +template void test_lscg_T() +{ + LSCG > lscg_colmajor_diag; + LSCG, IdentityPreconditioner> lscg_colmajor_I; + + CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag) ); + CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I) ); + + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_diag) ); + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_I) ); +} + +void test_lscg() +{ + CALL_SUBTEST_1(test_lscg_T()); + CALL_SUBTEST_2(test_lscg_T >()); +} diff --git a/test/sparse_solver.h b/test/sparse_solver.h index f0a4691e3..f266e2c9a 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -17,9 +17,9 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, typedef typename Mat::Scalar Scalar; typedef typename Mat::StorageIndex StorageIndex; - DenseRhs refX = dA.lu().solve(db); + DenseRhs refX = dA.householderQr().solve(db); { - Rhs x(b.rows(), b.cols()); + Rhs x(A.cols(), b.cols()); Rhs oldb = b; solver.compute(A); @@ -94,7 +94,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, // test dense Block as the result and rhs: { - DenseRhs x(db.rows(), db.cols()); + DenseRhs x(refX.rows(), refX.cols()); DenseRhs oldb(db); x.setZero(); x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols())); @@ -119,7 +119,7 @@ void check_sparse_solving_real_cases(Solver& solver, const typename Solver::Matr typedef typename Mat::Scalar Scalar; typedef typename Mat::RealScalar RealScalar; - Rhs x(b.rows(), b.cols()); + Rhs x(A.cols(), b.cols()); solver.compute(A); if (solver.info() != Success) @@ -410,3 +410,53 @@ template void check_sparse_square_abs_determinant(Solver& solve } } +template +void generate_sparse_leastsquare_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag) +{ + typedef typename Solver::MatrixType Mat; + typedef typename Mat::Scalar Scalar; + + int rows = internal::random(1,maxSize); + int cols = internal::random(1,rows); + double density = (std::max)(8./(rows*cols), 0.01); + + A.resize(rows,cols); + dA.resize(rows,cols); + + initSparse(density, dA, A, options); +} + +template void check_sparse_leastsquare_solving(Solver& solver) +{ + typedef typename Solver::MatrixType Mat; + typedef typename Mat::Scalar Scalar; + typedef SparseMatrix SpMat; + typedef Matrix DenseMatrix; + typedef Matrix DenseVector; + + int rhsCols = internal::random(1,16); + + Mat A; + DenseMatrix dA; + for (int i = 0; i < g_repeat; i++) { + generate_sparse_leastsquare_problem(solver, A, dA); + + A.makeCompressed(); + DenseVector b = DenseVector::Random(A.rows()); + DenseMatrix dB(A.rows(),rhsCols); + SpMat B(A.rows(),rhsCols); + double density = (std::max)(8./(A.rows()*rhsCols), 0.1); + initSparse(density, dB, B, ForceNonZeroDiag); + B.makeCompressed(); + check_sparse_solving(solver, A, b, dA, b); + check_sparse_solving(solver, A, dB, dA, dB); + check_sparse_solving(solver, A, B, dA, dB); + + // check only once + if(i==0) + { + b = DenseVector::Zero(A.rows()); + check_sparse_solving(solver, A, b, dA, b); + } + } +} From 3dca4a1efc66d5fce6a6aada7cd7cc248ca8ae40 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 09:35:30 +0100 Subject: [PATCH 21/83] Update manual wrt new LSCG solver. 
--- Eigen/Sparse | 6 +++--- doc/SparseLinearSystems.dox | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Eigen/Sparse b/Eigen/Sparse index 7cc9c0913..a540f0eec 100644 --- a/Eigen/Sparse +++ b/Eigen/Sparse @@ -11,9 +11,9 @@ * - \ref SparseQR_Module * - \ref IterativeLinearSolvers_Module * - * \code - * #include - * \endcode + \code + #include + \endcode */ #include "SparseCore" diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 147b55376..13741280a 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -21,6 +21,9 @@ They are summarized in the following table: ConjugateGradient\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkClassic iterative CGSPDPreconditionning built-in, MPL2 Recommended for large symmetric problems (e.g., 3D Poisson eq.) +LSCG\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkCG for rectangular least-square problemRectangularPreconditionning + built-in, MPL2 + Solve for min |A'Ax-b|^2 without forming A'A BiCGSTAB\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkIterative stabilized bi-conjugate gradientSquarePreconditionning built-in, MPL2 To speedup the convergence, try it with the \ref IncompleteLUT preconditioner. From 1ce017836389e5ddef3b0a93cfcbc410db06506f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 09:39:26 +0100 Subject: [PATCH 22/83] Improve efficiency of SparseMatrix::insert/coeffRef for sequential outer-index insertion strategies (bug #974) --- Eigen/src/SparseCore/SparseMatrix.h | 132 ++++++++++++++++++++++++---- 1 file changed, 117 insertions(+), 15 deletions(-) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4562f3df9..0ba7e111a 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -222,24 +222,18 @@ class SparseMatrix * The non zero coefficient must \b not already exist. * * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed - * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first - * call reserve(const SizesType &) to reserve a more appropriate number of elements per - * inner vector that better match your scenario. + * mode while reserving room for 2 x this->innerSize() non zeros if reserve(Index) has not been called earlier. + * In this case, the insertion procedure is optimized for a \e sequential insertion mode where elements are assumed to be + * inserted by increasing outer-indices. + * + * If that's not the case, then it is strongly recommended to either use a triplet-list to assemble the matrix, or to first + * call reserve(const SizesType &) to reserve the appropriate number of non-zero elements per inner vector. * - * This function performs a sorted insertion in O(1) if the elements of each inner vector are - * inserted in increasing inner index order, and in O(nnz_j) for a random insertion. + * Assuming memory has been appropriately reserved, this function performs a sorted insertion in O(1) + * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion. 
* */ - Scalar& insert(Index row, Index col) - { - eigen_assert(row>=0 && row=0 && col& SparseMatrix +typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insert(Index row, Index col) +{ + eigen_assert(row>=0 && row=0 && col(std::malloc(m_outerSize * sizeof(StorageIndex))); + if(!m_innerNonZeros) internal::throw_std_bad_alloc(); + + memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex)); + + // pack all inner-vectors to the end of the pre-allocated space + // and allocate the entire free-space to the first inner-vector + StorageIndex end = convert_index(m_data.allocatedSize()); + for(Index j=1; j<=m_outerSize; ++j) + m_outerIndex[j] = end; + } + } + + // check whether we can do a fast "push back" insertion + Index data_end = m_data.allocatedSize(); + + // First case: we are filling a new inner vector which is packed at the end. + // We assume that all remaining inner-vectors are also empty and packed to the end. + if(m_outerIndex[outer]==data_end) + { + eigen_internal_assert(m_innerNonZeros[outer]==0); + + // pack previous empty inner-vectors to end of the used-space + // and allocate the entire free-space to the current inner-vector. + StorageIndex p = convert_index(m_data.size()); + Index j = outer; + while(j>=0 && m_innerNonZeros[j]==0) + m_outerIndex[j--] = p; + + // push back the new element + ++m_innerNonZeros[outer]; + m_data.append(Scalar(0), inner); + + // check for reallocation + if(data_end != m_data.allocatedSize()) + { + // m_data has been reallocated + // -> move remaining inner-vectors back to the end of the free-space + // so that the entire free-space is allocated to the current inner-vector. + eigen_internal_assert(data_end < m_data.allocatedSize()); + StorageIndex new_end = convert_index(m_data.allocatedSize()); + for(Index j=outer+1; j<=m_outerSize; ++j) + if(m_outerIndex[j]==data_end) + m_outerIndex[j] = new_end; + } + return m_data.value(p); + } + + // Second case: the next inner-vector is packed to the end + // and the current inner-vector end match the used-space. + if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size()) + { + eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0); + + // add space for the new element + ++m_innerNonZeros[outer]; + m_data.resize(m_data.size()+1); + + // check for reallocation + if(data_end != m_data.allocatedSize()) + { + // m_data has been reallocated + // -> move remaining inner-vectors back to the end of the free-space + // so that the entire free-space is allocated to the current inner-vector. 
+ eigen_internal_assert(data_end < m_data.allocatedSize()); + StorageIndex new_end = convert_index(m_data.allocatedSize()); + for(Index j=outer+1; j<=m_outerSize; ++j) + if(m_outerIndex[j]==data_end) + m_outerIndex[j] = new_end; + } + + // and insert it at the right position (sorted insertion) + Index startId = m_outerIndex[outer]; + Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1; + while ( (p > startId) && (m_data.index(p-1) > inner) ) + { + m_data.index(p) = m_data.index(p-1); + m_data.value(p) = m_data.value(p-1); + --p; + } + + m_data.index(p) = convert_index(inner); + return (m_data.value(p) = 0); + } + + // make sure the matrix is compatible to random un-compressed insertion: + m_data.resize(m_data.allocatedSize()); + + return insertUncompressed(row,col); +} + template EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col) { From c43154bbc5cd686b52a67b495875337001b54c49 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 10:16:46 +0100 Subject: [PATCH 23/83] Check for no-reallocation in SparseMatrix::insert (bug #974) --- Eigen/src/SparseCore/CompressedStorage.h | 3 +++ test/sparse_basic.cpp | 31 +++++++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index cae9c38b0..49fd46658 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -214,6 +214,9 @@ class CompressedStorage inline void reallocate(Index size) { + #ifdef EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN + EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN + #endif eigen_internal_assert(size!=m_allocatedSize); internal::scoped_array newValues(size); internal::scoped_array newIndices(size); diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 8021f4db6..e243964f4 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -9,6 +9,9 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+static long g_realloc_count = 0; +#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++; + #include "sparse.h" template void sparse_basic(const SparseMatrixType& ref) @@ -107,17 +110,31 @@ template void sparse_basic(const SparseMatrixType& re DenseMatrix m1(rows,cols); m1.setZero(); SparseMatrixType m2(rows,cols); - if(internal::random()%2) - m2.reserve(VectorXi::Constant(m2.outerSize(), 2)); + bool call_reserve = internal::random()%2; + Index nnz = internal::random(1,int(rows)/2); + if(call_reserve) + { + if(internal::random()%2) + m2.reserve(VectorXi::Constant(m2.outerSize(), int(nnz))); + else + m2.reserve(m2.outerSize() * nnz); + } + g_realloc_count = 0; for (Index j=0; j(0,rows-1); if (m1.coeff(i,j)==Scalar(0)) m2.insert(i,j) = m1(i,j) = internal::random(); } } + + if(call_reserve && !SparseMatrixType::IsRowMajor) + { + VERIFY(g_realloc_count==0); + } + m2.finalize(); VERIFY_IS_APPROX(m2,m1); } @@ -575,7 +592,7 @@ void big_sparse_triplet(Index rows, Index cols, double density) { void test_sparse_basic() { for(int i = 0; i < g_repeat; i++) { - int r = Eigen::internal::random(1,100), c = Eigen::internal::random(1,100); + int r = Eigen::internal::random(1,200), c = Eigen::internal::random(1,200); if(Eigen::internal::random(0,4) == 0) { r = c; // check square matrices in 25% of tries } @@ -588,6 +605,12 @@ void test_sparse_basic() CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); + r = Eigen::internal::random(1,100); + c = Eigen::internal::random(1,100); + if(Eigen::internal::random(0,4) == 0) { + r = c; // check square matrices in 25% of tries + } + CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); } From 168ceb271e631c17da1c722119746ef5931fdca5 Mon Sep 17 00:00:00 2001 From: Jan Blechta Date: Wed, 18 Feb 2015 14:26:10 +0100 Subject: [PATCH 24/83] Really use zero guess in ConjugateGradients::solve as documented and expected for consistency with other methods. --- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 10cd94783..fab5fdb1f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -208,7 +208,7 @@ public: template void _solve_impl(const MatrixBase& b, Dest& x) const { - x.setOnes(); + x.setZero(); _solve_with_guess_impl(b.derived(),x); } From 00ea12188167411da9f83bbebedbc8822143eaf0 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Mar 2015 09:30:56 -0500 Subject: [PATCH 25/83] Complete the tool to analyze the efficiency of default sizes. 
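
The analyzer now distinguishes two kinds of measurement files ("all pot
sizes" vs "default sizes") and, for each product size, reports the
efficiency of the default blocking sizes as default_gflops divided by the
best GFLOP/s over all power-of-two block sizes. Throughout the tool a
(k, m, n) triple of power-of-two sizes is encoded as three hex digits
holding log2 of each dimension; a small standalone sketch of that encoding
(helper bodies copied from the tool, the main() is illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static uint8_t log2_pot(size_t x) { uint8_t l = 0; while (x >>= 1) l++; return l; }

    static uint16_t compact_size_triple(size_t k, size_t m, size_t n)
    {
      return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
    }

    int main()
    {
      // prints "a89": k = 2^10 = 1024, m = 2^8 = 256, n = 2^9 = 512
      std::printf("%x\n", compact_size_triple(1024, 256, 512));
    }
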
--- bench/analyze-blocking-sizes.cpp | 384 ++++++++++++++++++++--------- bench/benchmark-blocking-sizes.cpp | 9 +- 2 files changed, 274 insertions(+), 119 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 3316f8cbf..5c26582cc 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -23,20 +23,63 @@ using namespace std; +const int default_precision = 4; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + uint16_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } + bool is_cubic() const { return k == m && m == n; } +}; + +ostream& operator<<(ostream& s, const size_triple_t& t) +{ + return s << "(" << t.k << ", " << t.m << ", " << t.n << ")"; +} + struct inputfile_entry_t { uint16_t product_size; - uint16_t block_size; + uint16_t pot_block_size; + size_triple_t nonpot_block_size; float gflops; }; struct inputfile_t { + enum class type_t { + unknown, + all_pot_sizes, + default_sizes + }; + string filename; vector entries; + type_t type; inputfile_t(const string& fname) : filename(fname) + , type(type_t::unknown) { ifstream stream(filename); if (!stream.is_open()) { @@ -44,52 +87,96 @@ struct inputfile_t exit(1); } string line; - bool is_in_measurements = false; while (getline(stream, line)) { if (line.empty()) continue; - if (line.find("BEGIN MEASUREMENTS") == 0) { - is_in_measurements = true; + if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::all_pot_sizes; continue; } - - if (!is_in_measurements) { + if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::default_sizes; continue; } + - unsigned int product_size, block_size; - float gflops; - int sscanf_result = - sscanf(line.c_str(), "%x %x %f", - &product_size, - &block_size, - &gflops); - if (3 != sscanf_result || - !product_size || - product_size > 0xfff || - !block_size || - block_size > 0xfff || - !isfinite(gflops)) - { - cerr << "ill-formed input file: " << filename << endl; - cerr << "offending line:" << endl << line << endl; - exit(1); + if (type == type_t::unknown) { + continue; + } + switch(type) { + case type_t::all_pot_sizes: { + unsigned int product_size, block_size; + float gflops; + int sscanf_result = + sscanf(line.c_str(), "%x %x %f", + &product_size, + &block_size, + &gflops); + if (3 != sscanf_result || + !product_size || + product_size > 0xfff || + !block_size || + block_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: " << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = uint16_t(block_size); + entry.gflops = gflops; + 
entries.push_back(entry); + break; + } + case type_t::default_sizes: { + unsigned int product_size; + float gflops; + int bk, bm, bn; + int sscanf_result = + sscanf(line.c_str(), "%x default(%d, %d, %d) %f", + &product_size, + &bk, &bm, &bn, + &gflops); + if (5 != sscanf_result || + !product_size || + product_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: " << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = 0; + entry.nonpot_block_size = size_triple_t(bk, bm, bn); + entry.gflops = gflops; + entries.push_back(entry); + break; + } + + default: + break; } - inputfile_entry_t entry; - entry.product_size = uint16_t(product_size); - entry.block_size = uint16_t(block_size); - entry.gflops = gflops; - entries.push_back(entry); } stream.close(); - if (!is_in_measurements) { - cerr << "Input file " << filename << " didn't contain a BEGIN MEASUREMENTS line. Wrong file?" << endl; + if (type == type_t::unknown) { + cerr << "Unrecognized input file " << filename << endl; exit(1); } if (entries.empty()) { cerr << "didn't find any measurements in input file: " << filename << endl; exit(1); } - //cerr << "read " << entries.size() << " measurements from " << filename << endl; } }; @@ -114,6 +201,9 @@ struct preprocessed_inputfile_t preprocessed_inputfile_t(const inputfile_t& inputfile) : filename(inputfile.filename) { + if (inputfile.type != inputfile_t::type_t::all_pot_sizes) { + abort(); + } auto it = inputfile.entries.begin(); auto it_first_with_given_product_size = it; while (it != inputfile.entries.end()) { @@ -145,7 +235,7 @@ private: for (auto it = begin; it != end; ++it) { preprocessed_inputfile_entry_t entry; entry.product_size = it->product_size; - entry.block_size = it->block_size; + entry.block_size = it->pot_block_size; entry.efficiency = it->gflops / max_gflops; entries.push_back(entry); } @@ -415,15 +505,44 @@ void print_partition( struct action_t { virtual const char* invokation_name() const { abort(); return nullptr; } - virtual void run(const vector& preprocessed_inputfiles) const { abort(); } + virtual void run(int, char*[]) const { abort(); } virtual ~action_t() {} }; struct partition_action_t : action_t { virtual const char* invokation_name() const { return "partition"; } - virtual void run(const vector& preprocessed_inputfiles) const + virtual void run(int argc, char *argv[]) const { + vector preprocessed_inputfiles; + + if (!argc) { + cerr << "The " << invokation_name() << " action needs a list of input files." << endl; + exit(1); + } + + vector inputfilenames; + for (int i = 0; i < argc; i++) { + inputfilenames.emplace_back(argv[i]); + } + + for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) { + inputfile_t inputfile(*it); + switch (inputfile.type) { + case inputfile_t::type_t::all_pot_sizes: + preprocessed_inputfiles.emplace_back(inputfile); + break; + case inputfile_t::type_t::default_sizes: + cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and " + << "has no use for " << *it << " which contains measurements for default sizes." 
<< endl; + exit(1); + break; + default: + cerr << "Unrecognized input file: " << *it << endl; + exit(1); + } + } + check_all_files_in_same_exact_order(preprocessed_inputfiles); float required_efficiency_to_beat = 0.0f; @@ -467,89 +586,132 @@ struct partition_action_t : action_t } }; -uint8_t log2_pot(size_t x) { - size_t l = 0; - while (x >>= 1) l++; - return l; -} - -uint16_t compact_size_triple(size_t k, size_t m, size_t n) -{ - return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); -} - -// just a helper to store a triple of K,M,N sizes for matrix product -struct size_triple_t -{ - size_t k, m, n; - size_triple_t() : k(0), m(0), n(0) {} - size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} - size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} - size_triple_t(uint16_t compact) - { - k = 1 << ((compact & 0xf00) >> 8); - m = 1 << ((compact & 0x0f0) >> 4); - n = 1 << ((compact & 0x00f) >> 0); - } - bool is_cubic() const { return k == m && m == n; } -}; - struct evaluate_defaults_action_t : action_t { - virtual const char* invokation_name() const { return "evaluate-defaults"; } - virtual void run(const vector& preprocessed_inputfiles) const + struct results_entry_t { + uint16_t product_size; + size_triple_t default_block_size; + uint16_t best_pot_block_size; + float default_gflops; + float best_pot_gflops; + float default_efficiency; + }; + friend ostream& operator<<(ostream& s, const results_entry_t& entry) { - if (preprocessed_inputfiles.size() > 1) { - cerr << invokation_name() << " only works with one input file." << endl; - exit(1); + return s + << "Product size " << size_triple_t(entry.product_size) + << ": default block size " << entry.default_block_size + << " -> " << entry.default_gflops + << " GFlop/s = " << entry.default_efficiency * 100.0f << " %" + << " of best POT block size " << size_triple_t(entry.best_pot_block_size) + << " -> " << entry.best_pot_gflops + << " GFlop/s" << dec; + } + static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) { + return e1.default_efficiency < e2.default_efficiency; + } + virtual const char* invokation_name() const { return "evaluate-defaults"; } + void show_usage_and_exit() const + { + cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl; + cerr << "checks how well the performance with default sizes compares to the best " + << "performance measured over all POT sizes." << endl; + exit(1); + } + virtual void run(int argc, char *argv[]) const + { + if (argc != 2) { + show_usage_and_exit(); } - - const preprocessed_inputfile_t& preprocessed_inputfile = preprocessed_inputfiles.front(); - + inputfile_t inputfile_default_sizes(argv[0]); + inputfile_t inputfile_all_pot_sizes(argv[1]); + if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) { + cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl; + show_usage_and_exit(); + } + if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) { + cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." 
<< endl; + show_usage_and_exit(); + } + vector results; + vector cubic_results; + uint16_t product_size = 0; - uint16_t default_block_size = 0; - vector results, cubic_results; - for (auto it = preprocessed_inputfile.entries.begin(); it != preprocessed_inputfile.entries.end(); ++it) { - if (it->product_size != product_size) { - product_size = it->product_size; - size_triple_t product_size_triple(product_size); - Eigen::Index k = product_size_triple.k, - m = product_size_triple.m, - n = product_size_triple.n; - Eigen::internal::computeProductBlockingSizes(k, m, n); - default_block_size = compact_size_triple(k, m, n); + auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin(); + for (auto it_default_sizes = inputfile_default_sizes.entries.begin(); + it_default_sizes != inputfile_default_sizes.entries.end(); + ++it_default_sizes) + { + if (it_default_sizes->product_size == product_size) { + continue; } - if (it->block_size == default_block_size) { - results.push_back(*it); - if (size_triple_t(product_size).is_cubic()) { - cubic_results.push_back(*it); + product_size = it_default_sizes->product_size; + while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() && + it_all_pot_sizes->product_size != product_size) + { + ++it_all_pot_sizes; + } + if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) { + break; + } + uint16_t best_pot_block_size = 0; + float best_pot_gflops = 0; + for (auto it = it_all_pot_sizes; + it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size; + ++it) + { + if (it->gflops > best_pot_gflops) { + best_pot_gflops = it->gflops; + best_pot_block_size = it->pot_block_size; } } + results_entry_t entry; + entry.product_size = product_size; + entry.default_block_size = it_default_sizes->nonpot_block_size; + entry.best_pot_block_size = best_pot_block_size; + entry.default_gflops = it_default_sizes->gflops; + entry.best_pot_gflops = best_pot_gflops; + entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops; + results.push_back(entry); + + size_triple_t t(product_size); + if (t.k == t.m && t.m == t.n) { + cubic_results.push_back(entry); + } } - cerr << "Below are all results - first column is product size tripe, " << endl - << "second column is block size triple, in hex kmn form where" << endl - << "k, m, n are the log2 of the actual values, i.e. a89 means" << endl - << "k=1024, m=256, n=512." 
<< endl << endl; + cerr << "All results:" << endl; for (auto it = results.begin(); it != results.end(); ++it) { - cerr << hex << it->product_size << " " << it->block_size - << " efficiency " << std::dec << 100.0f * it->efficiency << " %" << endl; + cerr << *it << endl; } cerr << endl; + sort(results.begin(), results.end(), lower_efficiency); - sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); - cerr << "Efficiency summary: min = " - << 100.0f * results.front().efficiency << " %, max = " - << 100.0f * results.back().efficiency << " %, median = " - << 100.0f * results[results.size() / 2].efficiency << " %" << endl; - cerr << "20% of product sizes have efficiency <= " << 100.0f * results[results.size() * 20 / 100].efficiency << " %" << endl; - cerr << "10% of product sizes have efficiency <= " << 100.0f * results[results.size() * 10 / 100].efficiency << " %" << endl; - cerr << "5% of product sizes have efficiency <= " << 100.0f * results[results.size() * 5 / 100].efficiency << " %" << endl; - cerr << "Cubic sizes efficiency summary: min = " - << 100.0f * cubic_results.front().efficiency << " %, max = " - << 100.0f * cubic_results.back().efficiency << " %, median = " - << 100.0f * cubic_results[cubic_results.size() / 2].efficiency << " %" << endl; + const size_t n = min(20, results.size()); + cerr << n << " worst results:" << endl; + for (size_t i = 0; i < n; i++) { + cerr << results[i] << endl; + } + cerr << endl; + + cerr << "cubic results:" << endl; + for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) { + cerr << *it << endl; + } + cerr << endl; + + sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); + + cerr.precision(2); + vector a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f}; + for (auto it = a.begin(); it != a.end(); ++it) { + size_t n = min(results.size() - 1, size_t(*it * results.size())); + cerr << (100.0f * n / (results.size() - 1)) + << " % of product sizes have default efficiency <= " + << 100.0f * results[n].default_efficiency << " %" << endl; + } + cerr.precision(default_precision); } }; @@ -568,8 +730,8 @@ void show_usage_and_exit(int argc, char* argv[], int main(int argc, char* argv[]) { - cout.precision(4); - cerr.precision(4); + cout.precision(default_precision); + cerr.precision(default_precision); vector> available_actions; available_actions.emplace_back(new partition_action_t); @@ -577,7 +739,7 @@ int main(int argc, char* argv[]) auto action = available_actions.end(); - if (argc <= 2) { + if (argc < 2) { show_usage_and_exit(argc, argv, available_actions); } for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { @@ -591,15 +753,5 @@ int main(int argc, char* argv[]) show_usage_and_exit(argc, argv, available_actions); } - vector inputfilenames; - for (int i = 2; i < argc; i++) { - inputfilenames.emplace_back(argv[i]); - } - - vector preprocessed_inputfiles; - for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) { - preprocessed_inputfiles.emplace_back(inputfile_t(*it)); - } - - (*action)->run(preprocessed_inputfiles); + (*action)->run(argc - 2, argv + 2); } diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index f1fef1ede..6925cf560 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -122,7 +122,10 @@ ostream& operator<<(ostream& s, const benchmark_t& b) { s << hex << b.compact_product_size << dec; if (b.use_default_block_size) { - s << " default"; + size_triple_t t(b.compact_product_size); + Index k = 
t.k, m = t.m, n = t.n; + internal::computeProductBlockingSizes(k, m, n); + s << " default(" << k << ", " << m << ", " << n << ")"; } else { s << " " << hex << b.compact_block_size << dec; } @@ -355,7 +358,7 @@ void run_benchmarks(vector& benchmarks) struct measure_all_pot_sizes_action_t : action_t { - virtual const char* invokation_name() const { return "measure-all-pot-sizes"; } + virtual const char* invokation_name() const { return "all-pot-sizes"; } virtual void run() const { vector benchmarks; @@ -386,7 +389,7 @@ struct measure_all_pot_sizes_action_t : action_t struct measure_default_sizes_action_t : action_t { - virtual const char* invokation_name() const { return "measure-default-sizes"; } + virtual const char* invokation_name() const { return "default-sizes"; } virtual void run() const { vector benchmarks; From 2231b3dece9260ee0ba84e9080f11d2015ed5d90 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Mar 2015 09:45:12 -0500 Subject: [PATCH 26/83] output to cout, not cerr, the actual results --- bench/analyze-blocking-sizes.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 5c26582cc..7aae57d93 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -680,38 +680,38 @@ struct evaluate_defaults_action_t : action_t } } - cerr << "All results:" << endl; + cout << "All results:" << endl; for (auto it = results.begin(); it != results.end(); ++it) { - cerr << *it << endl; + cout << *it << endl; } - cerr << endl; + cout << endl; sort(results.begin(), results.end(), lower_efficiency); const size_t n = min(20, results.size()); - cerr << n << " worst results:" << endl; + cout << n << " worst results:" << endl; for (size_t i = 0; i < n; i++) { - cerr << results[i] << endl; + cout << results[i] << endl; } - cerr << endl; + cout << endl; - cerr << "cubic results:" << endl; + cout << "cubic results:" << endl; for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) { - cerr << *it << endl; + cout << *it << endl; } - cerr << endl; + cout << endl; sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); - cerr.precision(2); + cout.precision(2); vector a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f}; for (auto it = a.begin(); it != a.end(); ++it) { size_t n = min(results.size() - 1, size_t(*it * results.size())); - cerr << (100.0f * n / (results.size() - 1)) + cout << (100.0f * n / (results.size() - 1)) << " % of product sizes have default efficiency <= " << 100.0f * results[n].default_efficiency << " %" << endl; } - cerr.precision(default_precision); + cout.precision(default_precision); } }; From 2dc968e453e347966b7a45c9c497c6b1d3845f80 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 17:03:13 +0100 Subject: [PATCH 27/83] bug #824: improve accuracy of Quaternion::angularDistance using atan2 instead of acos. 
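
For two unit quaternions that differ by a small rotation, the absolute dot
product rounds to 1 in floating point, so 2*acos(|<q1,q2>|) collapses to
exactly 0. Writing d = q1 * q2.conjugate(), we have |d.vec()| =
|sin(theta/2)| and |d.w()| = |cos(theta/2)|, and atan2 of that pair is well
conditioned for all angles. A small standalone check of the two formulas
(the epsilon value is illustrative only):

    #include <Eigen/Geometry>
    #include <algorithm>
    #include <cmath>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      // two rotations differing by a tiny angle about the z axis
      double eps = 1e-8;
      Quaterniond q1 = Quaterniond::Identity();
      Quaterniond q2(AngleAxisd(eps, Vector3d::UnitZ()));

      // old formula: the dot product rounds to 1, so the result is 0
      double old_way = 2.0 * std::acos(std::min(1.0, std::abs(q1.dot(q2))));

      // new formula: 2 * atan2(|vec(d)|, |w(d)|) with d = q1 * q2.conjugate()
      Quaterniond d = q1 * q2.conjugate();
      double new_way = 2.0 * std::atan2(d.vec().norm(), std::abs(d.w()));

      std::cout << old_way << " vs " << new_way
                << " (expected " << eps << ")" << std::endl;
    }
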
--- Eigen/src/Geometry/Quaternion.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index 8c44df699..e90ce77eb 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -680,12 +680,10 @@ template inline typename internal::traits::Scalar QuaternionBase::angularDistance(const QuaternionBase& other) const { - using std::acos; + using std::atan2; using std::abs; - Scalar d = abs(this->dot(other)); - if (d>=Scalar(1)) - return Scalar(0); - return Scalar(2) * acos(d); + Quaternion d = (*this) * other.conjugate(); + return Scalar(2) * atan2( d.vec().norm(), abs(d.w()) ); } From 755010702856250c5a63c9bcde305ebe1f5e61fa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 5 Mar 2015 10:03:46 +0100 Subject: [PATCH 28/83] Product optimization: implement a dynamic loop-swapping startegy to improve memory accesses to the destination matrix in the case of K-rank-update like products, i.e., for products of the kind: "large x small" * "small x large" --- .../Core/products/GeneralBlockPanelKernel.h | 57 +++++++++++++++---- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index df9f66995..8f4ee4dbb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -952,14 +952,28 @@ void gebp_kernel=3*Traits::LhsProgress) { +#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION + const bool swap_loops = EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION; +#else + const bool swap_loops = depth<48; +#endif + + Index bound1 = swap_loops ? packet_cols4 : peeled_mc3; + Index bound2 = !swap_loops ? packet_cols4 : peeled_mc3; + Index incr1 = swap_loops ? nr : 3*Traits::LhsProgress; + Index incr2 = !swap_loops ? nr : 3*Traits::LhsProgress; + PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); - + // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) - for(Index i=0; i=2*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth) - for(Index i=peeled_mc3; i Date: Thu, 5 Mar 2015 10:16:32 +0100 Subject: [PATCH 29/83] Rename LSCG to LeastSquaresConjugateGradient --- Eigen/IterativeLinearSolvers | 2 +- .../BasicPreconditioners.h | 4 ++-- .../IterativeLinearSolvers/ConjugateGradient.h | 2 +- .../LeastSquareConjugateGradient.h | 16 ++++++++-------- test/lscg.cpp | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers index 0594feb41..7fab9eed0 100644 --- a/Eigen/IterativeLinearSolvers +++ b/Eigen/IterativeLinearSolvers @@ -12,7 +12,7 @@ * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse. * Those solvers are accessible via the following classes: * - ConjugateGradient for selfadjoint (hermitian) matrices, - * - LSCG for rectangular least-square problems, + * - LeastSquaresConjugateGradient for rectangular least-square problems, * - BiCGSTAB for general square matrices. 
* * These iterative solvers are associated with some preconditioners: diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 6da423cf6..3710a8209 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -102,7 +102,7 @@ class DiagonalPreconditioner }; /** \ingroup IterativeLinearSolvers_Module - * \brief Jacobi preconditioner for LSCG + * \brief Jacobi preconditioner for LeastSquaresConjugateGradient * * This class allows to approximately solve for A' A x = A' b problems assuming A' A is a diagonal matrix. * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for: @@ -114,7 +114,7 @@ class DiagonalPreconditioner * * The diagonal entries are pre-inverted and stored into a dense vector. * - * \sa class LSCG, class DiagonalPreconditioner + * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner */ template class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index fab5fdb1f..11b8347f7 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -139,7 +139,7 @@ struct traits > * By default the iterations start with x=0 as an initial guess of the solution. * One can control the start using the solveWithGuess() method. * - * \sa class LSCG, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner + * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, int _UpLo, typename _Preconditioner> class ConjugateGradient : public IterativeSolverBase > diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h index beaf5c307..1d819927e 100644 --- a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -95,12 +95,12 @@ void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest template< typename _MatrixType, typename _Preconditioner = LeastSquareDiagonalPreconditioner > -class LSCG; +class LeastSquaresConjugateGradient; namespace internal { template< typename _MatrixType, typename _Preconditioner> -struct traits > +struct traits > { typedef _MatrixType MatrixType; typedef _Preconditioner Preconditioner; @@ -129,7 +129,7 @@ struct traits > VectorXd x(n), b(m); SparseMatrix A(m,n); // fill A and b - LSCG > lscg; + LeastSquaresConjugateGradient > lscg; lscg.compute(A); x = lscg.solve(b); std::cout << "#iterations: " << lscg.iterations() << std::endl; @@ -144,9 +144,9 @@ struct traits > * \sa class ConjugateGradient, SparseLU, SparseQR */ template< typename _MatrixType, typename _Preconditioner> -class LSCG : public IterativeSolverBase > +class LeastSquaresConjugateGradient : public IterativeSolverBase > { - typedef IterativeSolverBase Base; + typedef IterativeSolverBase Base; using Base::mp_matrix; using Base::m_error; using Base::m_iterations; @@ -161,7 +161,7 @@ public: public: /** Default constructor. */ - LSCG() : Base() {} + LeastSquaresConjugateGradient() : Base() {} /** Initialize the solver with matrix \a A for further \c Ax=b solving. 
* @@ -173,9 +173,9 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit LSCG(const MatrixType& A) : Base(A) {} + explicit LeastSquaresConjugateGradient(const MatrixType& A) : Base(A) {} - ~LSCG() {} + ~LeastSquaresConjugateGradient() {} /** \internal */ template diff --git a/test/lscg.cpp b/test/lscg.cpp index 599ed5619..daa62a954 100644 --- a/test/lscg.cpp +++ b/test/lscg.cpp @@ -12,8 +12,8 @@ template void test_lscg_T() { - LSCG > lscg_colmajor_diag; - LSCG, IdentityPreconditioner> lscg_colmajor_I; + LeastSquaresConjugateGradient > lscg_colmajor_diag; + LeastSquaresConjugateGradient, IdentityPreconditioner> lscg_colmajor_I; CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag) ); CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I) ); From 5db2baa573965ad8d059038336e272aaab2d025b Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 5 Mar 2015 13:44:20 -0500 Subject: [PATCH 30/83] Make benchmark-blocking-sizes detect changes to clock speed and be resilient to that. --- bench/benchmark-blocking-sizes.cpp | 262 +++++++++++++++++++++++------ 1 file changed, 209 insertions(+), 53 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 6925cf560..0bf9c07f8 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -37,16 +37,18 @@ const int measurement_repetitions = 3; // Timings below this value are too short to be accurate, // we'll repeat measurements with more iterations until // we get a timing above that threshold. -const float min_accurate_time = 1e-2f; +const float g_min_accurate_time = 1e-2f; // See --min-working-set-size command line parameter. -size_t min_working_set_size = 0; +size_t g_min_working_set_size = 0; // range of sizes that we will benchmark (in all 3 K,M,N dimensions) const size_t maxsize = 2048; const size_t minsize = 16; typedef MatrixXf MatrixType; +typedef MatrixType::Scalar Scalar; +typedef internal::packet_traits::type Packet; static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two"); static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two"); @@ -95,24 +97,24 @@ struct benchmark_t uint16_t compact_block_size; bool use_default_block_size; float gflops; - benchmark_t() - : compact_product_size(0) - , compact_block_size(0) - , gflops(0) - , use_default_block_size(false) - {} + size_t min_working_set_size; + float min_accurate_time; benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(compact_size_triple(bk, bm, bn)) , use_default_block_size(false) , gflops(0) + , min_working_set_size(g_min_working_set_size) + , min_accurate_time(g_min_accurate_time) {} benchmark_t(size_t pk, size_t pm, size_t pn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(0) , use_default_block_size(true) , gflops(0) + , min_working_set_size(g_min_working_set_size) + , min_accurate_time(g_min_accurate_time) {} void run(); @@ -124,7 +126,7 @@ ostream& operator<<(ostream& s, const benchmark_t& b) if (b.use_default_block_size) { size_triple_t t(b.compact_product_size); Index k = t.k, m = t.m, n = t.n; - internal::computeProductBlockingSizes(k, m, n); + internal::computeProductBlockingSizes(k, m, n); s << " default(" << k << ", " << m << ", " << n << ")"; } else { s << " " << hex << b.compact_block_size << dec; @@ -162,7 +164,7 @@ void benchmark_t::run() // 
set up the matrix pool const size_t combined_three_matrices_sizes = - sizeof(MatrixType::Scalar) * + sizeof(Scalar) * (productsizes.k * productsizes.m + productsizes.k * productsizes.n + productsizes.m * productsizes.n); @@ -267,7 +269,7 @@ struct action_t virtual ~action_t() {} }; -void show_usage_and_exit(int argc, char* argv[], +void show_usage_and_exit(int /*argc*/, char* argv[], const vector>& available_actions) { cerr << "usage: " << argv[0] << " [options...]" << endl << endl; @@ -287,54 +289,204 @@ void show_usage_and_exit(int argc, char* argv[], cerr << " avoid warm caches." << endl; exit(1); } - -void run_benchmarks(vector& benchmarks) + +float measure_clock_speed() { - // randomly shuffling benchmarks allows us to get accurate enough progress info, - // as now the cheap/expensive benchmarks are randomly mixed so they average out. - random_shuffle(benchmarks.begin(), benchmarks.end()); + cerr << "Measuring clock speed... \r" << flush; + + vector all_gflops; + for (int i = 0; i < 8; i++) { + // a good measure of clock speed is obtained by benchmarking small matrices that + // fit in L1 cache and use warm caches (min_working_set_size = 1). + benchmark_t b(128, 128, 128); + b.min_working_set_size = 1; + b.min_accurate_time = 0.1f; // long-running for better accuracy + b.run(); + all_gflops.push_back(b.gflops); + } - // timings here are only used to display progress info. - // Whence the use of real time. - double time_start = timer.getRealTime(); - double time_last_progress_update = time_start; - for (size_t i = 0; i < benchmarks.size(); i++) { - // Display progress info on stderr - double time_now = timer.getRealTime(); - if (time_now > time_last_progress_update + 1.0f) { - time_last_progress_update = time_now; - float ratio_done = float(i) / benchmarks.size(); - cerr.precision(3); - cerr << "Measurements... " << 100.0f * ratio_done - << " %"; + sort(all_gflops.begin(), all_gflops.end()); + float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5]; - if (i > 10) { - cerr << ", ETA "; - int eta = int(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done); - int eta_remainder = eta; - if (eta_remainder > 3600) { - int hours = eta_remainder / 3600; - cerr << hours << " h "; - eta_remainder -= hours * 3600; + // multiply by an arbitrary constant to discourage trying doing anything with the + // returned values besides just comparing them with each other. + float result = stable_estimate * 123.456f; + + return result; +} + +struct human_duration_t +{ + int seconds; + human_duration_t(int s) : seconds(s) {} +}; + +ostream& operator<<(ostream& s, const human_duration_t& d) +{ + int remainder = d.seconds; + if (remainder > 3600) { + int hours = remainder / 3600; + s << hours << " h "; + remainder -= hours * 3600; + } + if (remainder > 60) { + int minutes = remainder / 60; + s << minutes << " min "; + remainder -= minutes * 60; + } + if (d.seconds < 600) { + s << remainder << " s"; + } + return s; +} + +void try_run_some_benchmarks( + vector& benchmarks, + double time_start, + size_t& first_benchmark_to_run, + float& max_clock_speed) +{ + if (first_benchmark_to_run == benchmarks.size()) { + return; + } + + double time_last_progress_update = 0; + double time_last_clock_speed_measurement = 0; + double time_now = 0; + + size_t benchmark_index = first_benchmark_to_run; + + while (true) { + float ratio_done = float(benchmark_index) / benchmarks.size(); + time_now = timer.getRealTime(); + + // We check clock speed every minute and at the end. 
+ if (benchmark_index == benchmarks.size() || + time_now > time_last_clock_speed_measurement + 60.0f) + { + time_last_clock_speed_measurement = time_now; + + // Ensure that clock speed is as expected + float current_clock_speed = measure_clock_speed(); + + // we only allow 1% higher clock speeds, because we want to know the + // clock speed with good accuracy, and this should only cause restarts + // at the beginning of the benchmarks run. + const float tolerance_higher_clock_speed = 1.01f; + + if (current_clock_speed > tolerance_higher_clock_speed * max_clock_speed) { + // Clock speed is now higher than we previously measured. + // Either our initial measurement was inaccurate, which won't happen + // too many times as we are keeping the best clock speed value and + // and allowing some tolerance; or something really weird happened, + // which invalidates all benchmark results collected so far. + // Either way, we better restart all over again now. + if (benchmark_index) { + cerr << "Restarting at " << 100.0f * ratio_done + << " % because clock speed increased. " << endl; } - if (eta_remainder > 60) { - int minutes = eta_remainder / 60; - cerr << minutes << " min "; - eta_remainder -= minutes * 60; - } - if (eta < 600 && eta_remainder) { - cerr << eta_remainder << " s"; + max_clock_speed = current_clock_speed; + first_benchmark_to_run = 0; + return; + } + + // we are a bit more tolerant to lower clock speeds because we don't want + // to cause sleeps and reruns all the time. + const float tolerance_lower_clock_speed = 0.98f; + bool rerun_last_tests = false; + + if (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + cerr << "Measurements completed so far: " + << 100.0f * ratio_done + << " % " << endl; + cerr << "Clock speed seems to be only " + << current_clock_speed/max_clock_speed + << " times what it used to be." << endl; + + unsigned int seconds_to_sleep_if_lower_clock_speed = 1; + + while (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + if (seconds_to_sleep_if_lower_clock_speed > 300) { + cerr << "Sleeping longer probably won't make a difference. Giving up." << endl; + cerr << "Things to try:" << endl; + cerr << " 1. Check if the device is in some energy-saving state." << endl; + cerr << " On Android, it may help to enable 'Stay Awake' in the dev settings." << endl; + cerr << " 2. Check if the device is overheating." << endl; + cerr << " On some devices, system temperature is reported in" << endl; + cerr << " /sys/class/thermal/thermal_zone*/temp" << endl; + cerr << " 3. Some system daemon might be playing with clock speeds." << endl; + cerr << " In particular, on Qualcomm devices, disable mpdecision " << endl; + cerr << " by renaming /system/bin/mpdecision and rebooting." << endl; + cerr << " 4. CPU frequency scaling might conceivably be the problem." << endl; + cerr << " In particular, Intel Turbo Boost. Try disabling that." << endl; + exit(1); + } + rerun_last_tests = true; + cerr << "Sleeping " + << seconds_to_sleep_if_lower_clock_speed + << " s..." << endl; + sleep(seconds_to_sleep_if_lower_clock_speed); + current_clock_speed = measure_clock_speed(); + seconds_to_sleep_if_lower_clock_speed *= 2; } } - cerr << " \r" << flush; + + if (rerun_last_tests) { + cerr << "Redoing the last " + << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size() + << " % because clock speed had been low. 
" << endl; + return; + } + + // nothing wrong with the clock speed so far, so there won't be a need to rerun + // benchmarks run so far in case we later encounter a lower clock speed. + first_benchmark_to_run = benchmark_index; + } + + if (benchmark_index == benchmarks.size()) { + // We're done! + first_benchmark_to_run = benchmarks.size(); + // Erase progress info + cerr << " " << endl; + return; + } + + // Display progress info on stderr + if (time_now > time_last_progress_update + 1.0f) { + time_last_progress_update = time_now; + cerr << "Measurements... " << 100.0f * ratio_done + << " %, ETA " + << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done) + << " \r" << flush; } // This is where we actually run a benchmark! - benchmarks[i].run(); + benchmarks[benchmark_index].run(); + benchmark_index++; } +} - // Erase progress info - cerr << " " << endl; +void run_benchmarks(vector& benchmarks) +{ + // Randomly shuffling benchmarks allows us to get accurate enough progress info, + // as now the cheap/expensive benchmarks are randomly mixed so they average out. + // It also means that if data is corrupted for some time span, the odds are that + // not all repetitions of a given benchmark will be corrupted. + random_shuffle(benchmarks.begin(), benchmarks.end()); + + float max_clock_speed = 0.0f; + for (int i = 0; i < 4; i++) { + max_clock_speed = max(max_clock_speed, measure_clock_speed()); + } + + double time_start = timer.getRealTime(); + size_t first_benchmark_to_run = 0; + while (first_benchmark_to_run < benchmarks.size()) { + try_run_some_benchmarks(benchmarks, + time_start, + first_benchmark_to_run, + max_clock_speed); + } // Sort timings by increasing benchmark parameters, and decreasing gflops. // The latter is very important. 
It means that we can ignore all but the first @@ -414,6 +566,7 @@ struct measure_default_sizes_action_t : action_t int main(int argc, char* argv[]) { + double time_start = timer.getRealTime(); cout.precision(4); cerr.precision(4); @@ -440,7 +593,7 @@ int main(int argc, char* argv[]) for (int i = 2; i < argc; i++) { if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { const char* equals_sign = strchr(argv[i], '='); - min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + g_min_working_set_size = strtoul(equals_sign+1, nullptr, 10); } else { cerr << "unrecognized option: " << argv[i] << endl << endl; show_usage_and_exit(argc, argv, available_actions); @@ -451,17 +604,20 @@ int main(int argc, char* argv[]) cout << "benchmark parameters:" << endl; cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; - cout << "scalar type: " << type_name() << endl; + cout << "scalar type: " << type_name() << endl; cout << "packet size: " << internal::packet_traits::size << endl; cout << "minsize = " << minsize << endl; cout << "maxsize = " << maxsize << endl; cout << "measurement_repetitions = " << measurement_repetitions << endl; - cout << "min_accurate_time = " << min_accurate_time << endl; - cout << "min_working_set_size = " << min_working_set_size; - if (min_working_set_size == 0) { + cout << "g_min_accurate_time = " << g_min_accurate_time << endl; + cout << "g_min_working_set_size = " << g_min_working_set_size; + if (g_min_working_set_size == 0) { cout << " (try to outsize caches)"; } cout << endl << endl; (*action)->run(); + + double time_end = timer.getRealTime(); + cerr << "Finished in " << human_duration_t(time_end - time_start) << endl; } From 4ab01f7c21640f2fb82ecd0877b2d6c662aae7de Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 5 Mar 2015 14:41:16 -0500 Subject: [PATCH 31/83] slightly increase tolerance to clock speed variation --- bench/benchmark-blocking-sizes.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 0bf9c07f8..316aa9d4d 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -369,12 +369,14 @@ void try_run_some_benchmarks( // Ensure that clock speed is as expected float current_clock_speed = measure_clock_speed(); - // we only allow 1% higher clock speeds, because we want to know the - // clock speed with good accuracy, and this should only cause restarts - // at the beginning of the benchmarks run. - const float tolerance_higher_clock_speed = 1.01f; + // The tolerance needs to be smaller than the relative difference between + // clock speeds that a device could operate under. + // It seems unlikely that a device would be throttling clock speeds by + // amounts smaller than 2%. + // With a value of 1%, I was getting within noise on a Sandy Bridge. + const float clock_speed_tolerance = 0.02f; - if (current_clock_speed > tolerance_higher_clock_speed * max_clock_speed) { + if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) { // Clock speed is now higher than we previously measured. // Either our initial measurement was inaccurate, which won't happen // too many times as we are keeping the best clock speed value and @@ -390,12 +392,9 @@ void try_run_some_benchmarks( return; } - // we are a bit more tolerant to lower clock speeds because we don't want - // to cause sleeps and reruns all the time. 
- const float tolerance_lower_clock_speed = 0.98f; bool rerun_last_tests = false; - if (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { cerr << "Measurements completed so far: " << 100.0f * ratio_done << " % " << endl; @@ -405,7 +404,7 @@ void try_run_some_benchmarks( unsigned int seconds_to_sleep_if_lower_clock_speed = 1; - while (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { if (seconds_to_sleep_if_lower_clock_speed > 300) { cerr << "Sleeping longer probably won't make a difference. Giving up." << endl; cerr << "Things to try:" << endl; @@ -479,9 +478,12 @@ void run_benchmarks(vector& benchmarks) max_clock_speed = max(max_clock_speed, measure_clock_speed()); } - double time_start = timer.getRealTime(); + double time_start = 0.0; size_t first_benchmark_to_run = 0; while (first_benchmark_to_run < benchmarks.size()) { + if (first_benchmark_to_run == 0) { + time_start = timer.getRealTime(); + } try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run, From 58740ce4c60c9230f1e030ae45508dba10ba1211 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 10:30:35 +0100 Subject: [PATCH 32/83] Improve product kernel: replace the previous dynamic loop swaping strategy by a more general one: It consists in increasing the actual number of rows of lhs's micro horizontal panel for small depth such that L1 cache is fully exploited. --- .../Core/products/GeneralBlockPanelKernel.h | 87 ++++++++----------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8f4ee4dbb..72957cdc8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -230,6 +230,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads { // So far, no blocking at all, i.e., kc==k, and nc==n. // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete. Index problem_size = k*n*sizeof(LhsScalar); Index actual_lm = actual_l2; Index max_mc = m; @@ -951,33 +952,28 @@ void gebp_kernel=3*Traits::LhsProgress) - { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION; -#else - const bool swap_loops = depth<48; -#endif - - Index bound1 = swap_loops ? packet_cols4 : peeled_mc3; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc3; - Index incr1 = swap_loops ? nr : 3*Traits::LhsProgress; - Index incr2 = !swap_loops ? nr : 3*Traits::LhsProgress; - + { PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); - - // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) - // and on each largest micro vertical panel of rhs (depth * nr) - for(Index it1=0; it1=2*Traits::LhsProgress) { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = (mr<3*Traits::LhsProgress) && (EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION); -#else - const bool swap_loops = (mr<3*Traits::LhsProgress) && (depth<48); -#endif - Index start1 = swap_loops ? 0 : peeled_mc3; - Index start2 = !swap_loops ? 0 : peeled_mc3; - Index bound1 = swap_loops ? 
packet_cols4 : peeled_mc2; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc2; - Index incr1 = swap_loops ? nr : 2*Traits::LhsProgress; - Index incr2 = !swap_loops ? nr : 2*Traits::LhsProgress; - - for(Index it1=start1; it1 Date: Fri, 6 Mar 2015 11:47:13 +0100 Subject: [PATCH 33/83] Update gemm performance monitoring tool: - permit to recompute a subset of changesets - update changeset list - add a few more cases --- bench/perf_monitoring/gemm/changesets.txt | 67 ++++++++++++----------- bench/perf_monitoring/gemm/run_gemm.sh | 47 +++++++++++++--- bench/perf_monitoring/gemm/settings.txt | 4 ++ 3 files changed, 78 insertions(+), 40 deletions(-) diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index f19b4287d..b379d7bd2 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -1,39 +1,42 @@ -3.0.1 -3.1.1 -3.2.0 +#3.0.1 +#3.1.1 +#3.2.0 3.2.4 -5745:37f59e65eb6c -5891:d8652709345d -5893:24b4dc92c6d3 -5895:997c2ef9fc8b -5904:e1eafd14eaa1 -5908:f8ee3c721251 -5921:ca808bb456b0 -5927:8b1001f9e3ac -5937:5a4ca1ad8c53 -5949:f3488f4e45b2 -5969:e09031dccfd9 -5992:4a429f5e0483 +#5745:37f59e65eb6c +5891:d8652709345d # introduce AVX +#5893:24b4dc92c6d3 # merge +5895:997c2ef9fc8b # introduce FMA +#5904:e1eafd14eaa1 # complex and AVX +5908:f8ee3c721251 # improve packing with ptranspose +#5921:ca808bb456b0 # merge +#5927:8b1001f9e3ac +5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks +#5949:f3488f4e45b2 # merge +#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec +#5992:4a429f5e0483 # merge before-evaluators -6334:f6a45e5b8b7c -6639:c9121c60b5c7 -6655:06f163b5221f -6677:700e023044e7 # FMA has been wrongly disabled -6681:11d31dafb0e3 -6699:5e6e8e10aad1 # merge default to tensors -6726:ff2d2388e7b9 # merge default to tensors -6742:0cbd6195e829 # merge default to tensors -6747:853d2bafeb8f # Generalized the gebp apis +#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products +#6639:c9121c60b5c7 +#6655:06f163b5221f # Properly detect FMA support on ARM +#6677:700e023044e7 # FMA has been wrongly disabled +#6681:11d31dafb0e3 +#6699:5e6e8e10aad1 # merge default to tensors +#6726:ff2d2388e7b9 # merge default to tensors +#6742:0cbd6195e829 # merge default to tensors +#6747:853d2bafeb8f # Generalized the gebp apis 6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation -6781:9cc5a931b2c6 # generalized gemv -6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product -6844:039efd86b75c # merge tensor +#6781:9cc5a931b2c6 # generalized gemv +#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product +#6844:039efd86b75c # merge tensor 6845:7333ed40c6ef # change prefetching in gebp -6856:b5be5e10eb7f # merge index conversion -6893:c3a64aba7c70 # clean blocking size computation -6898:6fb31ebe6492 # rotating kernel for ARM +#6856:b5be5e10eb7f # merge index conversion +#6893:c3a64aba7c70 # clean blocking size computation +#6898:6fb31ebe6492 # rotating kernel for ARM 6899:877facace746 # rotating kernel for ARM only -6904:c250623ae9fa # result_of +#6904:c250623ae9fa # result_of 6921:915f1b1fc158 # fix prefetching change for ARM 6923:9ff25f6dacc6 # prefetching -6933:52572e60b5d3 # blocking size strategy \ No newline at end of file +6933:52572e60b5d3 # blocking size strategy +6937:c8c042f286b2 # 
avoid redundant pack_rhs +6981:7e5d6f78da59 # dynamic loop swapping +6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh index d3a9fadc9..3fa6a3661 100755 --- a/bench/perf_monitoring/gemm/run_gemm.sh +++ b/bench/perf_monitoring/gemm/run_gemm.sh @@ -6,6 +6,7 @@ # Options: # -up : enforce the recomputation of existing data, and keep best results as a merging strategy +# -s : recompute selected changesets only and keep bests if echo "$*" | grep '\-up' > /dev/null; then @@ -14,14 +15,30 @@ else update=false fi -if [ $update == true ]; then +if echo "$*" | grep '\-s' > /dev/null; then + selected=true +else + selected=false +fi + +global_args="$*" + +if [ $selected == true ]; then + echo "Recompute selected changesets only and keep bests" +elif [ $update == true ]; then echo "(Re-)Compute all changesets and keep bests" else echo "Skip previously computed changesets" fi + + if [ ! -d "eigen_src" ]; then hg clone https://bitbucket.org/eigen/eigen eigen_src +else + cd eigen_src + hg pull -u + cd .. fi if [ ! -z '$CXX' ]; then @@ -61,17 +78,31 @@ function test_current scalar=$2 name=$3 - prev=`grep $rev "$name.backup" | cut -c 14-` + prev="" + if [ -e "$name.backup" ]; then + prev=`grep $rev "$name.backup" | cut -c 14-` + fi res=$prev count_rev=`echo $prev | wc -w` count_ref=`cat "settings.txt" | wc -l` - if [ $update == true ] || [ $count_rev != $count_ref ]; then + if echo "$global_args" | grep "$rev" > /dev/null; then + rev_found=true + else + rev_found=false + fi +# echo $update et $selected et $rev_found because $rev et "$global_args" +# echo $count_rev et $count_ref + if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] && [ $rev_found == true ]); then if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then curr=`./$name` - echo merge $prev - echo with $curr + if [ $count_rev == $count_ref ]; then + echo "merge previous $prev" + echo "with new $curr" + else + echo "got $curr" + fi res=`merge "$curr" "$prev"` - echo $res +# echo $res echo "$rev $res" >> $name.out else echo "Compilation failed, skip rev $rev" @@ -86,12 +117,12 @@ make_backup $PREFIX"sgemm" make_backup $PREFIX"dgemm" make_backup $PREFIX"cgemm" -cut -f1 -d"#" < changesets.txt | while read rev +cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev do if [ ! -z '$rev' ]; then echo "Testing rev $rev" cd eigen_src - hg up -C $rev + hg up -C $rev > /dev/null actual_rev=`hg identify | cut -f1 -d' '` cd .. 
diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/settings.txt index 6ef690708..5c43e1c7d 100644 --- a/bench/perf_monitoring/gemm/settings.txt +++ b/bench/perf_monitoring/gemm/settings.txt @@ -1,5 +1,6 @@ 8 8 8 9 9 9 +24 24 24 239 239 239 240 240 240 2400 24 24 @@ -8,4 +9,7 @@ 24 2400 2400 2400 24 2400 2400 2400 24 +2400 2400 64 +4800 23 160 +23 4800 160 2400 2400 2400 From cd3bbffa73a7bc33e6c3f3a63ab2e9eed7cc322c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 14:31:39 +0100 Subject: [PATCH 34/83] Improve blocking heuristic: if the lhs fit within L1, then block on the rhs in L1 (allows to keep packed rhs in L1) --- .../Core/products/GeneralBlockPanelKernel.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 72957cdc8..cfeeb27f8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -213,8 +213,22 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. // When k= Index(Traits::nr*sizeof(RhsScalar))*k) + { + // L1 blocking + max_nc = remaining_l1 / (k*sizeof(RhsScalar)); + } + else + { + // L2 blocking + max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); + } // WARNING Below, we assume that Traits::nr is a power of two. Index nc = std::min(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); if(n>nc) From 4c8eeeaed6e5fe3f62c06e743971cefb5eaff359 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 15:08:20 +0100 Subject: [PATCH 35/83] update gemm changeset list --- bench/benchmark-blocking-sizes.cpp | 2 +- bench/perf_monitoring/gemm/changesets.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 0bf9c07f8..465f5d2c8 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -199,7 +199,7 @@ void benchmark_t::run() double starttime = timer.getCpuTime(); for (int i = 0; i < iters_at_a_time; i++) { - dst[matrix_index] = lhs[matrix_index] * rhs[matrix_index]; + dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index]; matrix_index++; if (matrix_index == matrix_pool_size) { matrix_index = 0; diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index b379d7bd2..a5b63bc89 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -40,3 +40,4 @@ before-evaluators 6937:c8c042f286b2 # avoid redundant pack_rhs 6981:7e5d6f78da59 # dynamic loop swapping 6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache +6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 
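
A side note on the benchmark fix in the patch above: switching the timed statement to noalias() matters because with a plain assignment Eigen must assume the destination could alias an operand, so the product is evaluated into a temporary and then copied, and the benchmark would partly measure that copy. A minimal standalone sketch of the difference (illustrative only, not part of the patch):

#include <Eigen/Core>
using Eigen::MatrixXf;

// Plain assignment: Eigen assumes possible aliasing, so the product is
// evaluated into a temporary and then copied into dst.
void product_via_temporary(MatrixXf& dst, const MatrixXf& lhs, const MatrixXf& rhs)
{
  dst = lhs * rhs;
}

// noalias() promises dst does not overlap lhs/rhs, so the product kernel
// writes its result directly into dst, which is the work we want to time.
void product_in_place(MatrixXf& dst, const MatrixXf& lhs, const MatrixXf& rhs)
{
  dst.noalias() = lhs * rhs;
}
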
From 87681e508f1da5064f5b2699f17049306ac9a2ca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 16:11:22 +0100 Subject: [PATCH 36/83] bug #978: early return for vanishing products --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index c76f48154..7fd707ed7 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -457,6 +457,8 @@ struct generic_product_impl static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) { eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); + if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) + return; typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); From d23fcc0672d9edd917cfca0e32ea6d14300b492b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 16:12:08 +0100 Subject: [PATCH 37/83] bug #978: add unit test for zero-sized products --- test/product_extra.cpp | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/test/product_extra.cpp b/test/product_extra.cpp index 744a1ef7f..1b4c6c33c 100644 --- a/test/product_extra.cpp +++ b/test/product_extra.cpp @@ -109,8 +109,33 @@ void mat_mat_scalar_scalar_product() double det = 6.0, wt = 0.5; VERIFY_IS_APPROX(dNdxy.transpose()*dNdxy*det*wt, det*wt*dNdxy.transpose()*dNdxy); } + +template +void zero_sized_objects(const MatrixType& m) +{ + Index rows = m.rows(); + Index cols = m.cols(); -void zero_sized_objects() + { + MatrixType res, a(rows,0), b(0,cols); + VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(rows,cols) ); + VERIFY_IS_APPROX( (res=a*a.transpose()), MatrixType::Zero(rows,rows) ); + VERIFY_IS_APPROX( (res=b.transpose()*b), MatrixType::Zero(cols,cols) ); + VERIFY_IS_APPROX( (res=b.transpose()*a.transpose()), MatrixType::Zero(cols,rows) ); + } + + { + MatrixType res, a(rows,cols), b(cols,0); + res = a*b; + VERIFY(res.rows()==rows && res.cols()==0); + b.resize(0,rows); + res = b*a; + VERIFY(res.rows()==0 && res.cols()==cols); + } +} + + +void bug_127() { // Bug 127 // @@ -171,7 +196,8 @@ void test_product_extra() CALL_SUBTEST_2( mat_mat_scalar_scalar_product() ); CALL_SUBTEST_3( product_extra(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); + CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } - CALL_SUBTEST_5( zero_sized_objects() ); + CALL_SUBTEST_5( bug_127() ); CALL_SUBTEST_6( unaligned_objects() ); } From 14a5f135a32eed85033a5061d96e2d45e23d50aa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 17:51:31 +0100 Subject: [PATCH 38/83] bug #969: workaround abiguous calls to Ref using enable_if. 
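
The idea of the workaround, in isolation: each greedy constructor template of Ref is guarded by enable_if on its compile-time match predicate, so non-matching Ref specializations drop out of overload resolution instead of all looking equally viable. A hand-written sketch of the same idiom using std::enable_if (the patch itself uses Eigen's internal::enable_if and traits; the type names below are invented for illustration):

#include <type_traits>
#include <iostream>

struct ColMajorTag {};
struct RowMajorTag {};

template <typename Expected>
struct RefLike
{
  // Without the enable_if guard this constructor template would make
  // RefLike<ColMajorTag> and RefLike<RowMajorTag> equally viable targets for
  // any argument, mirroring the kind of ambiguity addressed by bug 969.
  template <typename Actual>
  RefLike(const Actual&,
          typename std::enable_if<std::is_same<Actual, Expected>::value, void>::type* = 0) {}
};

int pick(RefLike<ColMajorTag>) { return 1; }
int pick(RefLike<RowMajorTag>) { return 2; }

int main()
{
  ColMajorTag c;
  RowMajorTag r;
  std::cout << pick(c) << ' ' << pick(r) << '\n';  // prints "1 2": each call now has a unique viable overload
}
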
--- Eigen/src/Core/Ref.h | 16 +++++++++++----- test/ref.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 0cb117949..ea5a2bd5c 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -105,7 +105,8 @@ struct traits > OuterStrideMatch = Derived::IsVectorAtCompileTime || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits::Flags&AlignedBit)==AlignedBit), - MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch + ScalarTypeMatch = internal::is_same::value, + MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch }; typedef typename internal::conditional::type type; }; @@ -184,9 +185,11 @@ protected: template class Ref : public RefBase > { + private: typedef internal::traits Traits; template - EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase& expr); + EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0); public: typedef RefBase Base; @@ -195,13 +198,15 @@ template class Ref #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase& expr) + EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) { EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); Base::construct(expr.derived()); } template - EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr) + EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) #else template inline Ref(DenseBase& expr) @@ -228,7 +233,8 @@ template class Ref< EIGEN_DENSE_PUBLIC_INTERFACE(Ref) template - EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr) + EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, + typename internal::enable_if::ScalarTypeMatch),Derived>::type* = 0) { // std::cout << match_helper::HasDirectAccess << "," << match_helper::OuterStrideMatch << "," << match_helper::InnerStrideMatch << "\n"; // std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n"; diff --git a/test/ref.cpp b/test/ref.cpp index b9470213c..fbe2c450f 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -228,6 +228,28 @@ void call_ref() VERIFY_EVALUATION_COUNT( call_ref_7(c,c), 0); } +typedef Matrix RowMatrixXd; +int test_ref_overload_fun1(Ref ) { return 1; } +int test_ref_overload_fun1(Ref ) { return 2; } +int test_ref_overload_fun1(Ref ) { return 3; } + +int test_ref_overload_fun2(Ref ) { return 4; } +int test_ref_overload_fun2(Ref ) { return 5; } + +// See also bug 969 +void test_ref_overloads() +{ + MatrixXd Ad, Bd; + RowMatrixXd rAd, rBd; + VERIFY( test_ref_overload_fun1(Ad)==1 ); + VERIFY( test_ref_overload_fun1(rAd)==2 ); + + MatrixXf Af, Bf; + VERIFY( test_ref_overload_fun2(Ad)==4 ); + VERIFY( test_ref_overload_fun2(Ad+Bd)==4 ); + VERIFY( test_ref_overload_fun2(Af+Bf)==5 ); +} + void test_ref() { for(int i = 0; i < g_repeat; i++) { @@ -248,4 +270,6 @@ void test_ref() CALL_SUBTEST_5( ref_matrix(MatrixXi(internal::random(1,10),internal::random(1,10))) ); CALL_SUBTEST_6( call_ref() ); } + + CALL_SUBTEST_7( 
test_ref_overloads() ); } From 0ee391863ed8bee95356fad8b7202177290e0338 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 21:51:09 +0100 Subject: [PATCH 39/83] Avoid undeflow when blocking size are tuned manually. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index cfeeb27f8..408281c82 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -975,7 +975,11 @@ void gebp_kernel(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); +#else + const Index actual_panel_rows = (3*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ); +#endif for(Index i1=0; i1=2*Traits::LhsProgress) { const Index l1 = 32*1024; // in Bytes, TODO, l1 should be passed to this function. +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); +#else Index actual_panel_rows = (2*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ); +#endif for(Index i1=peeled_mc3; i1 Date: Fri, 6 Mar 2015 19:11:50 -0500 Subject: [PATCH 40/83] Automatically serialize partial results to disk, reboot, and resume, when timings are getting bad --- bench/benchmark-blocking-sizes.cpp | 147 ++++++++++++++++++++--------- 1 file changed, 105 insertions(+), 42 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 316aa9d4d..e5932b297 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -13,6 +13,7 @@ #include #include #include +#include bool eigen_use_specific_block_size; int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; @@ -37,10 +38,12 @@ const int measurement_repetitions = 3; // Timings below this value are too short to be accurate, // we'll repeat measurements with more iterations until // we get a timing above that threshold. -const float g_min_accurate_time = 1e-2f; +const float min_accurate_time = 1e-2f; // See --min-working-set-size command line parameter. 
-size_t g_min_working_set_size = 0; +size_t min_working_set_size = 0; + +float max_clock_speed = 0.0f; // range of sizes that we will benchmark (in all 3 K,M,N dimensions) const size_t maxsize = 2048; @@ -97,24 +100,25 @@ struct benchmark_t uint16_t compact_block_size; bool use_default_block_size; float gflops; - size_t min_working_set_size; - float min_accurate_time; + benchmark_t() + : compact_product_size(0) + , compact_block_size(0) + , use_default_block_size(false) + , gflops(0) + { + } benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(compact_size_triple(bk, bm, bn)) , use_default_block_size(false) , gflops(0) - , min_working_set_size(g_min_working_set_size) - , min_accurate_time(g_min_accurate_time) {} benchmark_t(size_t pk, size_t pm, size_t pn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(0) , use_default_block_size(true) , gflops(0) - , min_working_set_size(g_min_working_set_size) - , min_accurate_time(g_min_accurate_time) {} void run(); @@ -296,11 +300,7 @@ float measure_clock_speed() vector all_gflops; for (int i = 0; i < 8; i++) { - // a good measure of clock speed is obtained by benchmarking small matrices that - // fit in L1 cache and use warm caches (min_working_set_size = 1). - benchmark_t b(128, 128, 128); - b.min_working_set_size = 1; - b.min_accurate_time = 0.1f; // long-running for better accuracy + benchmark_t b(1024, 1024, 1024); b.run(); all_gflops.push_back(b.gflops); } @@ -340,11 +340,52 @@ ostream& operator<<(ostream& s, const human_duration_t& d) return s; } +const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data"; + +void serialize_benchmarks(const char* filename, const vector& benchmarks, size_t first_benchmark_to_run) +{ + FILE* file = fopen(filename, "w"); + if (!file) { + cerr << "Could not open file " << filename << " for writing." << endl; + cerr << "Do you have write permissions on the current working directory?" 
<< endl; + exit(1); + } + size_t benchmarks_vector_size = benchmarks.size(); + fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file); + fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file); + fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file); + fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file); + fclose(file); +} + +bool deserialize_benchmarks(const char* filename, vector& benchmarks, size_t& first_benchmark_to_run) +{ + FILE* file = fopen(filename, "r"); + if (!file) { + return false; + } + if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) { + return false; + } + size_t benchmarks_vector_size = 0; + if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) { + return false; + } + if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) { + return false; + } + benchmarks.resize(benchmarks_vector_size); + if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) { + return false; + } + unlink(filename); + return true; +} + void try_run_some_benchmarks( vector& benchmarks, double time_start, - size_t& first_benchmark_to_run, - float& max_clock_speed) + size_t& first_benchmark_to_run) { if (first_benchmark_to_run == benchmarks.size()) { return; @@ -405,20 +446,12 @@ void try_run_some_benchmarks( unsigned int seconds_to_sleep_if_lower_clock_speed = 1; while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { - if (seconds_to_sleep_if_lower_clock_speed > 300) { - cerr << "Sleeping longer probably won't make a difference. Giving up." << endl; - cerr << "Things to try:" << endl; - cerr << " 1. Check if the device is in some energy-saving state." << endl; - cerr << " On Android, it may help to enable 'Stay Awake' in the dev settings." << endl; - cerr << " 2. Check if the device is overheating." << endl; - cerr << " On some devices, system temperature is reported in" << endl; - cerr << " /sys/class/thermal/thermal_zone*/temp" << endl; - cerr << " 3. Some system daemon might be playing with clock speeds." << endl; - cerr << " In particular, on Qualcomm devices, disable mpdecision " << endl; - cerr << " by renaming /system/bin/mpdecision and rebooting." << endl; - cerr << " 4. CPU frequency scaling might conceivably be the problem." << endl; - cerr << " In particular, Intel Turbo Boost. Try disabling that." << endl; - exit(1); + if (seconds_to_sleep_if_lower_clock_speed > 30) { + cerr << "Sleeping longer probably won't make a difference." << endl; + cerr << "Serializing benchmarks to " << session_filename << endl; + serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run); + cerr << "Now restart this benchmark, and it should pick up where we left." << endl; + exit(2); } rerun_last_tests = true; cerr << "Sleeping " @@ -467,27 +500,57 @@ void try_run_some_benchmarks( void run_benchmarks(vector& benchmarks) { - // Randomly shuffling benchmarks allows us to get accurate enough progress info, - // as now the cheap/expensive benchmarks are randomly mixed so they average out. - // It also means that if data is corrupted for some time span, the odds are that - // not all repetitions of a given benchmark will be corrupted. 
- random_shuffle(benchmarks.begin(), benchmarks.end()); + size_t first_benchmark_to_run; + vector deserialized_benchmarks; + bool use_deserialized_benchmarks = false; + if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) { + cerr << "Found serialized session with " + << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size() + << " % already done" << endl; + if (deserialized_benchmarks.size() == benchmarks.size() && + first_benchmark_to_run > 0 && + first_benchmark_to_run < benchmarks.size()) + { + bool found_mismatch = false; + for (size_t i = 0; i < benchmarks.size(); i++) { + if (deserialized_benchmarks[i].compact_product_size != benchmarks[i].compact_product_size || + deserialized_benchmarks[i].compact_block_size != benchmarks[i].compact_block_size || + deserialized_benchmarks[i].use_default_block_size != benchmarks[i].use_default_block_size) + { + cerr << "Mismatch in serialized session. Ignoring it." << endl; + found_mismatch = true; + break; + } + } + use_deserialized_benchmarks = !found_mismatch; + } + } + + if (use_deserialized_benchmarks) { + benchmarks = deserialized_benchmarks; + } else { + // not using deserialized benchmarks, starting from scratch + first_benchmark_to_run = 0; + + // Randomly shuffling benchmarks allows us to get accurate enough progress info, + // as now the cheap/expensive benchmarks are randomly mixed so they average out. + // It also means that if data is corrupted for some time span, the odds are that + // not all repetitions of a given benchmark will be corrupted. + random_shuffle(benchmarks.begin(), benchmarks.end()); + } - float max_clock_speed = 0.0f; for (int i = 0; i < 4; i++) { max_clock_speed = max(max_clock_speed, measure_clock_speed()); } double time_start = 0.0; - size_t first_benchmark_to_run = 0; while (first_benchmark_to_run < benchmarks.size()) { if (first_benchmark_to_run == 0) { time_start = timer.getRealTime(); } try_run_some_benchmarks(benchmarks, time_start, - first_benchmark_to_run, - max_clock_speed); + first_benchmark_to_run); } // Sort timings by increasing benchmark parameters, and decreasing gflops. @@ -595,7 +658,7 @@ int main(int argc, char* argv[]) for (int i = 2; i < argc; i++) { if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { const char* equals_sign = strchr(argv[i], '='); - g_min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + min_working_set_size = strtoul(equals_sign+1, nullptr, 10); } else { cerr << "unrecognized option: " << argv[i] << endl << endl; show_usage_and_exit(argc, argv, available_actions); @@ -611,9 +674,9 @@ int main(int argc, char* argv[]) cout << "minsize = " << minsize << endl; cout << "maxsize = " << maxsize << endl; cout << "measurement_repetitions = " << measurement_repetitions << endl; - cout << "g_min_accurate_time = " << g_min_accurate_time << endl; - cout << "g_min_working_set_size = " << g_min_working_set_size; - if (g_min_working_set_size == 0) { + cout << "min_accurate_time = " << min_accurate_time << endl; + cout << "min_working_set_size = " << min_working_set_size; + if (min_working_set_size == 0) { cout << " (try to outsize caches)"; } cout << endl << endl; From 39228cb22414209e1880d84699b5f8e481c2b436 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 6 Mar 2015 19:29:01 -0500 Subject: [PATCH 41/83] deserialization assumed benchmarks in same order, but we shuffle them. 
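
Context for this fix: the session file written by the previous patch is just a small header (max clock speed, vector size, resume index) followed by the raw benchmark_t array, and since the benchmark list is shuffled before it is serialized, a freshly generated, unshuffled list cannot be compared entry by entry against the deserialized one. A stripped-down sketch of the same checkpoint/resume pattern (plain C stdio over POD entries; entry_t and its fields are invented for illustration):

#include <cstdio>
#include <vector>

struct entry_t { unsigned product; unsigned block; float gflops; };  // POD stand-in for benchmark_t

// Write a resumable checkpoint: a small header first, then the raw entries.
bool save_session(const char* path, const std::vector<entry_t>& v, size_t next_index)
{
  FILE* f = std::fopen(path, "wb");
  if (!f) return false;
  size_t n = v.size();
  bool ok = std::fwrite(&n, sizeof n, 1, f) == 1
         && std::fwrite(&next_index, sizeof next_index, 1, f) == 1
         && std::fwrite(v.data(), sizeof(entry_t), n, f) == n;
  std::fclose(f);
  return ok;
}

// Read it back; on success the caller resumes at next_index instead of 0.
bool load_session(const char* path, std::vector<entry_t>& v, size_t& next_index)
{
  FILE* f = std::fopen(path, "rb");
  if (!f) return false;
  size_t n = 0;
  bool ok = std::fread(&n, sizeof n, 1, f) == 1
         && std::fread(&next_index, sizeof next_index, 1, f) == 1;
  if (ok) { v.resize(n); ok = std::fread(v.data(), sizeof(entry_t), n, f) == n; }
  std::fclose(f);
  return ok;
}
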
--- bench/benchmark-blocking-sizes.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 83e973a69..33d3824eb 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -511,18 +511,7 @@ void run_benchmarks(vector& benchmarks) first_benchmark_to_run > 0 && first_benchmark_to_run < benchmarks.size()) { - bool found_mismatch = false; - for (size_t i = 0; i < benchmarks.size(); i++) { - if (deserialized_benchmarks[i].compact_product_size != benchmarks[i].compact_product_size || - deserialized_benchmarks[i].compact_block_size != benchmarks[i].compact_block_size || - deserialized_benchmarks[i].use_default_block_size != benchmarks[i].use_default_block_size) - { - cerr << "Mismatch in serialized session. Ignoring it." << endl; - found_mismatch = true; - break; - } - } - use_deserialized_benchmarks = !found_mismatch; + use_deserialized_benchmarks = true; } } From cf9940e17b5799c35b6a05764bec166d07906b6f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 13:54:05 +0100 Subject: [PATCH 42/83] Make sparse unit-test helpers aware of StorageIndex --- test/sparse.h | 14 +++++++------- test/sparse_solver.h | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/sparse.h b/test/sparse.h index 81ab9e873..3c3a0c9be 100644 --- a/test/sparse.h +++ b/test/sparse.h @@ -53,15 +53,15 @@ enum { * \param zeroCoords and nonzeroCoords allows to get the coordinate lists of the non zero, * and zero coefficients respectively. */ -template void +template void initSparse(double density, Matrix& refMat, - SparseMatrix& sparseMat, + SparseMatrix& sparseMat, int flags = 0, - std::vector >* zeroCoords = 0, - std::vector >* nonzeroCoords = 0) + std::vector >* zeroCoords = 0, + std::vector >* nonzeroCoords = 0) { - enum { IsRowMajor = SparseMatrix::IsRowMajor }; + enum { IsRowMajor = SparseMatrix::IsRowMajor }; sparseMat.setZero(); //sparseMat.reserve(int(refMat.rows()*refMat.cols()*density)); sparseMat.reserve(VectorXi::Constant(IsRowMajor ? 
refMat.rows() : refMat.cols(), int((1.5*density)*(IsRowMajor?refMat.cols():refMat.rows())))); @@ -93,11 +93,11 @@ initSparse(double density, //sparseMat.insertBackByOuterInner(j,i) = v; sparseMat.insertByOuterInner(j,i) = v; if (nonzeroCoords) - nonzeroCoords->push_back(Matrix (ai,aj)); + nonzeroCoords->push_back(Matrix (ai,aj)); } else if (zeroCoords) { - zeroCoords->push_back(Matrix (ai,aj)); + zeroCoords->push_back(Matrix (ai,aj)); } refMat(ai,aj) = v; } diff --git a/test/sparse_solver.h b/test/sparse_solver.h index f266e2c9a..a078851c3 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -230,7 +230,7 @@ template void check_sparse_spd_solving(Solver& solver) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - typedef SparseMatrix SpMat; + typedef SparseMatrix SpMat; typedef Matrix DenseMatrix; typedef Matrix DenseVector; @@ -304,12 +304,12 @@ template void check_sparse_spd_determinant(Solver& solver) } template -int generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag) +Index generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - int size = internal::random(1,maxSize); + Index size = internal::random(1,maxSize); double density = (std::max)(8./(size*size), 0.01); A.resize(size,size); @@ -324,7 +324,7 @@ template void check_sparse_square_solving(Solver& solver) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - typedef SparseMatrix SpMat; + typedef SparseMatrix SpMat; typedef Matrix DenseMatrix; typedef Matrix DenseVector; @@ -333,7 +333,7 @@ template void check_sparse_square_solving(Solver& solver) Mat A; DenseMatrix dA; for (int i = 0; i < g_repeat; i++) { - int size = generate_sparse_square_problem(solver, A, dA); + Index size = generate_sparse_square_problem(solver, A, dA); A.makeCompressed(); DenseVector b = DenseVector::Random(size); @@ -430,7 +430,7 @@ template void check_sparse_leastsquare_solving(Solver& solver) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - typedef SparseMatrix SpMat; + typedef SparseMatrix SpMat; typedef Matrix DenseMatrix; typedef Matrix DenseVector; From 224a1fe4c6991c863986d8c8bd3d41af5aa4ff80 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 13:55:20 +0100 Subject: [PATCH 43/83] bug #963: make IncompleteLUT compatible with non-default storage index types. 
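
What this enables, in the spirit of the updated iterative-solver tests: the preconditioner's index type can now be chosen to match a sparse matrix whose StorageIndex is not int. A small usage sketch (illustrative only; a trivial identity system just to show the instantiation):

#include <Eigen/Sparse>
using namespace Eigen;

int main()
{
  // Sparse matrix with a non-default storage index type.
  typedef SparseMatrix<double, ColMajor, long int> SpMatLong;
  SpMatLong A(100, 100);
  A.setIdentity();
  VectorXd b = VectorXd::Random(100);

  // IncompleteLUT is now templated on the storage index as well,
  // so it can be paired with SpMatLong inside BiCGSTAB.
  BiCGSTAB<SpMatLong, IncompleteLUT<double, long int> > solver;
  solver.compute(A);
  VectorXd x = solver.solve(b);
  return (solver.info() == Success && x.isApprox(b)) ? 0 : 1;
}
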
--- .../IterativeLinearSolvers/IncompleteLUT.h | 41 ++++++++++--------- test/bicgstab.cpp | 13 +++--- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 6d63d45e4..b7f8debb3 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -93,21 +93,23 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) * alternatively, on GMANE: * http://comments.gmane.org/gmane.comp.lib.eigen/3302 */ -template -class IncompleteLUT : public SparseSolverBase > +template +class IncompleteLUT : public SparseSolverBase > { protected: - typedef SparseSolverBase > Base; + typedef SparseSolverBase Base; using Base::m_isInitialized; public: typedef _Scalar Scalar; + typedef _StorageIndex StorageIndex; typedef typename NumTraits::Real RealScalar; typedef Matrix Vector; - typedef SparseMatrix FactorType; - typedef SparseMatrix PermutType; - typedef typename FactorType::StorageIndex StorageIndex; + typedef Matrix VectorI; + typedef SparseMatrix FactorType; public: + + // this typedef is only to export the scalar type and compile-time dimensions to solve_retval typedef Matrix MatrixType; IncompleteLUT() @@ -151,7 +153,7 @@ class IncompleteLUT : public SparseSolverBase > * **/ template - IncompleteLUT& compute(const MatrixType& amat) + IncompleteLUT& compute(const MatrixType& amat) { analyzePattern(amat); factorize(amat); @@ -197,8 +199,8 @@ protected: * Set control parameter droptol * \param droptol Drop any element whose magnitude is less than this tolerance **/ -template -void IncompleteLUT::setDroptol(const RealScalar& droptol) +template +void IncompleteLUT::setDroptol(const RealScalar& droptol) { this->m_droptol = droptol; } @@ -207,15 +209,15 @@ void IncompleteLUT::setDroptol(const RealScalar& droptol) * Set control parameter fillfactor * \param fillfactor This is used to compute the number @p fill_in of largest elements to keep on each row. 
**/ -template -void IncompleteLUT::setFillfactor(int fillfactor) +template +void IncompleteLUT::setFillfactor(int fillfactor) { this->m_fillfactor = fillfactor; } -template +template template -void IncompleteLUT::analyzePattern(const _MatrixType& amat) +void IncompleteLUT::analyzePattern(const _MatrixType& amat) { // Compute the Fill-reducing permutation SparseMatrix mat1 = amat; @@ -232,9 +234,9 @@ void IncompleteLUT::analyzePattern(const _MatrixType& amat) m_analysisIsOk = true; } -template +template template -void IncompleteLUT::factorize(const _MatrixType& amat) +void IncompleteLUT::factorize(const _MatrixType& amat) { using std::sqrt; using std::swap; @@ -246,8 +248,8 @@ void IncompleteLUT::factorize(const _MatrixType& amat) m_lu.resize(n,n); // Declare Working vectors and variables Vector u(n) ; // real values of the row -- maximum size is n -- - VectorXi ju(n); // column position of the values in u -- maximum size is n - VectorXi jr(n); // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1 + VectorI ju(n); // column position of the values in u -- maximum size is n + VectorI jr(n); // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1 // Apply the fill-reducing permutation eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); @@ -398,7 +400,7 @@ void IncompleteLUT::factorize(const _MatrixType& amat) sizel = len; len = (std::min)(sizel, nnzL); typename Vector::SegmentReturnType ul(u.segment(0, sizel)); - typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel)); + typename VectorI::SegmentReturnType jul(ju.segment(0, sizel)); internal::QuickSplit(ul, jul, len); // store the largest m_fill elements of the L part @@ -427,14 +429,13 @@ void IncompleteLUT::factorize(const _MatrixType& amat) sizeu = len + 1; // +1 to take into account the diagonal element len = (std::min)(sizeu, nnzU); typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1)); - typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1)); + typename VectorI::SegmentReturnType juu(ju.segment(ii+1, sizeu-1)); internal::QuickSplit(uu, juu, len); // store the largest elements of the U part for(Index k = ii + 1; k < ii + len; k++) m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k); } - m_lu.finalize(); m_lu.makeCompressed(); diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp index f327e2fac..6d76389ce 100644 --- a/test/bicgstab.cpp +++ b/test/bicgstab.cpp @@ -10,11 +10,11 @@ #include "sparse_solver.h" #include -template void test_bicgstab_T() +template void test_bicgstab_T() { - BiCGSTAB, DiagonalPreconditioner > bicgstab_colmajor_diag; - BiCGSTAB, IdentityPreconditioner > bicgstab_colmajor_I; - BiCGSTAB, IncompleteLUT > bicgstab_colmajor_ilut; + BiCGSTAB, DiagonalPreconditioner > bicgstab_colmajor_diag; + BiCGSTAB, IdentityPreconditioner > bicgstab_colmajor_I; + BiCGSTAB, IncompleteLUT > bicgstab_colmajor_ilut; //BiCGSTAB, SSORPreconditioner > bicgstab_colmajor_ssor; CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_diag) ); @@ -25,6 +25,7 @@ template void test_bicgstab_T() void test_bicgstab() { - CALL_SUBTEST_1(test_bicgstab_T()); - CALL_SUBTEST_2(test_bicgstab_T >()); + CALL_SUBTEST_1((test_bicgstab_T()) ); + CALL_SUBTEST_1((test_bicgstab_T())); + CALL_SUBTEST_2((test_bicgstab_T, int>())); } From 9e885fb76608c8d90da20151db6459a99be360d6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 14:33:15 +0100 Subject: [PATCH 44/83] Add unit tests for CG and sparse-LLT 
for long int as storage-index --- Eigen/src/OrderingMethods/Ordering.h | 4 ++-- Eigen/src/SparseCholesky/SimplicialCholesky.h | 5 ++-- test/bicgstab.cpp | 2 +- test/conjugate_gradient.cpp | 18 +++++++------- test/simplicial_cholesky.cpp | 24 ++++++++++--------- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index e88e637a4..cb838d04a 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -90,11 +90,11 @@ class AMDOrdering * \note Returns an empty permutation matrix * \tparam Index The type of indices of the matrix */ -template +template class NaturalOrdering { public: - typedef PermutationMatrix PermutationType; + typedef PermutationMatrix PermutationType; /** Compute the permutation vector from a column-major sparse matrix */ template diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index a0815e708..f56298e8c 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -69,6 +69,7 @@ class SimplicialCholeskyBase : public SparseSolverBase typedef SparseMatrix CholMatrixType; typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix VectorType; + typedef Matrix VectorI; public: @@ -250,8 +251,8 @@ class SimplicialCholeskyBase : public SparseSolverBase CholMatrixType m_matrix; VectorType m_diag; // the diagonal coefficients (LDLT mode) - VectorXi m_parent; // elimination tree - VectorXi m_nonZerosPerCol; + VectorI m_parent; // elimination tree + VectorI m_nonZerosPerCol; PermutationMatrix m_P; // the permutation PermutationMatrix m_Pinv; // the inverse permutation diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp index 6d76389ce..7a9a11330 100644 --- a/test/bicgstab.cpp +++ b/test/bicgstab.cpp @@ -26,6 +26,6 @@ template void test_bicgstab_T() void test_bicgstab() { CALL_SUBTEST_1((test_bicgstab_T()) ); - CALL_SUBTEST_1((test_bicgstab_T())); CALL_SUBTEST_2((test_bicgstab_T, int>())); + CALL_SUBTEST_3((test_bicgstab_T())); } diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 019cc4d64..9622fd86d 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -10,13 +10,14 @@ #include "sparse_solver.h" #include -template void test_conjugate_gradient_T() +template void test_conjugate_gradient_T() { - ConjugateGradient, Lower > cg_colmajor_lower_diag; - ConjugateGradient, Upper > cg_colmajor_upper_diag; - ConjugateGradient, Lower|Upper> cg_colmajor_loup_diag; - ConjugateGradient, Lower, IdentityPreconditioner> cg_colmajor_lower_I; - ConjugateGradient, Upper, IdentityPreconditioner> cg_colmajor_upper_I; + typedef SparseMatrix SparseMatrixType; + ConjugateGradient cg_colmajor_lower_diag; + ConjugateGradient cg_colmajor_upper_diag; + ConjugateGradient cg_colmajor_loup_diag; + ConjugateGradient cg_colmajor_lower_I; + ConjugateGradient cg_colmajor_upper_I; CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag) ); @@ -27,6 +28,7 @@ template void test_conjugate_gradient_T() void test_conjugate_gradient() { - CALL_SUBTEST_1(test_conjugate_gradient_T()); - CALL_SUBTEST_2(test_conjugate_gradient_T >()); + CALL_SUBTEST_1(( test_conjugate_gradient_T() )); + CALL_SUBTEST_2(( test_conjugate_gradient_T, int>() )); + CALL_SUBTEST_3(( test_conjugate_gradient_T() )); } diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp index 
786468421..b7cc2d351 100644 --- a/test/simplicial_cholesky.cpp +++ b/test/simplicial_cholesky.cpp @@ -9,16 +9,17 @@ #include "sparse_solver.h" -template void test_simplicial_cholesky_T() +template void test_simplicial_cholesky_T() { - SimplicialCholesky, Lower> chol_colmajor_lower_amd; - SimplicialCholesky, Upper> chol_colmajor_upper_amd; - SimplicialLLT, Lower> llt_colmajor_lower_amd; - SimplicialLLT, Upper> llt_colmajor_upper_amd; - SimplicialLDLT, Lower> ldlt_colmajor_lower_amd; - SimplicialLDLT, Upper> ldlt_colmajor_upper_amd; - SimplicialLDLT, Lower, NaturalOrdering > ldlt_colmajor_lower_nat; - SimplicialLDLT, Upper, NaturalOrdering > ldlt_colmajor_upper_nat; + typedef SparseMatrix SparseMatrixType; + SimplicialCholesky chol_colmajor_lower_amd; + SimplicialCholesky chol_colmajor_upper_amd; + SimplicialLLT< SparseMatrixType, Lower> llt_colmajor_lower_amd; + SimplicialLLT< SparseMatrixType, Upper> llt_colmajor_upper_amd; + SimplicialLDLT< SparseMatrixType, Lower> ldlt_colmajor_lower_amd; + SimplicialLDLT< SparseMatrixType, Upper> ldlt_colmajor_upper_amd; + SimplicialLDLT< SparseMatrixType, Lower, NaturalOrdering > ldlt_colmajor_lower_nat; + SimplicialLDLT< SparseMatrixType, Upper, NaturalOrdering > ldlt_colmajor_upper_nat; check_sparse_spd_solving(chol_colmajor_lower_amd); check_sparse_spd_solving(chol_colmajor_upper_amd); @@ -40,6 +41,7 @@ template void test_simplicial_cholesky_T() void test_simplicial_cholesky() { - CALL_SUBTEST_1(test_simplicial_cholesky_T()); - CALL_SUBTEST_2(test_simplicial_cholesky_T >()); + CALL_SUBTEST_1(( test_simplicial_cholesky_T() )); + CALL_SUBTEST_2(( test_simplicial_cholesky_T, int>() )); + CALL_SUBTEST_3(( test_simplicial_cholesky_T() )); } From d4317a85e8266fd9f840986821a2ca1cf673262b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 21:29:46 +0100 Subject: [PATCH 45/83] Add typedefs for return types of SparseMatrixBase::selfadjointView --- Eigen/src/SparseCore/SparseMatrixBase.h | 10 +++++++--- Eigen/src/SparseCore/SparseSelfAdjointView.h | 12 ++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index d76dfa33d..55b0ad9d2 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -97,7 +97,6 @@ template class SparseMatrixBase : public EigenBase Transpose >::type AdjointReturnType; typedef Transpose TransposeReturnType; - template struct SelfAdjointViewReturnType { typedef SelfAdjointView Type; }; typedef typename internal::add_const >::type ConstTransposeReturnType; // FIXME storage order do not match evaluator storage order @@ -300,9 +299,14 @@ template class SparseMatrixBase : public EigenBase template inline const TriangularView triangularView() const; + + template struct SelfAdjointViewReturnType { typedef SparseSelfAdjointView Type; }; + template struct ConstSelfAdjointViewReturnType { typedef const SparseSelfAdjointView Type; }; - template inline const SparseSelfAdjointView selfadjointView() const; - template inline SparseSelfAdjointView selfadjointView(); + template inline + typename ConstSelfAdjointViewReturnType::Type selfadjointView() const; + template inline + typename SelfAdjointViewReturnType::Type selfadjointView(); template Scalar dot(const MatrixBase& other) const; template Scalar dot(const SparseMatrixBase& other) const; diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 6467d4894..3da856799 100644 --- 
a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -169,17 +169,17 @@ template class SparseSelfAdjointView ***************************************************************************/ template -template -const SparseSelfAdjointView SparseMatrixBase::selfadjointView() const +template +typename SparseMatrixBase::template ConstSelfAdjointViewReturnType::Type SparseMatrixBase::selfadjointView() const { - return SparseSelfAdjointView(derived()); + return SparseSelfAdjointView(derived()); } template -template -SparseSelfAdjointView SparseMatrixBase::selfadjointView() +template +typename SparseMatrixBase::template SelfAdjointViewReturnType::Type SparseMatrixBase::selfadjointView() { - return SparseSelfAdjointView(derived()); + return SparseSelfAdjointView(derived()); } /*************************************************************************** From fd788748889f50536f590b68dfa98db0044e5115 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 21:31:03 +0100 Subject: [PATCH 46/83] Fix compilation of iterative solvers with dense matrices --- .../ConjugateGradient.h | 2 +- .../IterativeSolverBase.h | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 11b8347f7..9e7dd1404 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -185,7 +185,7 @@ public: { typedef typename internal::conditional&, - SparseSelfAdjointView, UpLo> + typename Ref::template ConstSelfAdjointViewReturnType::Type >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 46bc0ac78..6477b9de2 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -52,9 +52,9 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - template - explicit IterativeSolverBase(const SparseMatrixBase& A) - : mp_matrix(A) + template + explicit IterativeSolverBase(const EigenBase& A) + : mp_matrix(A.derived()) { init(); compute(mp_matrix); @@ -67,8 +67,8 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - template - Derived& analyzePattern(const SparseMatrixBase& A) + template + Derived& analyzePattern(const EigenBase& A) { grab(A.derived()); m_preconditioner.analyzePattern(mp_matrix); @@ -87,8 +87,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - template - Derived& factorize(const SparseMatrixBase& A) + template + Derived& factorize(const EigenBase& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); grab(A.derived()); @@ -108,8 +108,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. 
*/ - template - Derived& compute(const SparseMatrixBase& A) + template + Derived& compute(const EigenBase& A) { grab(A.derived()); m_preconditioner.compute(mp_matrix); @@ -223,11 +223,11 @@ protected: m_tolerance = NumTraits::epsilon(); } - template - void grab(const SparseMatrixBase &A) + template + void grab(const EigenBase &A) { mp_matrix.~Ref(); - ::new (&mp_matrix) Ref(A); + ::new (&mp_matrix) Ref(A.derived()); } void grab(const Ref &A) From f2c3e2b10fbc15fbcd3d5a24def771cbd7549d8c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 12 Mar 2015 13:16:33 -0700 Subject: [PATCH 47/83] Add --only-cubic-sizes option to analyze-blocking-sizes tool --- bench/analyze-blocking-sizes.cpp | 81 +++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 7aae57d93..cf8236a82 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -25,6 +25,9 @@ using namespace std; const int default_precision = 4; +// see --only-cubic-sizes +bool only_cubic_sizes = false; + uint8_t log2_pot(size_t x) { size_t l = 0; while (x >>= 1) l++; @@ -130,6 +133,9 @@ struct inputfile_t cerr << "offending line:" << endl << line << endl; exit(1); } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } inputfile_entry_t entry; entry.product_size = uint16_t(product_size); entry.pot_block_size = uint16_t(block_size); @@ -155,6 +161,9 @@ struct inputfile_t cerr << "offending line:" << endl << line << endl; exit(1); } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } inputfile_entry_t entry; entry.product_size = uint16_t(product_size); entry.pot_block_size = 0; @@ -505,28 +514,23 @@ void print_partition( struct action_t { virtual const char* invokation_name() const { abort(); return nullptr; } - virtual void run(int, char*[]) const { abort(); } + virtual void run(const vector&) const { abort(); } virtual ~action_t() {} }; struct partition_action_t : action_t { - virtual const char* invokation_name() const { return "partition"; } - virtual void run(int argc, char *argv[]) const + virtual const char* invokation_name() const override { return "partition"; } + virtual void run(const vector& input_filenames) const override { vector preprocessed_inputfiles; - if (!argc) { + if (input_filenames.empty()) { cerr << "The " << invokation_name() << " action needs a list of input files." << endl; exit(1); } - vector inputfilenames; - for (int i = 0; i < argc; i++) { - inputfilenames.emplace_back(argv[i]); - } - - for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) { + for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) { inputfile_t inputfile(*it); switch (inputfile.type) { case inputfile_t::type_t::all_pot_sizes: @@ -610,7 +614,7 @@ struct evaluate_defaults_action_t : action_t static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) { return e1.default_efficiency < e2.default_efficiency; } - virtual const char* invokation_name() const { return "evaluate-defaults"; } + virtual const char* invokation_name() const override { return "evaluate-defaults"; } void show_usage_and_exit() const { cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl; @@ -618,13 +622,13 @@ struct evaluate_defaults_action_t : action_t << "performance measured over all POT sizes." 
<< endl; exit(1); } - virtual void run(int argc, char *argv[]) const + virtual void run(const vector& input_filenames) const override { - if (argc != 2) { + if (input_filenames.size() != 2) { show_usage_and_exit(); } - inputfile_t inputfile_default_sizes(argv[0]); - inputfile_t inputfile_all_pot_sizes(argv[1]); + inputfile_t inputfile_default_sizes(input_filenames[0]); + inputfile_t inputfile_all_pot_sizes(input_filenames[1]); if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) { cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl; show_usage_and_exit(); @@ -719,7 +723,7 @@ struct evaluate_defaults_action_t : action_t void show_usage_and_exit(int argc, char* argv[], const vector>& available_actions) { - cerr << "usage: " << argv[0] << " " << endl; + cerr << "usage: " << argv[0] << " [options...] " << endl; cerr << "available actions:" << endl; for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { cerr << " " << (*it)->invokation_name() << endl; @@ -737,21 +741,52 @@ int main(int argc, char* argv[]) available_actions.emplace_back(new partition_action_t); available_actions.emplace_back(new evaluate_defaults_action_t); - auto action = available_actions.end(); + vector input_filenames; + + action_t* action = nullptr; if (argc < 2) { show_usage_and_exit(argc, argv, available_actions); } - for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { - if (!strcmp(argv[1], (*it)->invokation_name())) { - action = it; - break; + for (int i = 1; i < argc; i++) { + bool arg_handled = false; + // Step 1. Try to match action invokation names. + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[i], (*it)->invokation_name())) { + if (!action) { + action = it->get(); + arg_handled = true; + break; + } else { + cerr << "can't specify more than one action!" << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } } + if (arg_handled) { + continue; + } + // Step 2. Try to match option names. + if (argv[i][0] == '-') { + if (!strcmp(argv[i], "--only-cubic-sizes")) { + only_cubic_sizes = true; + arg_handled = true; + } + if (!arg_handled) { + cerr << "Unrecognized option: " << argv[i] << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + if (arg_handled) { + continue; + } + // Step 3. Default to interpreting args as input filenames. + input_filenames.emplace_back(argv[i]); } - if (action == available_actions.end()) { + if (!action) { show_usage_and_exit(argc, argv, available_actions); } - (*action)->run(argc - 2, argv + 2); + action->run(input_filenames); } From 2f6f8bf31c781347e48183d3681978eff6ddbede Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 16:24:40 +0100 Subject: [PATCH 48/83] Add missing coeff/coeffRef members to Block, and extend unit tests. --- Eigen/src/SparseCore/SparseBlock.h | 25 ++++++++++ test/sparse_basic.cpp | 78 +++++++++++++++++++++++------- 2 files changed, 86 insertions(+), 17 deletions(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index acd82e926..2b31716a3 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -49,6 +49,16 @@ public: return nnz; } + inline const Scalar coeff(Index row, Index col) const + { + return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 
0 : m_outerStart)); + } + + inline const Scalar coeff(Index index) const + { + return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index : m_outerStart); + } + inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } Index startRow() const { return IsRowMajor ? m_outerStart : 0; } Index startCol() const { return IsRowMajor ? 0 : m_outerStart; } @@ -204,6 +214,21 @@ public: } bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } + + inline Scalar& coeffRef(Index row, Index col) + { + return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); + } + + inline const Scalar coeff(Index row, Index col) const + { + return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); + } + + inline const Scalar coeff(Index index) const + { + return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index : m_outerStart); + } const Scalar& lastCoeff() const { diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index e243964f4..d929e1463 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -30,6 +30,7 @@ template void sparse_basic(const SparseMatrixType& re double density = (std::max)(8./(rows*cols), 0.01); typedef Matrix DenseMatrix; typedef Matrix DenseVector; + typedef Matrix RowDenseVector; Scalar eps = 1e-6; Scalar s1 = internal::random(); @@ -59,32 +60,61 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m, refMat); // test InnerIterators and Block expressions - for (Index t=0; t<10; ++t) + for (int t=0; t<10; ++t) { - Index j = internal::random(0,cols-1); - Index i = internal::random(0,rows-1); - Index w = internal::random(1,cols-j-1); - Index h = internal::random(1,rows-i-1); + Index j = internal::random(0,cols-2); + Index i = internal::random(0,rows-2); + Index w = internal::random(1,cols-j); + Index h = internal::random(1,rows-i); - VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); + VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); for(Index c=0; c void sparse_basic(const SparseMatrixType& re refMat2.col(i) = refMat2.col(i) * s1; VERIFY_IS_APPROX(m2,refMat2); } + + Index r0 = internal::random(0,rows-2); + Index c0 = internal::random(0,cols-2); + Index r1 = internal::random(1,rows-r0); + Index c1 = internal::random(1,cols-c0); + + VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0)); + VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0)); + + VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0)); + VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0)); + + VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1)); + VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1)); } // test prune From d73ccd717e000d0a91293db2b24c402e49c907ff Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 13 Mar 2015 10:36:01 -0700 Subject: [PATCH 49/83] Add support for dumping blocking sizes tables --- bench/analyze-blocking-sizes.cpp | 75 +++++++++++++++++++++++++++++- bench/benchmark-blocking-sizes.cpp | 4 +- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index cf8236a82..c133df599 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -28,6 +28,9 @@ const int default_precision = 4; // see --only-cubic-sizes bool only_cubic_sizes = false; +// see --dump-tables +bool dump_tables = false; + uint8_t log2_pot(size_t x) { size_t l = 0; while (x 
>>= 1) l++; @@ -318,14 +321,74 @@ float efficiency_of_subset( efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry); } efficiency = min(efficiency, efficiency_this_product_size); - first_entry_index_with_this_product_size = entry_index; - product_size = first_file.entries[entry_index].product_size; + if (entry_index < num_entries) { + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } } } return efficiency; } +void dump_table_for_subset( + const vector& preprocessed_inputfiles, + const vector& subset) +{ + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]]; + const size_t num_entries = first_file.entries.size(); + size_t entry_index = 0; + size_t first_entry_index_with_this_product_size = 0; + uint16_t product_size = first_file.entries[0].product_size; + size_t i = 0; + size_triple_t min_product_size(first_file.entries.front().product_size); + size_triple_t max_product_size(first_file.entries.back().product_size); + if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) { + abort(); + } + if (only_cubic_sizes) { + cout << "/* Warning: generated with --only-cubic-sizes ! */" << endl; + } + cout << "struct optimal_block_sizes_table {" << endl; + cout << " static const size_t min_size = " << min_product_size.k << ";" << endl; + cout << " static const size_t max_size = " << max_product_size.k << ";" << endl; + cout << " static const uint16_t* table() {" << endl; + cout << " static const uint16_t data[] = {"; + while (entry_index < num_entries) { + ++entry_index; + if (entry_index == num_entries || + first_file.entries[entry_index].product_size != product_size) + { + float best_efficiency_this_product_size = 0.0f; + uint16_t best_block_size_this_product_size = 0; + for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) { + float efficiency_this_entry = 1.0f; + for (auto i = subset.begin(); i != subset.end(); ++i) { + efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency); + } + if (efficiency_this_entry > best_efficiency_this_product_size) { + best_efficiency_this_product_size = efficiency_this_entry; + best_block_size_this_product_size = first_file.entries[e].block_size; + } + } + if ((i++) % 8) { + cout << ", "; + } else { + cout << endl << " "; + } + cout << "0x" << hex << best_block_size_this_product_size << dec; + if (entry_index < num_entries) { + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } + } + } + cout << endl << " };" << endl; + cout << " return data;" << endl; + cout << " }" << endl; + cout << "};" << endl; +} + float efficiency_of_partition( const vector& preprocessed_inputfiles, const vector>& partition) @@ -507,6 +570,10 @@ void print_partition( for (auto file = subset->begin(); file != subset->end(); ++file) { cout << " " << preprocessed_inputfiles[*file].filename << endl; } + if (dump_tables) { + cout << " Table:" << endl; + dump_table_for_subset(preprocessed_inputfiles, *subset); + } } cout << endl; } @@ -772,6 +839,10 @@ int main(int argc, char* argv[]) only_cubic_sizes = true; arg_handled = true; } + if (!strcmp(argv[i], "--dump-tables")) { + dump_tables = true; + arg_handled = true; + } if (!arg_handled) { cerr << "Unrecognized option: " << argv[i] << endl; show_usage_and_exit(argc, argv, available_actions); diff --git a/bench/benchmark-blocking-sizes.cpp 
b/bench/benchmark-blocking-sizes.cpp index 33d3824eb..827be2880 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -446,7 +446,7 @@ void try_run_some_benchmarks( unsigned int seconds_to_sleep_if_lower_clock_speed = 1; while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { - if (seconds_to_sleep_if_lower_clock_speed > 30) { + if (seconds_to_sleep_if_lower_clock_speed > 32) { cerr << "Sleeping longer probably won't make a difference." << endl; cerr << "Serializing benchmarks to " << session_filename << endl; serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run); @@ -456,7 +456,7 @@ void try_run_some_benchmarks( rerun_last_tests = true; cerr << "Sleeping " << seconds_to_sleep_if_lower_clock_speed - << " s..." << endl; + << " s... \r" << endl; sleep(seconds_to_sleep_if_lower_clock_speed); current_clock_speed = measure_clock_speed(); seconds_to_sleep_if_lower_clock_speed *= 2; From 5ffe29cb9f260f35f1ae0f878885a77d42a29d8b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 20:57:33 +0100 Subject: [PATCH 50/83] Bound pre-allocation to the maximal size representable by StorageIndex and throw bad_alloc if that's not possible. --- Eigen/src/SparseCore/CompressedStorage.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index 49fd46658..52c7da297 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -86,7 +86,12 @@ class CompressedStorage void resize(Index size, double reserveSizeFactor = 0) { if (m_allocatedSize)(NumTraits::highest(), size + Index(reserveSizeFactor*double(size))); + if(realloc_size Date: Fri, 13 Mar 2015 21:00:21 +0100 Subject: [PATCH 51/83] SparseMatrix::insert: switch to a fully uncompressed mode if sequential insertion is not possible (otherwise an arbitrary large amount of memory was preallocated in some cases) --- Eigen/src/SparseCore/SparseMatrix.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 0ba7e111a..4c3a47959 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -1172,8 +1172,12 @@ typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Op return (m_data.value(p) = 0); } - // make sure the matrix is compatible to random un-compressed insertion: - m_data.resize(m_data.allocatedSize()); + if(m_data.size() != m_data.allocatedSize()) + { + // make sure the matrix is compatible to random un-compressed insertion: + m_data.resize(m_data.allocatedSize()); + this->reserveInnerVectors(Array::Constant(2*m_outerSize, convert_index(m_outerSize))); + } return insertUncompressed(row,col); } From 8580eb6808428a53d5fb91be23fb5c6c8c9e9463 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 21:06:20 +0100 Subject: [PATCH 52/83] bug #949: add static assertion for incompatible scalar types in dense end-user decompositions. 
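The mechanism in the diff below is a protected check_template_parameters() hook that the dense decompositions touched here call from compute() (or from the constructor, in SVDBase's case) before doing any real work; it expands EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar), plus an additional non-complex check for the real-valued eigensolvers, so instantiating one of these solvers on an integer scalar now fails to build, which is exactly what the new failtests verify. As a hedged illustration of that pattern only (ToyDecomposition is a made-up stand-in using C++11 static_assert, not Eigen's macro-based implementation):

    #include <type_traits>

    // Toy stand-in for a matrix factorization: it rejects integer scalars at
    // compile time, mirroring the check_template_parameters() hook in the diff below.
    template <typename Scalar>
    struct ToyDecomposition {
      static void check_template_parameters() {
        static_assert(!std::is_integral<Scalar>::value,
                      "this decomposition requires a non-integer scalar type");
      }
      ToyDecomposition& compute() {
        check_template_parameters();  // fires only when compute() is instantiated
        // ... actual factorization work would go here ...
        return *this;
      }
    };

    int main() {
      ToyDecomposition<float> ok;
      ok.compute();                   // builds and runs fine
      // ToyDecomposition<int> bad;
      // bad.compute();               // would fail to compile, like the failtests below
      return 0;
    }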
--- Eigen/src/Cholesky/LDLT.h | 7 +++++++ Eigen/src/Cholesky/LLT.h | 8 ++++++++ Eigen/src/Eigenvalues/ComplexEigenSolver.h | 8 ++++++++ Eigen/src/Eigenvalues/EigenSolver.h | 9 +++++++++ Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 9 +++++++++ Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 7 +++++++ Eigen/src/LU/FullPivLU.h | 8 ++++++++ Eigen/src/LU/PartialPivLU.h | 8 ++++++++ Eigen/src/QR/ColPivHouseholderQR.h | 8 ++++++++ Eigen/src/QR/FullPivHouseholderQR.h | 8 ++++++++ Eigen/src/QR/HouseholderQR.h | 8 ++++++++ Eigen/src/SVD/SVDBase.h | 10 +++++++++- failtest/CMakeLists.txt | 12 ++++++++++++ failtest/bdcsvd_int.cpp | 14 ++++++++++++++ failtest/colpivqr_int.cpp | 14 ++++++++++++++ failtest/eigensolver_cplx.cpp | 14 ++++++++++++++ failtest/eigensolver_int.cpp | 14 ++++++++++++++ failtest/fullpivlu_int.cpp | 14 ++++++++++++++ failtest/fullpivqr_int.cpp | 14 ++++++++++++++ failtest/jacobisvd_int.cpp | 14 ++++++++++++++ failtest/ldlt_int.cpp | 14 ++++++++++++++ failtest/llt_int.cpp | 14 ++++++++++++++ failtest/partialpivlu_int.cpp | 14 ++++++++++++++ failtest/qr_int.cpp | 14 ++++++++++++++ 24 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 failtest/bdcsvd_int.cpp create mode 100644 failtest/colpivqr_int.cpp create mode 100644 failtest/eigensolver_cplx.cpp create mode 100644 failtest/eigensolver_int.cpp create mode 100644 failtest/fullpivlu_int.cpp create mode 100644 failtest/fullpivqr_int.cpp create mode 100644 failtest/jacobisvd_int.cpp create mode 100644 failtest/ldlt_int.cpp create mode 100644 failtest/llt_int.cpp create mode 100644 failtest/partialpivlu_int.cpp create mode 100644 failtest/qr_int.cpp diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index f46f7b758..93a726483 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -226,6 +226,11 @@ template class LDLT #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } /** \internal * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U. @@ -424,6 +429,8 @@ template struct LDLT_Traits template LDLT& LDLT::compute(const MatrixType& a) { + check_template_parameters(); + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 629c87161..745b74d95 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -170,6 +170,12 @@ template class LLT #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + /** \internal * Used to compute and store L * The strict upper part is not used and even not initialized. 
@@ -377,6 +383,8 @@ template struct LLT_Traits template LLT& LLT::compute(const MatrixType& a) { + check_template_parameters(); + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index 075a62848..6b010c312 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -234,6 +234,12 @@ template class ComplexEigenSolver } protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + EigenvectorType m_eivec; EigenvalueType m_eivalues; ComplexSchur m_schur; @@ -251,6 +257,8 @@ template ComplexEigenSolver& ComplexEigenSolver::compute(const MatrixType& matrix, bool computeEigenvectors) { + check_template_parameters(); + // this code is inspired from Jampack eigen_assert(matrix.cols() == matrix.rows()); diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index a63a42341..167cd99ab 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -299,6 +299,13 @@ template class EigenSolver void doComputeEigenvectors(); protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL); + } + MatrixType m_eivec; EigenvalueType m_eivalues; bool m_isInitialized; @@ -366,6 +373,8 @@ template EigenSolver& EigenSolver::compute(const MatrixType& matrix, bool computeEigenvectors) { + check_template_parameters(); + using std::sqrt; using std::abs; using numext::isfinite; diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index c9da6740a..e2e28cd4a 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -263,6 +263,13 @@ template class GeneralizedEigenSolver } protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL); + } + MatrixType m_eivec; ComplexVectorType m_alphas; VectorType m_betas; @@ -290,6 +297,8 @@ template GeneralizedEigenSolver& GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors) { + check_template_parameters(); + using std::sqrt; using std::abs; eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows()); diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 66d1154cf..1dcfacf0b 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -347,6 +347,11 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; @@ -382,6 +387,8 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver& SelfAdjointEigenSolver ::compute(const MatrixType& matrix, int options) { + check_template_parameters(); + using std::abs; eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index d1a260a37..75dbc16b0 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ 
-390,6 +390,12 @@ template class FullPivLU #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_lu; PermutationPType m_p; PermutationQType m_q; @@ -434,6 +440,8 @@ FullPivLU::FullPivLU(const MatrixType& matrix) template FullPivLU& FullPivLU::compute(const MatrixType& matrix) { + check_template_parameters(); + // the permutations are stored as int indices, so just to be sure: eigen_assert(matrix.rows()<=NumTraits::highest() && matrix.cols()<=NumTraits::highest()); diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 3d8825a97..019fc4230 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -209,6 +209,12 @@ template class PartialPivLU #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_lu; PermutationType m_p; TranspositionType m_rowsTranspositions; @@ -425,6 +431,8 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t template PartialPivLU& PartialPivLU::compute(const MatrixType& matrix) { + check_template_parameters(); + // the row permutation is stored as int indices, so just to be sure: eigen_assert(matrix.rows()::highest()); diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 03ff0a8f2..7b3842cbe 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -398,6 +398,12 @@ template class ColPivHouseholderQR #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_qr; HCoeffsType m_hCoeffs; PermutationType m_colsPermutation; @@ -436,6 +442,8 @@ typename MatrixType::RealScalar ColPivHouseholderQR::logAbsDetermina template ColPivHouseholderQR& ColPivHouseholderQR::compute(const MatrixType& matrix) { + check_template_parameters(); + using std::abs; Index rows = matrix.rows(); Index cols = matrix.cols(); diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index 4952fbb46..4c2c958a8 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -380,6 +380,12 @@ template class FullPivHouseholderQR #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; @@ -419,6 +425,8 @@ typename MatrixType::RealScalar FullPivHouseholderQR::logAbsDetermin template FullPivHouseholderQR& FullPivHouseholderQR::compute(const MatrixType& matrix) { + check_template_parameters(); + using std::abs; Index rows = matrix.rows(); Index cols = matrix.cols(); diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 195bacb85..878654be5 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -196,6 +196,12 @@ template class HouseholderQR #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; @@ -348,6 +354,8 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c template HouseholderQR& HouseholderQR::compute(const MatrixType& matrix) { + check_template_parameters(); + Index rows = matrix.rows(); Index cols = matrix.cols(); Index size = (std::min)(rows,cols); diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index 8903755e7..b89393721 100644 --- 
a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -217,6 +217,12 @@ public: #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; @@ -240,7 +246,9 @@ protected: m_usePrescribedThreshold(false), m_computationOptions(0), m_rows(-1), m_cols(-1), m_diagSize(0) - {} + { + check_template_parameters(); + } }; diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt index c8795a344..d3e82ecd9 100644 --- a/failtest/CMakeLists.txt +++ b/failtest/CMakeLists.txt @@ -47,6 +47,18 @@ ei_add_failtest("sparse_ref_3") ei_add_failtest("sparse_ref_4") ei_add_failtest("sparse_ref_5") +ei_add_failtest("partialpivlu_int") +ei_add_failtest("fullpivlu_int") +ei_add_failtest("llt_int") +ei_add_failtest("ldlt_int") +ei_add_failtest("qr_int") +ei_add_failtest("colpivqr_int") +ei_add_failtest("fullpivqr_int") +ei_add_failtest("jacobisvd_int") +ei_add_failtest("bdcsvd_int") +ei_add_failtest("eigensolver_int") +ei_add_failtest("eigensolver_cplx") + if (EIGEN_FAILTEST_FAILURE_COUNT) message(FATAL_ERROR "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED. " diff --git a/failtest/bdcsvd_int.cpp b/failtest/bdcsvd_int.cpp new file mode 100644 index 000000000..670752cf5 --- /dev/null +++ b/failtest/bdcsvd_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/SVD" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + BDCSVD > qr(Matrix::Random(10,10)); +} diff --git a/failtest/colpivqr_int.cpp b/failtest/colpivqr_int.cpp new file mode 100644 index 000000000..db11910d4 --- /dev/null +++ b/failtest/colpivqr_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/QR" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + ColPivHouseholderQR > qr(Matrix::Random(10,10)); +} diff --git a/failtest/eigensolver_cplx.cpp b/failtest/eigensolver_cplx.cpp new file mode 100644 index 000000000..c2e21e189 --- /dev/null +++ b/failtest/eigensolver_cplx.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Eigenvalues" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR std::complex +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + EigenSolver > eig(Matrix::Random(10,10)); +} diff --git a/failtest/eigensolver_int.cpp b/failtest/eigensolver_int.cpp new file mode 100644 index 000000000..eda8dc20b --- /dev/null +++ b/failtest/eigensolver_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Eigenvalues" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + EigenSolver > eig(Matrix::Random(10,10)); +} diff --git a/failtest/fullpivlu_int.cpp b/failtest/fullpivlu_int.cpp new file mode 100644 index 000000000..e9d2c6eb3 --- /dev/null +++ b/failtest/fullpivlu_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/LU" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + FullPivLU > lu(Matrix::Random(10,10)); +} diff --git a/failtest/fullpivqr_int.cpp b/failtest/fullpivqr_int.cpp new file mode 100644 index 000000000..d182a7b6b --- /dev/null +++ b/failtest/fullpivqr_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/QR" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; 
+ +int main() +{ + FullPivHouseholderQR > qr(Matrix::Random(10,10)); +} diff --git a/failtest/jacobisvd_int.cpp b/failtest/jacobisvd_int.cpp new file mode 100644 index 000000000..12790aef1 --- /dev/null +++ b/failtest/jacobisvd_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/SVD" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + JacobiSVD > qr(Matrix::Random(10,10)); +} diff --git a/failtest/ldlt_int.cpp b/failtest/ldlt_int.cpp new file mode 100644 index 000000000..243e45746 --- /dev/null +++ b/failtest/ldlt_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Cholesky" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + LDLT > ldlt(Matrix::Random(10,10)); +} diff --git a/failtest/llt_int.cpp b/failtest/llt_int.cpp new file mode 100644 index 000000000..cb020650d --- /dev/null +++ b/failtest/llt_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Cholesky" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + LLT > llt(Matrix::Random(10,10)); +} diff --git a/failtest/partialpivlu_int.cpp b/failtest/partialpivlu_int.cpp new file mode 100644 index 000000000..98ef282ea --- /dev/null +++ b/failtest/partialpivlu_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/LU" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + PartialPivLU > lu(Matrix::Random(10,10)); +} diff --git a/failtest/qr_int.cpp b/failtest/qr_int.cpp new file mode 100644 index 000000000..ce200e818 --- /dev/null +++ b/failtest/qr_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/QR" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + HouseholderQR > qr(Matrix::Random(10,10)); +} From d99ab35f9e886a014be6d47606d232af1e668f76 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 21:12:46 +0100 Subject: [PATCH 53/83] Fix internal::random(x,y) for integer types. The previous implementation could return y+1. The new implementation uses rejection sampling to get an unbiased behavior.
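The diff that follows implements this with plain rejection sampling: std::rand() is split into equal-width buckets of width RAND_MAX/(range+1), the bucket index becomes the offset into [x, y], and any draw that lands past the top of the range is simply re-drawn, whereas the old formula scaled std::rand() into the interval directly and could yield y+1. A hedged, self-contained sketch of the same idea specialized to int (the patch itself is templated on the scalar type and keeps a separate branch for ranges wider than RAND_MAX), assuming 0 <= y - x < RAND_MAX:

    #include <cstdlib>
    #include <iostream>

    // Unbiased integer draw from [x, y] via rejection sampling.
    // Each accepted offset corresponds to exactly 'divisor' values of rand(),
    // so no value in [x, y] is favored, and y + 1 can never be returned.
    int random_in_range(int x, int y) {
      int range   = y - x;                  // the interval holds range + 1 values
      int divisor = RAND_MAX / (range + 1); // bucket width, >= 1 by assumption
      int offset;
      do {
        offset = std::rand() / divisor;     // bucket index of this draw
      } while (offset > range);             // reject the rare overshoot and retry
      return x + offset;
    }

    int main() {
      std::srand(42);
      for (int i = 0; i < 10; ++i)
        std::cout << random_in_range(3, 7) << ' ';  // always prints values in [3, 7]
      std::cout << '\n';
      return 0;
    }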
--- Eigen/src/Core/MathFunctions.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 878f38e92..3c76a58b9 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -525,8 +525,25 @@ struct random_default_impl typedef typename NumTraits::NonInteger NonInteger; static inline Scalar run(const Scalar& x, const Scalar& y) - { - return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1))); + { + using std::max; + Scalar range = (max)(Scalar(0),Scalar(y-x)); + Scalar offset = 0; + if(range<=RAND_MAX) + { + // rejection sampling + int divisor = RAND_MAX/(range+1); + + do { + offset = Scalar(std::rand() / divisor); + } while (offset > range); + } + else + { + offset = std::rand() * range; + } + + return x + offset; } static inline Scalar run() From 1330f8bbd12306cc4955d943f27e5281d413bed4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 21:15:50 +0100 Subject: [PATCH 54/83] bug #973, improve AVX support by enabling vectorization of Vector4i-like types, and enforcing alignement of Vector4f/Vector2d-like types to preserve compatibility with SSE and future Eigen versions that will vectorize them with AVX enabled. --- Eigen/src/Core/CoreEvaluators.h | 16 ++++-- Eigen/src/Core/DenseStorage.h | 88 +++++++++++++++++++++++++++++++-- Eigen/src/Core/util/Macros.h | 3 ++ Eigen/src/Core/util/XprHelper.h | 5 +- test/unalignedassert.cpp | 7 +-- test/vectorization_logic.cpp | 2 +- 6 files changed, 108 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 9485080d3..85f46cb8d 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -647,11 +647,15 @@ struct evaluator > HasNoStride = HasNoInnerStride && HasNoOuterStride, IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned), IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, + + // TODO: should check for smaller packet types once we can handle multi-sized packet types + AlignBytes = int(packet_traits::size) * sizeof(Scalar), + KeepsPacketAccess = bool(HasNoInnerStride) && ( bool(IsDynamicSize) || HasNoOuterStride || ( OuterStrideAtCompileTime!=Dynamic - && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ), + && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime) % AlignBytes)==0 ) ), Flags0 = evaluator::Flags, Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime)) @@ -717,7 +721,10 @@ struct evaluator > && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0, + // TODO: should check for smaller packet types once we can handle multi-sized packet types + AlignBytes = int(packet_traits::size) * sizeof(Scalar), + + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % AlignBytes) == 0)) ? AlignedBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? 
LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, Flags0 = evaluator::Flags & ( (HereditaryBits & ~RowMajorBit) | @@ -825,12 +832,15 @@ struct block_evaluator::PlainObject> { typedef Block XprType; + typedef typename XprType::Scalar Scalar; EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) : mapbase_evaluator(block) { + // TODO: should check for smaller packet types once we can handle multi-sized packet types + const int AlignBytes = int(packet_traits::size) * sizeof(Scalar); // FIXME this should be an internal assertion - eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned"); } }; diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 9186f59a7..522aaa299 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -34,14 +34,35 @@ void check_static_allocation_size() #endif } +template::type, + bool Match = bool((Size%unpacket_traits::size)==0), + bool TryHalf = bool(unpacket_traits::size > Size) + && bool(unpacket_traits::size > unpacket_traits::half>::size) > +struct compute_default_alignment +{ + enum { value = 0 }; +}; + +template +struct compute_default_alignment // Match +{ + enum { value = sizeof(T) * unpacket_traits::size }; +}; + +template +struct compute_default_alignment +{ + // current packet too large, try with an half-packet + enum { value = compute_default_alignment::half>::value }; +}; + /** \internal * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned: * to 16 bytes boundary if the total size is a multiple of 16 bytes. */ template + : compute_default_alignment::value > struct plain_array { T array[Size]; @@ -81,14 +102,71 @@ struct plain_array #endif template -struct plain_array +struct plain_array { - EIGEN_USER_ALIGN_DEFAULT T array[Size]; + EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; EIGEN_DEVICE_FUNC plain_array() { - EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1); + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size(); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index aaea9f035..6b294e77f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -318,6 +318,9 @@ // Defined the boundary (in bytes) on which the data needs to be aligned. 
Note // that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be // aligned at all regardless of the value of this #define. +// TODO should be renamed EIGEN_MAXIMAL_ALIGN_BYTES, +// for instance with AVX 1 EIGEN_MAXIMAL_ALIGN_BYTES=32 while for 'int' 16 bytes alignment is always enough, +// and 16 bytes alignment is also enough for Vector4f. #define EIGEN_ALIGN_BYTES 16 #ifdef EIGEN_DONT_ALIGN diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 528ebe297..562f425bd 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -159,13 +159,16 @@ class compute_matrix_evaluator_flags enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, + + // TODO: should check for smaller packet types once we can handle multi-sized packet types + align_bytes = int(packet_traits::size) * sizeof(Scalar), aligned_bit = ( ((Options&DontAlign)==0) && ( #if EIGEN_ALIGN_STATICALLY - ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % align_bytes) == 0)) #else 0 #endif diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index d8815263a..6f7b72167 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -81,7 +81,7 @@ void construct_at_boundary(int boundary) void unalignedassert() { - #if EIGEN_ALIGN_STATICALLY +#if EIGEN_ALIGN_STATICALLY construct_at_boundary(4); construct_at_boundary(4); construct_at_boundary(16); @@ -100,7 +100,7 @@ void unalignedassert() construct_at_boundary(4); construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); - #endif +#endif check_unalignedassert_good(); check_unalignedassert_good(); @@ -112,11 +112,12 @@ void unalignedassert() check_unalignedassert_good >(); #if EIGEN_ALIGN_STATICALLY - if(EIGEN_ALIGN_BYTES==16) + if(EIGEN_ALIGN_BYTES>=16) { VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); } for(int b=8; b::Vectori >(DefaultTraversal,CompleteUnrolling))); VERIFY((test_assign(Matrix11(), Matrix()*Matrix(), - PacketSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD?DefaultTraversal:InnerVectorizedTraversal, CompleteUnrolling))); + InnerVectorizedTraversal, CompleteUnrolling))); #endif VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3), From 488c15615a31fcee310200d726d8e58bb005bc87 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 13 Mar 2015 14:51:26 -0700 Subject: [PATCH 55/83] organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) --- .../Core/products/GeneralBlockPanelKernel.h | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 408281c82..1b47f1a6d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -25,21 +25,31 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? 
b : a; } +#if EIGEN_ARCH_i386_OR_x86_64 +const std::ptrdiff_t defaultL1CacheSize = 32*1024; +const std::ptrdiff_t defaultL2CacheSize = 256*1024; +const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; +#else +const std::ptrdiff_t defaultL1CacheSize = 16*1024; +const std::ptrdiff_t defaultL2CacheSize = 512*1024; +const std::ptrdiff_t defaultL3CacheSize = 512*1024; +#endif + /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { static bool m_cache_sizes_initialized = false; - static std::ptrdiff_t m_l1CacheSize = 32*1024; - static std::ptrdiff_t m_l2CacheSize = 256*1024; - static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + static std::ptrdiff_t m_l1CacheSize = 0; + static std::ptrdiff_t m_l2CacheSize = 0; + static std::ptrdiff_t m_l3CacheSize = 0; if(!m_cache_sizes_initialized) { int l1CacheSize, l2CacheSize, l3CacheSize; queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); - m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); - m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); - m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize); m_cache_sizes_initialized = true; } @@ -974,7 +984,7 @@ void gebp_kernel(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); #else @@ -1211,7 +1221,7 @@ void gebp_kernel=2*Traits::LhsProgress) { - const Index l1 = 32*1024; // in Bytes, TODO, l1 should be passed to this function. + const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); #else From b6b88c08082dcfc5dd81c6997d6090507267cc13 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 13 Mar 2015 14:57:05 -0700 Subject: [PATCH 56/83] update perf_monitoring/gemm/changesets.txt --- bench/perf_monitoring/gemm/changesets.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index a5b63bc89..40a71c781 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -41,3 +41,5 @@ before-evaluators 6981:7e5d6f78da59 # dynamic loop swapping 6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache 6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 
+7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) + From e56aabf205a1e8f581dd8a46d7d46ce79c45e158 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:05:12 -0400 Subject: [PATCH 57/83] Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables --- Eigen/Core | 1 + Eigen/src/Core/MathFunctions.h | 36 +++---- .../Core/products/GeneralBlockPanelKernel.h | 100 ++++++++++++------ .../Core/products/LookupBlockingSizesTable.h | 89 ++++++++++++++++ Eigen/src/Core/util/ForwardDeclarations.h | 8 ++ 5 files changed, 182 insertions(+), 52 deletions(-) create mode 100644 Eigen/src/Core/products/LookupBlockingSizesTable.h diff --git a/Eigen/Core b/Eigen/Core index 0b8eaa61c..138c34916 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -381,6 +381,7 @@ using std::ptrdiff_t; #include "src/Core/Inverse.h" #include "src/Core/TriangularMatrix.h" #include "src/Core/SelfAdjointView.h" +#include "src/Core/products/LookupBlockingSizesTable.h" #include "src/Core/products/GeneralBlockPanelKernel.h" #include "src/Core/products/Parallelizer.h" #include "src/Core/ProductEvaluators.h" diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 3c76a58b9..0fde5c71e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -473,48 +473,48 @@ struct random_default_impl }; enum { - floor_log2_terminate, - floor_log2_move_up, - floor_log2_move_down, - floor_log2_bogus + meta_floor_log2_terminate, + meta_floor_log2_move_up, + meta_floor_log2_move_down, + meta_floor_log2_bogus }; -template struct floor_log2_selector +template struct meta_floor_log2_selector { enum { middle = (lower + upper) / 2, - value = (upper <= lower + 1) ? int(floor_log2_terminate) - : (n < (1 << middle)) ? int(floor_log2_move_down) - : (n==0) ? int(floor_log2_bogus) - : int(floor_log2_move_up) + value = (upper <= lower + 1) ? int(meta_floor_log2_terminate) + : (n < (1 << middle)) ? int(meta_floor_log2_move_down) + : (n==0) ? int(meta_floor_log2_bogus) + : int(meta_floor_log2_move_up) }; }; template::value> -struct floor_log2 {}; + int selector = meta_floor_log2_selector::value> +struct meta_floor_log2 {}; template -struct floor_log2 +struct meta_floor_log2 { - enum { value = floor_log2::middle>::value }; + enum { value = meta_floor_log2::middle>::value }; }; template -struct floor_log2 +struct meta_floor_log2 { - enum { value = floor_log2::middle, upper>::value }; + enum { value = meta_floor_log2::middle, upper>::value }; }; template -struct floor_log2 +struct meta_floor_log2 { enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower }; }; template -struct floor_log2 +struct meta_floor_log2 { // no value, error at compile time }; @@ -551,7 +551,7 @@ struct random_default_impl #ifdef EIGEN_MAKING_DOCS return run(Scalar(NumTraits::IsSigned ? -10 : 0), Scalar(10)); #else - enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value, + enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value, scalar_bits = sizeof(Scalar) * CHAR_BIT, shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)), offset = NumTraits::IsSigned ? 
(1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0 diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1b47f1a6d..617439ff6 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -74,45 +74,23 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff } } -/** \brief Computes the blocking parameters for a m x k times k x n matrix product - * - * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. - * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. - * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. - * - * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, - * this function computes the blocking size parameters along the respective dimensions - * for matrix products and related algorithms. The blocking sizes depends on various - * parameters: - * - the L1 and L2 cache sizes, - * - the register level blocking sizes defined by gebp_traits, - * - the number of scalars that fit into a packet (when vectorization is enabled). - * - * \sa setCpuCacheSizes */ +/* Helper for computeProductBlockingSizes. + * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. The blocking sizes depends on various + * parameters: + * - the L1 and L2 cache sizes, + * - the register level blocking sizes defined by gebp_traits, + * - the number of scalars that fit into a packet (when vectorization is enabled). + * + * \sa setCpuCacheSizes */ template -void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) { typedef gebp_traits Traits; -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - EIGEN_UNUSED_VARIABLE(num_threads); - enum { - kr = 8, - mr = Traits::mr, - nr = Traits::nr - }; - k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - if (k > kr) k -= k % kr; - m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - if (m > mr) m -= m % mr; - n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); - if (n > nr) n -= n % nr; - return; - } -#endif - // Explanations: // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed @@ -281,6 +259,60 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads } } +inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) +{ +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { + k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + return true; + } +#else + EIGEN_UNUSED_VARIABLE(k) + EIGEN_UNUSED_VARIABLE(m) + EIGEN_UNUSED_VARIABLE(n) + return false; +#endif +} + +/** \brief Computes the blocking parameters for a m x k times k x n matrix product + * + * \param[in,out] k Input: the third dimension of the product. 
Output: the blocking size along the same dimension. + * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. + * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. + * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. + * + * The blocking size parameters may be evaluated: + * - either by a heuristic based on cache sizes; + * - or using a precomputed lookup table; + * - or using fixed prescribed values (for testing purposes). + * + * \sa setCpuCacheSizes */ + +template +void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + if (!useSpecificBlockingSizes(k, m, n)) { + if (!lookupBlockingSizesFromTable(k, m, n, num_threads)) { + evaluateProductBlockingSizesHeuristic(k, m, n, num_threads); + } + } + + typedef gebp_traits Traits; + enum { + kr = 8, + mr = Traits::mr, + nr = Traits::nr + }; + if (k > kr) k -= k % kr; + if (m > mr) m -= m % mr; + if (n > nr) n -= n % nr; +} + template inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h new file mode 100644 index 000000000..85aeedec8 --- /dev/null +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -0,0 +1,89 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H +#define EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H + +namespace Eigen { + +namespace internal { + +template ::NumSizes != 0 > +struct LookupBlockingSizesFromTableImpl +{ + static bool run(Index&, Index&, Index&, Index) + { + return false; + } +}; + +inline uint8_t floor_log2_helper(uint16_t& x, size_t offset) +{ + uint16_t y = x >> offset; + if (y) { + x = y; + return offset; + } else { + return 0; + } +} + +inline uint8_t floor_log2(uint16_t x) +{ + return floor_log2_helper(x, 8) + + floor_log2_helper(x, 4) + + floor_log2_helper(x, 2) + + floor_log2_helper(x, 1); +} + +inline uint8_t ceil_log2(uint16_t x) +{ + return x > 1 ? 
floor_log2(x - 1) + 1 : 0; +} + +template +struct LookupBlockingSizesFromTableImpl +{ + static bool run(Index& k, Index& m, Index& n, Index) + { + using std::min; + using std::max; + typedef BlockingSizesLookupTable Table; + const uint16_t minsize = Table::BaseSize; + const uint16_t maxsize = minsize << (Table::NumSizes + 1); + const uint16_t k_clamped = max(minsize, min(k, maxsize)); + const uint16_t m_clamped = max(minsize, min(m, maxsize)); + const uint16_t n_clamped = max(minsize, min(n, maxsize)); + const size_t k_index = ceil_log2(k_clamped / minsize); + const size_t m_index = ceil_log2(m_clamped / minsize); + const size_t n_index = ceil_log2(n_clamped / minsize); + const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index); + const uint16_t table_entry = Table::Data()[index]; + k = min(k, 1 << ((table_entry & 0xf00) >> 8)); + m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); + n = min(n, 1 << ((table_entry & 0x00f) >> 0)); + return true; + } +}; + +template +bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads) +{ + return LookupBlockingSizesFromTableImpl::run(k, m, n, num_threads); +} + +} + +} + +#endif // EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index c23892c50..8034f9b5e 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -287,6 +287,14 @@ struct stem_function typedef std::complex::Real> ComplexScalar; typedef ComplexScalar type(ComplexScalar, int); }; + +template +struct BlockingSizesLookupTable +{ + static const size_t NumSizes = 0; +}; + } } // end namespace Eigen From ca5c12587b6e51be7f401c2878800d5d49f615d8 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:05:53 -0400 Subject: [PATCH 58/83] Polish lookup tables generation --- bench/analyze-blocking-sizes.cpp | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index c133df599..3d57f4bb3 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -347,13 +347,16 @@ void dump_table_for_subset( abort(); } if (only_cubic_sizes) { - cout << "/* Warning: generated with --only-cubic-sizes ! */" << endl; + cerr << "Can't generate tables with --only-cubic-sizes." 
<< endl; + abort(); } - cout << "struct optimal_block_sizes_table {" << endl; - cout << " static const size_t min_size = " << min_product_size.k << ";" << endl; - cout << " static const size_t max_size = " << max_product_size.k << ";" << endl; - cout << " static const uint16_t* table() {" << endl; - cout << " static const uint16_t data[] = {"; + cout << "struct LookupTable {" << endl; + cout << " static const size_t BaseSize = " << min_product_size.k << ";" << endl; + const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1; + const size_t TableSize = NumSizes * NumSizes * NumSizes; + cout << " static const size_t NumSizes = " << NumSizes << ";" << endl; + cout << " static const uint16_t* Data() {" << endl; + cout << " static const uint16_t data[" << TableSize << "] = {"; while (entry_index < num_entries) { ++entry_index; if (entry_index == num_entries || @@ -371,18 +374,23 @@ void dump_table_for_subset( best_block_size_this_product_size = first_file.entries[e].block_size; } } - if ((i++) % 8) { - cout << ", "; + if ((i++) % NumSizes) { + cout << " "; } else { cout << endl << " "; } cout << "0x" << hex << best_block_size_this_product_size << dec; if (entry_index < num_entries) { + cout << ","; first_entry_index_with_this_product_size = entry_index; product_size = first_file.entries[entry_index].product_size; } } } + if (i != TableSize) { + cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl; + abort(); + } cout << endl << " };" << endl; cout << " return data;" << endl; cout << " }" << endl; @@ -855,6 +863,11 @@ int main(int argc, char* argv[]) input_filenames.emplace_back(argv[i]); } + if (dump_tables && only_cubic_sizes) { + cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl; + show_usage_and_exit(argc, argv, available_actions); + } + if (!action) { show_usage_and_exit(argc, argv, available_actions); } From 1dd3d89818cbdf0843de3c0b499eedc433af46f8 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:07:19 -0400 Subject: [PATCH 59/83] Fix a unused-var warning --- Eigen/src/Core/CoreEvaluators.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 85f46cb8d..ce00566a5 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -839,6 +839,7 @@ struct block_evaluator::size) * sizeof(Scalar); + EIGEN_ONLY_USED_FOR_DEBUG(AlignBytes) // FIXME this should be an internal assertion eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned"); } From 3589a9c115a892ea3ca5dac74d71a1526764cb38 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:12:18 -0400 Subject: [PATCH 60/83] actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 617439ff6..6aadfea83 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1017,11 +1017,10 @@ void gebp_kernel(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); -#else - const Index actual_panel_rows = (3*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - 
depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ); -#endif for(Index i1=0; i1 Date: Sun, 15 Mar 2015 18:13:12 -0400 Subject: [PATCH 61/83] Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now. --- Eigen/Core | 1 + .../arch/NEON/BlockingSizesLookupTables.h | 110 ++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h diff --git a/Eigen/Core b/Eigen/Core index 138c34916..1a3249604 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -308,6 +308,7 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/PacketMath.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" + #include "src/Core/arch/NEON/BlockingSizesLookupTables.h" #endif #if defined EIGEN_VECTORIZE_CUDA diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h new file mode 100644 index 000000000..c2366a347 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H +#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H + +namespace Eigen { +namespace internal { + +/* The following lookup table was generated from measurements on a Nexus 5, + * which has a Qualcomm Krait 400 CPU. This is very representative of current + * 32bit (ARMv7) Android devices. On the other hand, I don't know how + * representative that is outside of these conditions. Accordingly, + * let's only use this lookup table on ARM 32bit on Android for now. + * + * Measurements were single-threaded, with Scalar=float, compiled with + * -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32. + * + * The device was cooled, allowing it to run a the max clock speed throughout. + * This may not be representative of real-world thermal conditions. + * + * The benchmark attempted to flush caches to test cold-cache performance. 
+ */ +#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID +template<> +struct BlockingSizesLookupTable { + static const size_t BaseSize = 16; + static const size_t NumSizes = 8; + static const uint16_t* Data() { + static const uint16_t data[512] = { + 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447, + 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x457, + 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467, + 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x477, 0x478, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x476, 0x476, 0x476, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x496, + 0x474, 0x475, 0x476, 0x4a6, 0x4a5, 0x4a6, 0x4a5, 0x4a6, + 0x474, 0x475, 0x466, 0x4a6, 0x4a6, 0x4a6, 0x496, 0x4a6, + 0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b, + 0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587, + 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x597, 0x596, + 0x564, 0x565, 0x566, 0x5a5, 0x5a6, 0x5a6, 0x596, 0x596, + 0x564, 0x565, 0x566, 0x596, 0x5a6, 0x596, 0x5a6, 0x5a6, + 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b, + 0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b, + 0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667, + 0x674, 0x665, 0x666, 0x677, 0x678, 0x679, 0x67a, 0x675, + 0x684, 0x685, 0x686, 0x687, 0x678, 0x688, 0x687, 0x686, + 0x664, 0x665, 0x666, 0x657, 0x697, 0x696, 0x696, 0x697, + 0x664, 0x655, 0x686, 0x696, 0x685, 0x6a6, 0x686, 0x686, + 0x684, 0x675, 0x686, 0x685, 0x686, 0x696, 0x696, 0x696, + 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x746, + 0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757, + 0x764, 0x755, 0x756, 0x747, 0x768, 0x759, 0x75a, 0x767, + 0x744, 0x765, 0x766, 0x767, 0x768, 0x759, 0x778, 0x777, + 0x744, 0x745, 0x766, 0x767, 0x788, 0x788, 0x786, 0x788, + 0x754, 0x755, 0x766, 0x787, 0x787, 0x796, 0x787, 0x797, + 0x684, 0x695, 0x696, 0x6a5, 0x786, 0x786, 0x795, 0x796, + 0x684, 0x695, 0x686, 0x6a6, 0x786, 0x796, 0x786, 0x796, + 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x847, 0x848, + 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x857, 0x858, + 0x844, 0x865, 0x846, 0x847, 0x868, 0x849, 0x866, 0x867, + 0x844, 0x865, 0x846, 0x847, 0x878, 0x849, 0x877, 0x877, + 0x844, 0x845, 0x846, 0x867, 0x885, 0x887, 0x885, 0x887, + 0x784, 0x785, 0x786, 0x877, 0x885, 0x885, 0x896, 0x896, + 0x684, 0x695, 0x696, 0x885, 0x896, 0x885, 0x895, 0x895, + 0x694, 0x685, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896, + 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x848, 0x848, + 0x944, 0x855, 0x756, 0x947, 0x858, 0x857, 0x858, 0x858, + 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x866, 0x867, + 0x944, 0x775, 0x976, 0x877, 0x877, 0x878, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x886, 0x986, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x885, 0x897, 0x896, 0x897, + 0x684, 0x695, 0x6a4, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x6a4, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x844, 0x845, 0x846, 0x847, 0x847, 0x847, 0x847, 0x847, + 0x854, 0x855, 0x856, 0x857, 0x857, 0x858, 0x857, 0x857, + 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x867, 0x867, + 0x774, 0x775, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897, + 0x684, 0x6a5, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, + 0x694, 0x6a5, 0x6a5, 0x886, 0xa68, 0x896, 0x896, 0x896, + 0x844, 0x845, 0x846, 0x846, 0x847, 0x945, 0x847, 0x946, + 0x854, 
0xb55, 0x856, 0x857, 0x857, 0x856, 0x857, 0x857, + 0x864, 0x865, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867, + 0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x885, 0x886, 0x886, 0x886, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x786, 0x886, 0x897, 0x897, 0x897, + 0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, + 0x684, 0x6a5, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896 + }; + return data; + } +}; +#endif + +} +} + +#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H From 151b8b95c642dea388c2f7da475e4db432bc6962 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 19:10:51 -0400 Subject: [PATCH 62/83] Fix bug in case where EIGEN_TEST_SPECIFIC_BLOCKING_SIZE is defined but false --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 6aadfea83..bc24a17ac 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -272,8 +272,8 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) EIGEN_UNUSED_VARIABLE(k) EIGEN_UNUSED_VARIABLE(m) EIGEN_UNUSED_VARIABLE(n) - return false; #endif + return false; } /** \brief Computes the blocking parameters for a m x k times k x n matrix product From e274607d7fcb021d0578e2fc95023eb7b5fab133 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 10:48:27 -0400 Subject: [PATCH 63/83] fix compilation with GCC 4.8 --- Eigen/src/Core/products/LookupBlockingSizesTable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index 85aeedec8..f955253f8 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -68,9 +68,9 @@ struct LookupBlockingSizesFromTableImpl const size_t n_index = ceil_log2(n_clamped / minsize); const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index); const uint16_t table_entry = Table::Data()[index]; - k = min(k, 1 << ((table_entry & 0xf00) >> 8)); - m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); - n = min(n, 1 << ((table_entry & 0x00f) >> 0)); + k = min(k, 1 << ((table_entry & 0xf00) >> 8)); + m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); + n = min(n, 1 << ((table_entry & 0x00f) >> 0)); return true; } }; From 35c3a8bb84778a81b2f90fdea10eadeae16863aa Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 11:05:51 -0400 Subject: [PATCH 64/83] Update Nexus 5 lookup table from combining now 2 runs of the benchmark, using the analyze-blocking-sizes partition tool. Gives better worst-case performance. 
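For reading the hex values in these tables: each 16-bit entry packs the base-2 logarithms of the three block sizes in its low nibbles (k, m, n from high to low), which is exactly the encoding consumed by LookupBlockingSizesFromTableImpl::run. A minimal decode sketch (illustrative helper only, not an Eigen function):

    // Illustrative only: unpack one table entry into (k, m, n) block sizes,
    // mirroring the masks and shifts in LookupBlockingSizesFromTableImpl::run.
    inline void decode_blocking_entry(unsigned entry, int& k, int& m, int& n)
    {
      k = 1 << ((entry & 0xf00) >> 8);
      m = 1 << ((entry & 0x0f0) >> 4);
      n = 1 << ((entry & 0x00f) >> 0);
    }
    // e.g. 0x444 -> (16, 16, 16), 0x496 -> (16, 512, 64)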
--- .../arch/NEON/BlockingSizesLookupTables.h | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h index c2366a347..7905bdf83 100644 --- a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h +++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h @@ -35,69 +35,69 @@ struct BlockingSizesLookupTable { static const uint16_t* Data() { static const uint16_t data[512] = { 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447, - 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x457, + 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456, 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467, - 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x477, 0x478, - 0x474, 0x475, 0x476, 0x477, 0x478, 0x476, 0x476, 0x476, - 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x496, - 0x474, 0x475, 0x476, 0x4a6, 0x4a5, 0x4a6, 0x4a5, 0x4a6, - 0x474, 0x475, 0x466, 0x4a6, 0x4a6, 0x4a6, 0x496, 0x4a6, + 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488, + 0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6, + 0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5, 0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b, 0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b, 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b, 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576, 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587, - 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x597, 0x596, - 0x564, 0x565, 0x566, 0x5a5, 0x5a6, 0x5a6, 0x596, 0x596, - 0x564, 0x565, 0x566, 0x596, 0x5a6, 0x596, 0x5a6, 0x5a6, + 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597, + 0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6, + 0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6, 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b, 0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b, 0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667, - 0x674, 0x665, 0x666, 0x677, 0x678, 0x679, 0x67a, 0x675, - 0x684, 0x685, 0x686, 0x687, 0x678, 0x688, 0x687, 0x686, - 0x664, 0x665, 0x666, 0x657, 0x697, 0x696, 0x696, 0x697, - 0x664, 0x655, 0x686, 0x696, 0x685, 0x6a6, 0x686, 0x686, - 0x684, 0x675, 0x686, 0x685, 0x686, 0x696, 0x696, 0x696, - 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x746, + 0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675, + 0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686, + 0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697, + 0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696, + 0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696, + 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747, 0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757, - 0x764, 0x755, 0x756, 0x747, 0x768, 0x759, 0x75a, 0x767, - 0x744, 0x765, 0x766, 0x767, 0x768, 0x759, 0x778, 0x777, - 0x744, 0x745, 0x766, 0x767, 0x788, 0x788, 0x786, 0x788, - 0x754, 0x755, 0x766, 0x787, 0x787, 0x796, 0x787, 0x797, - 0x684, 0x695, 0x696, 0x6a5, 0x786, 0x786, 0x795, 0x796, - 0x684, 0x695, 0x686, 0x6a6, 0x786, 0x796, 0x786, 0x796, - 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x847, 0x848, - 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x857, 0x858, - 0x844, 0x865, 0x846, 0x847, 0x868, 0x849, 0x866, 0x867, - 0x844, 0x865, 0x846, 0x847, 0x878, 0x849, 0x877, 0x877, - 0x844, 0x845, 0x846, 0x867, 0x885, 0x887, 0x885, 0x887, - 0x784, 0x785, 0x786, 0x877, 0x885, 0x885, 
0x896, 0x896, - 0x684, 0x695, 0x696, 0x885, 0x896, 0x885, 0x895, 0x895, - 0x694, 0x685, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896, - 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x848, 0x848, - 0x944, 0x855, 0x756, 0x947, 0x858, 0x857, 0x858, 0x858, - 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x866, 0x867, - 0x944, 0x775, 0x976, 0x877, 0x877, 0x878, 0x877, 0x877, - 0x784, 0x785, 0x886, 0x887, 0x886, 0x986, 0x887, 0x887, - 0x784, 0x785, 0x786, 0x796, 0x885, 0x897, 0x896, 0x897, - 0x684, 0x695, 0x6a4, 0x886, 0x886, 0x896, 0x896, 0x896, - 0x6a4, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896, - 0x844, 0x845, 0x846, 0x847, 0x847, 0x847, 0x847, 0x847, - 0x854, 0x855, 0x856, 0x857, 0x857, 0x858, 0x857, 0x857, - 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x867, 0x867, - 0x774, 0x775, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766, + 0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777, + 0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788, + 0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796, + 0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796, + 0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796, + 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848, + 0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857, + 0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865, + 0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877, + 0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886, + 0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896, + 0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896, + 0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896, + 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848, + 0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858, + 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867, + 0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896, + 0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896, + 0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848, + 0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857, + 0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867, + 0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, 0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887, 0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897, - 0x684, 0x6a5, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, - 0x694, 0x6a5, 0x6a5, 0x886, 0xa68, 0x896, 0x896, 0x896, - 0x844, 0x845, 0x846, 0x846, 0x847, 0x945, 0x847, 0x946, - 0x854, 0xb55, 0x856, 0x857, 0x857, 0x856, 0x857, 0x857, - 0x864, 0x865, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867, + 0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896, + 0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946, + 0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856, + 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867, 0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, - 0x784, 0x885, 0x886, 0x886, 0x886, 0x887, 0x887, 0x887, - 0x784, 0x785, 0x786, 0x786, 0x886, 0x897, 0x897, 0x897, + 0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897, 0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, - 0x684, 0x6a5, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896 + 0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896 }; return data; } From f218c0181d44d7dd129a77108ad6ec063cfbd6cc Mon 
Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 Mar 2015 13:05:00 -0700 Subject: [PATCH 65/83] Fixes the Lvalue computation by actually setting the LvalueBit properly when instantiating tensors of const T. Added a test to check the fix. --- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 8 ++-- unsupported/test/cxx11_tensor_ref.cpp | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index a844a4d68..66ddfd554 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -54,7 +54,7 @@ struct traits > static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, - Flags = compute_tensor_flags::ret | LvalueBit, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit), }; }; @@ -69,7 +69,7 @@ struct traits > static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, - Flags = compute_tensor_flags::ret | LvalueBit, + Flags = compute_tensor_flags::ret | (is_const::value ? 0: LvalueBit), }; }; @@ -86,7 +86,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = Options_, - Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), }; }; @@ -102,7 +102,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, - Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), }; }; diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index aa369f278..c7b5ecddb 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -196,6 +196,45 @@ static void test_coeff_ref() } +static void test_nested_ops_with_ref() +{ + Tensor t(2, 3, 5, 7); + t.setRandom(); + TensorMap > m(t.data(), 2, 3, 5, 7); + array, 4> paddings; + paddings[0] = make_pair(0, 0); + paddings[1] = make_pair(2, 1); + paddings[2] = make_pair(3, 4); + paddings[3] = make_pair(0, 0); + Eigen::DSizes shuffle_dims{0, 1, 2, 3}; + TensorRef > ref(m.pad(paddings)); + array, 4> trivial; + trivial[0] = make_pair(0, 0); + trivial[1] = make_pair(0, 0); + trivial[2] = make_pair(0, 0); + trivial[3] = make_pair(0, 0); + Tensor padded = ref.shuffle(shuffle_dims).pad(trivial); + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(padded(i,j,k,l), t(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); + } + } + } + } + } +} + + void test_cxx11_tensor_ref() { CALL_SUBTEST(test_simple_lvalue_ref()); @@ -205,4 +244,5 @@ void test_cxx11_tensor_ref() CALL_SUBTEST(test_ref_of_ref()); CALL_SUBTEST(test_ref_in_expr()); CALL_SUBTEST(test_coeff_ref()); + CALL_SUBTEST(test_nested_ops_with_ref()); } From eb6929cb191c1b91dda784697faedcfdb245345a Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 16:15:47 -0400 Subject: [PATCH 66/83] fix bug in maxsize calculation, which 
would cause products of size > 2048 to address the lookup table out of bounds --- Eigen/src/Core/products/LookupBlockingSizesTable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index f955253f8..3c8aba6f8 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -59,7 +59,7 @@ struct LookupBlockingSizesFromTableImpl using std::max; typedef BlockingSizesLookupTable Table; const uint16_t minsize = Table::BaseSize; - const uint16_t maxsize = minsize << (Table::NumSizes + 1); + const uint16_t maxsize = minsize << (Table::NumSizes - 1); const uint16_t k_clamped = max(minsize, min(k, maxsize)); const uint16_t m_clamped = max(minsize, min(m, maxsize)); const uint16_t n_clamped = max(minsize, min(n, maxsize)); From 0fd6d52724555d70f663c7ec56db6419e95db6cc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 Mar 2015 13:16:12 -0700 Subject: [PATCH 67/83] Fixed compilation error with clang --- unsupported/test/cxx11_tensor_ref.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index c7b5ecddb..59530fe43 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -201,18 +201,18 @@ static void test_nested_ops_with_ref() Tensor t(2, 3, 5, 7); t.setRandom(); TensorMap > m(t.data(), 2, 3, 5, 7); - array, 4> paddings; - paddings[0] = make_pair(0, 0); - paddings[1] = make_pair(2, 1); - paddings[2] = make_pair(3, 4); - paddings[3] = make_pair(0, 0); - Eigen::DSizes shuffle_dims{0, 1, 2, 3}; + array, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + DSizes shuffle_dims{0, 1, 2, 3}; TensorRef > ref(m.pad(paddings)); - array, 4> trivial; - trivial[0] = make_pair(0, 0); - trivial[1] = make_pair(0, 0); - trivial[2] = make_pair(0, 0); - trivial[3] = make_pair(0, 0); + array, 4> trivial; + trivial[0] = std::make_pair(0, 0); + trivial[1] = std::make_pair(0, 0); + trivial[2] = std::make_pair(0, 0); + trivial[3] = std::make_pair(0, 0); Tensor padded = ref.shuffle(shuffle_dims).pad(trivial); VERIFY_IS_EQUAL(padded.dimension(0), 2+0); VERIFY_IS_EQUAL(padded.dimension(1), 3+3); From 5144f66728fb156402799682cfd926faf0e98fb4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 Mar 2015 13:17:52 -0700 Subject: [PATCH 68/83] Fixed compilation warning --- unsupported/test/cxx11_tensor_ref.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index 59530fe43..c8f105e3d 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -206,7 +206,7 @@ static void test_nested_ops_with_ref() paddings[1] = std::make_pair(2, 1); paddings[2] = std::make_pair(3, 4); paddings[3] = std::make_pair(0, 0); - DSizes shuffle_dims{0, 1, 2, 3}; + DSizes shuffle_dims(0, 1, 2, 3); TensorRef > ref(m.pad(paddings)); array, 4> trivial; trivial[0] = std::make_pair(0, 0); From 577056aa9400459f1a1bad9e423857bc1763f18f Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 16:21:50 -0400 Subject: [PATCH 69/83] Include stdint.h. Not going for cstdint because it is a C++11 addition. Needed for uint16_t at least, in lookup-table code. 
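For reference, a sketch of how these uint16_t sizes are turned into a table index (assuming the NEON table's BaseSize = 16 and NumSizes = 8):

    // Per-dimension index, mirroring the clamp + ceil_log2 logic in
    // LookupBlockingSizesFromTableImpl::run:
    //   index_1d(size) = ceil_log2(clamp(size, 16, 2048) / 16)   // 0 .. 7
    //   flat index     = n_idx + 8 * (m_idx + 8 * k_idx)         // 0 .. 511
    // With the corrected maxsize = 16 << (NumSizes - 1) = 2048 the flat index
    // stays within data[512]; the previous 16 << (NumSizes + 1) = 8192 allowed
    // per-dimension indices up to 9, hence the out-of-bounds reads fixed above.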
--- Eigen/Core | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/Core b/Eigen/Core index 1a3249604..80842a1de 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -61,6 +61,7 @@ #pragma GCC optimize ("-fno-ipa-cp-clone") #endif +#include #include // this include file manages BLAS and MKL related macros From 364cfd529d853545e6b0a7404fe303cda2de2366 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 16:28:44 -0400 Subject: [PATCH 70/83] Similar to cset 3589a9c115a892ea3ca5dac74d71a1526764cb38 , also in 2px4 kernel: actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bc24a17ac..d32377a00 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1253,11 +1253,11 @@ void gebp_kernel=2*Traits::LhsProgress) { const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size + // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess), + // or because we are testing specific blocking sizes. Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); -#else - Index actual_panel_rows = (2*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ); -#endif + for(Index i1=peeled_mc3; i1 Date: Tue, 17 Mar 2015 10:31:45 -0400 Subject: [PATCH 71/83] use unsigned short instead of uint16_t which doesn't exist in c++98 --- Eigen/Core | 1 - .../arch/NEON/BlockingSizesLookupTables.h | 4 ++-- .../Core/products/LookupBlockingSizesTable.h | 20 +++++++++---------- bench/analyze-blocking-sizes.cpp | 4 ++-- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 80842a1de..1a3249604 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -61,7 +61,6 @@ #pragma GCC optimize ("-fno-ipa-cp-clone") #endif -#include #include // this include file manages BLAS and MKL related macros diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h index 7905bdf83..5007c155d 100644 --- a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h +++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h @@ -32,8 +32,8 @@ template<> struct BlockingSizesLookupTable { static const size_t BaseSize = 16; static const size_t NumSizes = 8; - static const uint16_t* Data() { - static const uint16_t data[512] = { + static const unsigned short* Data() { + static const unsigned short data[512] = { 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447, 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456, 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467, diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index 3c8aba6f8..5ab4525df 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -25,9 +25,9 @@ struct 
LookupBlockingSizesFromTableImpl } }; -inline uint8_t floor_log2_helper(uint16_t& x, size_t offset) +inline size_t floor_log2_helper(unsigned short& x, size_t offset) { - uint16_t y = x >> offset; + unsigned short y = x >> offset; if (y) { x = y; return offset; @@ -36,7 +36,7 @@ inline uint8_t floor_log2_helper(uint16_t& x, size_t offset) } } -inline uint8_t floor_log2(uint16_t x) +inline size_t floor_log2(unsigned short x) { return floor_log2_helper(x, 8) + floor_log2_helper(x, 4) @@ -44,7 +44,7 @@ inline uint8_t floor_log2(uint16_t x) + floor_log2_helper(x, 1); } -inline uint8_t ceil_log2(uint16_t x) +inline size_t ceil_log2(unsigned short x) { return x > 1 ? floor_log2(x - 1) + 1 : 0; } @@ -58,16 +58,16 @@ struct LookupBlockingSizesFromTableImpl using std::min; using std::max; typedef BlockingSizesLookupTable Table; - const uint16_t minsize = Table::BaseSize; - const uint16_t maxsize = minsize << (Table::NumSizes - 1); - const uint16_t k_clamped = max(minsize, min(k, maxsize)); - const uint16_t m_clamped = max(minsize, min(m, maxsize)); - const uint16_t n_clamped = max(minsize, min(n, maxsize)); + const unsigned short minsize = Table::BaseSize; + const unsigned short maxsize = minsize << (Table::NumSizes - 1); + const unsigned short k_clamped = max(minsize, min(k, maxsize)); + const unsigned short m_clamped = max(minsize, min(m, maxsize)); + const unsigned short n_clamped = max(minsize, min(n, maxsize)); const size_t k_index = ceil_log2(k_clamped / minsize); const size_t m_index = ceil_log2(m_clamped / minsize); const size_t n_index = ceil_log2(n_clamped / minsize); const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index); - const uint16_t table_entry = Table::Data()[index]; + const unsigned short table_entry = Table::Data()[index]; k = min(k, 1 << ((table_entry & 0xf00) >> 8)); m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); n = min(n, 1 << ((table_entry & 0x00f) >> 0)); diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 3d57f4bb3..d563a1d2d 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -355,8 +355,8 @@ void dump_table_for_subset( const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1; const size_t TableSize = NumSizes * NumSizes * NumSizes; cout << " static const size_t NumSizes = " << NumSizes << ";" << endl; - cout << " static const uint16_t* Data() {" << endl; - cout << " static const uint16_t data[" << TableSize << "] = {"; + cout << " static const unsigned short* Data() {" << endl; + cout << " static const unsigned short data[" << TableSize << "] = {"; while (entry_index < num_entries) { ++entry_index; if (entry_index == num_entries || From cc0f89eb3b07c65efb2b73890e4e7ac83525700a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 17 Mar 2015 09:57:20 -0700 Subject: [PATCH 72/83] Changed the way lvalue operations are declared in TensorBase: this fixes constness isses that prevented some expressions mixing lvalues and rvalues from compiling. 
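A minimal usage sketch of the intended effect (float tensors as in the updated cxx11_tensor_const test; assumes the CXX11 Tensor module header is on the include path):

    #include <unsupported/Eigen/CXX11/Tensor>  // assumption: header location

    void chip_both_ways()
    {
      Eigen::Tensor<float, 3> t(2, 3, 7);
      t.setRandom();
      const Eigen::Tensor<float, 3>& ct = t;
      Eigen::Tensor<float, 2> slice = ct.chip(0, 2); // const overload: rvalue read
      t.chip(0, 2) = slice;                          // non-const overload: lvalue write
    }

The paired declarations give the const use a const return type while keeping the non-const use writable.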
--- .../Eigen/CXX11/src/Tensor/TensorBase.h | 73 ++++++++++++++++--- unsupported/test/cxx11_tensor_const.cpp | 27 ++++++- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 201b0fc9e..86e72c3a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -520,48 +520,101 @@ class TensorBase : public TensorBase + const TensorLayoutSwapOp swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { return TensorLayoutSwapOp(derived()); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp - concatenate(const OtherDerived& other, const Axis& axis) const { - return TensorConcatenationOp(derived(), other.derived(), axis); + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp - reshape(const NewDimensions& newDimensions) const { + reshape(const NewDimensions& newDimensions) { return TensorReshapingOp(derived(), newDimensions); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) const { + slice(const StartIndices& startIndices, const Sizes& sizes) { return TensorSlicingOp(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp + const TensorChippingOp chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { return TensorChippingOp(derived(), offset, DimId); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp - chip(const Index offset, const Index dim) const { + chip(const Index offset, const Index dim) { return TensorChippingOp(derived(), offset, dim); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp - reverse(const ReverseDimensions& rev) const { + reverse(const ReverseDimensions& rev) { return TensorReverseOp(derived(), rev); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp - shuffle(const Shuffle& shuffle) const { + shuffle(const Shuffle& shuffle) { return TensorShufflingOp(derived(), shuffle); } + + template 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp - stride(const Strides& strides) const { + stride(const Strides& strides) { return TensorStridingOp(derived(), strides); } diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp index 0ffb02afd..ad9c9da39 100644 --- a/unsupported/test/cxx11_tensor_const.cpp +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -13,8 +13,6 @@ using Eigen::Tensor; - - static void test_simple_assign() { Tensor random(2,3,7); @@ -33,7 +31,32 @@ static void test_simple_assign() } } + +static void test_assign_of_const_tensor() +{ + Tensor random(2,3,7); + random.setRandom(); + + TensorMap > constant1(random.data(), 2, 3, 7); + TensorMap > constant2(random.data(), 2, 3, 7); + const TensorMap > constant3(random.data(), 2, 3, 7); + + Tensor result1 = constant1.chip(0, 2); + Tensor result2 = constant2.chip(0, 2); + Tensor result3 = constant3.chip(0, 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL((result1(i,j)), random(i,j,0)); + VERIFY_IS_EQUAL((result2(i,j)), random(i,j,0)); + VERIFY_IS_EQUAL((result3(i,j)), random(i,j,0)); + } + } +} + + void test_cxx11_tensor_const() { CALL_SUBTEST(test_simple_assign()); + CALL_SUBTEST(test_assign_of_const_tensor()); } From f329d0908af35fd17bdc4dfeb87046dcaa6e6937 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 15:10:36 +0100 Subject: [PATCH 73/83] Improve random number generation for integer and add unit test --- Eigen/src/Core/MathFunctions.h | 32 ++++++------- test/CMakeLists.txt | 1 + test/rand.cpp | 88 ++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 18 deletions(-) create mode 100644 test/rand.cpp diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 0fde5c71e..e1b233d82 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -522,28 +522,24 @@ struct meta_floor_log2 template struct random_default_impl { - typedef typename NumTraits::NonInteger NonInteger; - static inline Scalar run(const Scalar& x, const Scalar& y) { using std::max; - Scalar range = (max)(Scalar(0),Scalar(y-x)); - Scalar offset = 0; - if(range<=RAND_MAX) - { - // rejection sampling - int divisor = RAND_MAX/(range+1); + using std::min; + typedef typename conditional::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; + if(y range); - } - else - { - offset = std::rand() * range; - } - - return x + offset; + do { + offset = ( (std::size_t(std::rand()) * multiplier) / divisor ); + } while (offset > range); + + return Scalar(ScalarX(x) + offset); } static inline Scalar run() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1712b8718..734a0eb9b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -139,6 +139,7 @@ endif(TEST_LIB) set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official") add_custom_target(BuildOfficial) +ei_add_test(rand) ei_add_test(meta) ei_add_test(sizeof) ei_add_test(dynalloc) diff --git a/test/rand.cpp b/test/rand.cpp new file mode 100644 index 000000000..4e090cbad --- /dev/null +++ b/test/rand.cpp @@ -0,0 +1,88 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +template Scalar check_in_range(Scalar x, Scalar y) +{ + Scalar r = internal::random(x,y); + VERIFY(r>=x); + if(y>=x) + { + VERIFY(r<=y); + } + return r; +} + +template void check_all_in_range(Scalar x, Scalar y) +{ + Array mask(y-x+1); + mask.fill(0); + long n = (y-x+1)*32; + for(long k=0; k0).all() ); +} + +void test_rand() +{ + for(int i = 0; i < g_repeat*10; i++) { + CALL_SUBTEST(check_in_range(10,11)); + CALL_SUBTEST(check_in_range(1.24234523,1.24234523)); + CALL_SUBTEST(check_in_range(-1,1)); + CALL_SUBTEST(check_in_range(-1432.2352,-1432.2352)); + + CALL_SUBTEST(check_in_range(10,11)); + CALL_SUBTEST(check_in_range(1.24234523,1.24234523)); + CALL_SUBTEST(check_in_range(-1,1)); + CALL_SUBTEST(check_in_range(-1432.2352,-1432.2352)); + + + CALL_SUBTEST(check_in_range(0,-1)); + CALL_SUBTEST(check_in_range(0,-1)); + CALL_SUBTEST(check_in_range(0,-1)); + CALL_SUBTEST(check_in_range(-673456,673456)); + CALL_SUBTEST(check_in_range(-24345,24345)); + CALL_SUBTEST(check_in_range(-6734565664234,6734565664234)); + } + + char char_offset = (std::min)(g_repeat,64); + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+char_offset)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-char_offset,-11)); + CALL_SUBTEST(check_all_in_range(-126,-126+char_offset)); + CALL_SUBTEST(check_all_in_range(126-char_offset,126)); + CALL_SUBTEST(check_all_in_range(-126,126)); + + char short_offset = (std::min)(g_repeat,16000); + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+short_offset)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-short_offset,-11)); + CALL_SUBTEST(check_all_in_range(-24345,-24345+short_offset)); + CALL_SUBTEST(check_all_in_range(24345,24345+short_offset)); + + + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+g_repeat)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-g_repeat,-11)); + CALL_SUBTEST(check_all_in_range(-673456,-673456+g_repeat)); + CALL_SUBTEST(check_all_in_range(673456,673456+g_repeat)); + + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+g_repeat)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-g_repeat,-11)); + CALL_SUBTEST(check_all_in_range(-6734565664234,-6734565664234+g_repeat)); + CALL_SUBTEST(check_all_in_range(6734565664234,6734565664234+g_repeat)); +} From d7698c18b7801f041c36adffcdfaefc99140887f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 15:11:05 +0100 Subject: [PATCH 74/83] Split sparse_basic unit test --- test/CMakeLists.txt | 1 + test/sparse_basic.cpp | 204 +-------------------------------- test/sparse_block.cpp | 254 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+), 200 deletions(-) create mode 100644 test/sparse_block.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 734a0eb9b..393c35b57 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -227,6 +227,7 @@ ei_add_test(stdvector_overload) ei_add_test(stdlist) ei_add_test(stddeque) ei_add_test(sparse_basic) +ei_add_test(sparse_block) ei_add_test(sparse_vector) ei_add_test(sparse_product) ei_add_test(sparse_ref) diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index d929e1463..75f29a2b4 100644 --- a/test/sparse_basic.cpp +++ 
b/test/sparse_basic.cpp @@ -30,7 +30,6 @@ template void sparse_basic(const SparseMatrixType& re double density = (std::max)(8./(rows*cols), 0.01); typedef Matrix DenseMatrix; typedef Matrix DenseVector; - typedef Matrix RowDenseVector; Scalar eps = 1e-6; Scalar s1 = internal::random(); @@ -59,77 +58,6 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m, refMat); - // test InnerIterators and Block expressions - for (int t=0; t<10; ++t) - { - Index j = internal::random(0,cols-2); - Index i = internal::random(0,rows-2); - Index w = internal::random(1,cols-j); - Index h = internal::random(1,rows-i); - - VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); - for(Index c=0; c void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m2,m1); } - // test innerVector() - { - DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); - SparseMatrixType m2(rows, cols); - initSparse(density, refMat2, m2); - Index j0 = internal::random(0,outer-1); - Index j1 = internal::random(0,outer-1); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0)); - else - VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0)); - - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1)); - else - VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1)); - - SparseMatrixType m3(rows,cols); - m3.reserve(VectorXi::Constant(outer,int(inner/2))); - for(Index j=0; j0) - VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); - } - m3.makeCompressed(); - for(Index j=0; j<(std::min)(outer, inner); ++j) - { - VERIFY(j==numext::real(m3.innerVector(j).nonZeros())); - if(j>0) - VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); - } - - VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros()); - -// m2.innerVector(j0) = 2*m2.innerVector(j1); -// refMat2.col(j0) = 2*refMat2.col(j1); -// VERIFY_IS_APPROX(m2, refMat2); - } - - // test innerVectors() - { - DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); - SparseMatrixType m2(rows, cols); - initSparse(density, refMat2, m2); - if(internal::random(0,1)>0.5) m2.makeCompressed(); - Index j0 = internal::random(0,outer-2); - Index j1 = internal::random(0,outer-2); - Index n0 = internal::random(1,outer-(std::max)(j0,j1)); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols)); - else - VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0)); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), - refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0)); - else - VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), - refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); - - VERIFY_IS_APPROX(m2, refMat2); - - VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros()); - - m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0); - if(SparseMatrixType::IsRowMajor) - refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval(); - else - refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval(); - - VERIFY_IS_APPROX(m2, refMat2); - } - // test basic computations { DenseMatrix refM1 = DenseMatrix::Zero(rows, cols); @@ -360,54 +212,6 @@ template void sparse_basic(const SparseMatrixType& re VERIFY(m2.isApprox(m3)); } - - - // test generic blocks - { - DenseMatrix refMat2 = 
DenseMatrix::Zero(rows, cols); - SparseMatrixType m2(rows, cols); - initSparse(density, refMat2, m2); - Index j0 = internal::random(0,outer-2); - Index j1 = internal::random(0,outer-2); - Index n0 = internal::random(1,outer-(std::max)(j0,j1)); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols)); - else - VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0)); - - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols), - refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols)); - else - VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0), - refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); - - Index i = internal::random(0,m2.outerSize()-1); - if(SparseMatrixType::IsRowMajor) { - m2.innerVector(i) = m2.innerVector(i) * s1; - refMat2.row(i) = refMat2.row(i) * s1; - VERIFY_IS_APPROX(m2,refMat2); - } else { - m2.innerVector(i) = m2.innerVector(i) * s1; - refMat2.col(i) = refMat2.col(i) * s1; - VERIFY_IS_APPROX(m2,refMat2); - } - - Index r0 = internal::random(0,rows-2); - Index c0 = internal::random(0,cols-2); - Index r1 = internal::random(1,rows-r0); - Index c1 = internal::random(1,cols-c0); - - VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0)); - VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0)); - - VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0)); - VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0)); - - VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1)); - VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1)); - } - // test prune { SparseMatrixType m2(rows, cols); @@ -646,8 +450,8 @@ void test_sparse_basic() CALL_SUBTEST_2(( sparse_basic(SparseMatrix, ColMajor>(r, c)) )); CALL_SUBTEST_2(( sparse_basic(SparseMatrix, RowMajor>(r, c)) )); CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); + CALL_SUBTEST_5(( sparse_basic(SparseMatrix(r, c)) )); + CALL_SUBTEST_5(( sparse_basic(SparseMatrix(r, c)) )); r = Eigen::internal::random(1,100); c = Eigen::internal::random(1,100); @@ -655,8 +459,8 @@ void test_sparse_basic() r = c; // check square matrices in 25% of tries } - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); + CALL_SUBTEST_6(( sparse_basic(SparseMatrix(short(r), short(c))) )); + CALL_SUBTEST_6(( sparse_basic(SparseMatrix(short(r), short(c))) )); } // Regression test for bug 900: (manually insert higher values here, if you have enough RAM): diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp new file mode 100644 index 000000000..8a6e0687c --- /dev/null +++ b/test/sparse_block.cpp @@ -0,0 +1,254 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "sparse.h" + +template void sparse_block(const SparseMatrixType& ref) +{ + const Index rows = ref.rows(); + const Index cols = ref.cols(); + const Index inner = ref.innerSize(); + const Index outer = ref.outerSize(); + + typedef typename SparseMatrixType::Scalar Scalar; + + double density = (std::max)(8./(rows*cols), 0.01); + typedef Matrix DenseMatrix; + typedef Matrix DenseVector; + typedef Matrix RowDenseVector; + + Scalar s1 = internal::random(); + { + SparseMatrixType m(rows, cols); + DenseMatrix refMat = DenseMatrix::Zero(rows, cols); + initSparse(density, refMat, m); + + VERIFY_IS_APPROX(m, refMat); + + // test InnerIterators and Block expressions + for (int t=0; t<10; ++t) + { + Index j = internal::random(0,cols-2); + Index i = internal::random(0,rows-2); + Index w = internal::random(1,cols-j); + Index h = internal::random(1,rows-i); + + VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); + for(Index c=0; c(density, refMat2, m2); + Index j0 = internal::random(0,outer-1); + Index j1 = internal::random(0,outer-1); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0)); + else + VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0)); + + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1)); + else + VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1)); + + SparseMatrixType m3(rows,cols); + m3.reserve(VectorXi::Constant(outer,int(inner/2))); + for(Index j=0; j0) + VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); + } + m3.makeCompressed(); + for(Index j=0; j<(std::min)(outer, inner); ++j) + { + VERIFY(j==numext::real(m3.innerVector(j).nonZeros())); + if(j>0) + VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); + } + + VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros()); + +// m2.innerVector(j0) = 2*m2.innerVector(j1); +// refMat2.col(j0) = 2*refMat2.col(j1); +// VERIFY_IS_APPROX(m2, refMat2); + } + + // test innerVectors() + { + DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); + SparseMatrixType m2(rows, cols); + initSparse(density, refMat2, m2); + if(internal::random(0,1)>0.5) m2.makeCompressed(); + Index j0 = internal::random(0,outer-2); + Index j1 = internal::random(0,outer-2); + Index n0 = internal::random(1,outer-(std::max)(j0,j1)); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols)); + else + VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0)); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), + refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0)); + else + VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), + refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); + + VERIFY_IS_APPROX(m2, refMat2); + + VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros()); + + m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0); + if(SparseMatrixType::IsRowMajor) + refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval(); + else + refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval(); + + VERIFY_IS_APPROX(m2, refMat2); + } + + // test generic blocks + { + DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); + SparseMatrixType m2(rows, cols); + initSparse(density, refMat2, m2); + Index j0 = internal::random(0,outer-2); + Index j1 = 
internal::random(0,outer-2); + Index n0 = internal::random(1,outer-(std::max)(j0,j1)); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols)); + else + VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0)); + + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols), + refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols)); + else + VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0), + refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); + + Index i = internal::random(0,m2.outerSize()-1); + if(SparseMatrixType::IsRowMajor) { + m2.innerVector(i) = m2.innerVector(i) * s1; + refMat2.row(i) = refMat2.row(i) * s1; + VERIFY_IS_APPROX(m2,refMat2); + } else { + m2.innerVector(i) = m2.innerVector(i) * s1; + refMat2.col(i) = refMat2.col(i) * s1; + VERIFY_IS_APPROX(m2,refMat2); + } + + Index r0 = internal::random(0,rows-2); + Index c0 = internal::random(0,cols-2); + Index r1 = internal::random(1,rows-r0); + Index c1 = internal::random(1,cols-c0); + + VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0)); + VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0)); + + VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0)); + VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0)); + + VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1)); + VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1)); + } +} + +void test_sparse_block() +{ + for(int i = 0; i < g_repeat; i++) { + int r = Eigen::internal::random(1,200), c = Eigen::internal::random(1,200); + if(Eigen::internal::random(0,4) == 0) { + r = c; // check square matrices in 25% of tries + } + EIGEN_UNUSED_VARIABLE(r+c); + CALL_SUBTEST_1(( sparse_block(SparseMatrix(1, 1)) )); + CALL_SUBTEST_1(( sparse_block(SparseMatrix(8, 8)) )); + CALL_SUBTEST_1(( sparse_block(SparseMatrix(r, c)) )); + CALL_SUBTEST_2(( sparse_block(SparseMatrix, ColMajor>(r, c)) )); + CALL_SUBTEST_2(( sparse_block(SparseMatrix, RowMajor>(r, c)) )); + + CALL_SUBTEST_3(( sparse_block(SparseMatrix(r, c)) )); + CALL_SUBTEST_3(( sparse_block(SparseMatrix(r, c)) )); + + r = Eigen::internal::random(1,100); + c = Eigen::internal::random(1,100); + if(Eigen::internal::random(0,4) == 0) { + r = c; // check square matrices in 25% of tries + } + + CALL_SUBTEST_4(( sparse_block(SparseMatrix(short(r), short(c))) )); + CALL_SUBTEST_4(( sparse_block(SparseMatrix(short(r), short(c))) )); + } +} From 61c45d7cfd29a161abefd095107b72c75db76ffc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 17:13:22 +0100 Subject: [PATCH 75/83] Fix comparison warning --- Eigen/src/Core/DenseStorage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 522aaa299..ab41641f4 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -36,8 +36,8 @@ void check_static_allocation_size() template::type, bool Match = bool((Size%unpacket_traits::size)==0), - bool TryHalf = bool(unpacket_traits::size > Size) - && bool(unpacket_traits::size > unpacket_traits::half>::size) > + bool TryHalf = bool(int(unpacket_traits::size) > Size) + && bool(int(unpacket_traits::size) > int(unpacket_traits::half>::size)) > struct compute_default_alignment { enum { value = 0 }; From d6b2f300dbab3f11f6f8bd28558d2b91758ad514 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 17:28:32 +0100 Subject: [PATCH 76/83] Fix MSVC compilation: aligned type must be 
From d6b2f300dbab3f11f6f8bd28558d2b91758ad514 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 19 Mar 2015 17:28:32 +0100
Subject: [PATCH 76/83] Fix MSVC compilation: aligned type must be passed by
 reference

---
 Eigen/src/Core/util/BlasUtil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 9bfa45106..ffeb5ac5f 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -214,7 +214,7 @@ class blas_data_mapper {
   }
 
   template
-  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
+  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
     pscatter(&operator()(i, j), p, m_stride);
   }
 

From 9ee62fdcd51e78a7e5f53c5868680fe0b6b261e4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 19 Mar 2015 21:39:37 +0100
Subject: [PATCH 77/83] Fix random unit test for 32bits systems.

---
 test/rand.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/rand.cpp b/test/rand.cpp
index 4e090cbad..7c8068a3b 100644
--- a/test/rand.cpp
+++ b/test/rand.cpp
@@ -34,6 +34,10 @@ template void check_all_in_range(Scalar x, Scalar y)
 
 void test_rand()
 {
+  long long_ref = NumTraits::highest()/10;
+  char char_offset = (std::min)(g_repeat,64);
+  char short_offset = (std::min)(g_repeat,16000);
+
   for(int i = 0; i < g_repeat*10; i++) {
     CALL_SUBTEST(check_in_range(10,11));
     CALL_SUBTEST(check_in_range(1.24234523,1.24234523));
@@ -45,16 +49,14 @@ void test_rand()
     CALL_SUBTEST(check_in_range(-1,1));
     CALL_SUBTEST(check_in_range(-1432.2352,-1432.2352));
 
-    CALL_SUBTEST(check_in_range(0,-1));
     CALL_SUBTEST(check_in_range(0,-1));
     CALL_SUBTEST(check_in_range(0,-1));
     CALL_SUBTEST(check_in_range(-673456,673456));
     CALL_SUBTEST(check_in_range(-24345,24345));
-    CALL_SUBTEST(check_in_range(-6734565664234,6734565664234));
+    CALL_SUBTEST(check_in_range(-long_ref,long_ref));
   }
 
-  char char_offset = (std::min)(g_repeat,64);
   CALL_SUBTEST(check_all_in_range(11,11));
   CALL_SUBTEST(check_all_in_range(11,11+char_offset));
   CALL_SUBTEST(check_all_in_range(-5,5));
   CALL_SUBTEST(check_all_in_range(-11-char_offset,-11));
   CALL_SUBTEST(check_all_in_range(126-char_offset,126));
   CALL_SUBTEST(check_all_in_range(-126,126));
 
-  char short_offset = (std::min)(g_repeat,16000);
   CALL_SUBTEST(check_all_in_range(11,11));
   CALL_SUBTEST(check_all_in_range(11,11+short_offset));
   CALL_SUBTEST(check_all_in_range(-5,5));
   CALL_SUBTEST(check_all_in_range(-24345,-24345+short_offset));
   CALL_SUBTEST(check_all_in_range(24345,24345+short_offset));
 
-  CALL_SUBTEST(check_all_in_range(11,11));
   CALL_SUBTEST(check_all_in_range(11,11+g_repeat));
   CALL_SUBTEST(check_all_in_range(-5,5));
@@ -83,6 +83,6 @@ void test_rand()
   CALL_SUBTEST(check_all_in_range(11,11+g_repeat));
   CALL_SUBTEST(check_all_in_range(-5,5));
   CALL_SUBTEST(check_all_in_range(-11-g_repeat,-11));
-  CALL_SUBTEST(check_all_in_range(-6734565664234,-6734565664234+g_repeat));
-  CALL_SUBTEST(check_all_in_range(6734565664234,6734565664234+g_repeat));
+  CALL_SUBTEST(check_all_in_range(-long_ref,-long_ref+g_repeat));
+  CALL_SUBTEST(check_all_in_range( long_ref, long_ref+g_repeat));
 }
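[Editor's note] The point of the patch above is that a hard-coded 13-digit literal overflows a 32-bit long, so the test now derives its extreme value from the type's own limits (NumTraits<long>::highest()). A rough standalone sketch of the same idea, using std::numeric_limits instead of Eigen's NumTraits:

#include <iostream>
#include <limits>

int main() {
  // A literal such as 6734565664234 does not fit in a 32-bit 'long';
  // deriving the bound from the type works on both 32-bit and 64-bit ABIs.
  long long_ref = std::numeric_limits<long>::max() / 10;
  std::cout << "sizeof(long) = " << sizeof(long) << " bytes\n";
  std::cout << "testing range [" << -long_ref << ", " << long_ref << "]\n";
  return 0;
}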
From e134226a0352b0951b03e8ea56ce3bbd538e73db Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 19 Mar 2015 23:11:42 -0700
Subject: [PATCH 78/83] Fixed a bug in the handling of packets by the MeanReducer

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 38586d067..25f085a59 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -77,7 +77,7 @@ template struct MeanReducer
   }
   template
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits::size);
+    return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size);
   }
 
   protected:

From a6a628ca6b3c0d0dd6716d200ba8e7740847168a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 19 Mar 2015 23:22:19 -0700
Subject: [PATCH 79/83] Added the -= operator to the device classes

---
 .../Eigen/CXX11/src/Tensor/TensorDevice.h | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 649bdb308..7a67c56b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -21,8 +21,7 @@ namespace Eigen {
  * Example:
  *    C.device(EIGEN_GPU) = A + B;
  *
- * Todo: thread pools.
- * Todo: operator +=, -=, *= and so on.
+ * Todo: operator *= and /=.
  */
 template class TensorDevice {
@@ -50,6 +49,18 @@ template class TensorDevice {
     return *this;
   }
 
+  template
+  EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference;
+    Difference difference(m_expression, other);
+    typedef TensorAssignOp Assign;
+    Assign assign(m_expression, difference);
+    static const bool Vectorize = TensorEvaluator::PacketAccess;
+    internal::TensorExecutor::run(assign, m_device);
+    return *this;
+  }
+
  protected:
    const DeviceType& m_device;
    ExpressionType& m_expression;
@@ -82,6 +93,18 @@ template class TensorDevice
+  EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference;
+    Difference difference(m_expression, other);
+    typedef TensorAssignOp Assign;
+    Assign assign(m_expression, difference);
+    static const bool Vectorize = TensorEvaluator::PacketAccess;
+    internal::TensorExecutor::run(assign, m_device);
+    return *this;
+  }
+
  protected:
    const ThreadPoolDevice& m_device;
    ExpressionType& m_expression;
@@ -114,6 +137,18 @@ template class TensorDevice
     return *this;
   }
 
+  template
+  EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference;
+    Difference difference(m_expression, other);
+    typedef TensorAssignOp Assign;
+    Assign assign(m_expression, difference);
+    static const bool Vectorize = TensorEvaluator::PacketAccess;
+    internal::TensorExecutor::run(assign, m_device);
+    return *this;
+  }
+
  protected:
    const GpuDevice& m_device;
    ExpressionType m_expression;
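[Editor's note] With the additions above, a device-bound tensor expression can be updated in place with -= just as with = and +=. A hedged usage sketch; the tensor names are illustrative and the DefaultDevice host device is assumed to be available in this version of the Tensor module:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> A(32, 32), B(32, 32), C(32, 32);
  A.setRandom();
  B.setRandom();

  Eigen::DefaultDevice dev;

  C.device(dev) = A + B;   // device-bound assignment, already supported
  C.device(dev) += A;      // operator+= predates this patch
  C.device(dev) -= B;      // the new operator-=: evaluates C = C - B on 'dev'

  return 0;
}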
From 8d9bfb3a7ba03a8eb12a71d7133217d768f7940a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 20 Mar 2015 16:00:10 +0100
Subject: [PATCH 80/83] fix loadMarket wrt Index versus int

---
 unsupported/Eigen/src/SparseExtra/MarketIO.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 25ff4228d..100e617b2 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -18,7 +18,7 @@ namespace Eigen {
 
 namespace internal
 {
   template
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, Scalar& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value)
   {
     line >> i >> j >> value;
     i--;
@@ -31,7 +31,7 @@ namespace internal
     return false;
   }
   template
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, std::complex& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex& value)
   {
     Scalar valR, valI;
     line >> i >> j >> valR >> valI;

From 4e2b18d909f27f03f64c5fd8891f2e94c5e03802 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 20 Mar 2015 16:33:48 +0100
Subject: [PATCH 81/83] Update approx. minimum ordering method to push and keep
 structural empty diagonal elements to the bottom-right part of the matrix

---
 Eigen/src/OrderingMethods/Amd.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h
index 3d2981f0c..63d996cb4 100644
--- a/Eigen/src/OrderingMethods/Amd.h
+++ b/Eigen/src/OrderingMethods/Amd.h
@@ -137,22 +137,27 @@ void minimum_degree_ordering(SparseMatrix& C, Perm
     degree[i] = len[i];                      // degree of node i
   }
   mark = internal::cs_wclear(0, 0, w, n);    /* clear w */
-  elen[n] = -2;                              /* n is a dead element */
-  Cp[n] = -1;                                /* n is a root of assembly tree */
-  w[n] = 0;                                  /* n is a dead element */
 
   /* --- Initialize degree lists ------------------------------------------ */
   for(i = 0; i < n; i++)
   {
+    bool has_diag = false;
+    for(p = Cp[i]; p dense)                 /* node i is dense */
+    else if(d > dense || !has_diag)          /* node i is dense or has no structural diagonal element */
     {
       nv[i] = 0;                             /* absorb i into element n */
       elen[i] = -1;                          /* node i is dead */
@@ -168,6 +173,10 @@ void minimum_degree_ordering(SparseMatrix& C, Perm
     }
   }
 
+  elen[n] = -2;                              /* n is a dead element */
+  Cp[n] = -1;                                /* n is a root of assembly tree */
+  w[n] = 0;                                  /* n is a dead element */
+
   while (nel < n)                            /* while (selecting pivots) do */
   {
     /* --- Select node of minimum approximate degree -------------------- */

From 4472f3e57884202b52c551da0b294c2883e2ccbf Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Mon, 23 Mar 2015 09:40:21 +0100
Subject: [PATCH 82/83] Avoid SVD: consider denormalized small numbers as zero
 when computing the rank of the matrix

---
 Eigen/src/SVD/SVDBase.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index b89393721..ad191085e 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -130,9 +130,10 @@ public:
   inline Index rank() const
   {
     using std::abs;
+    using std::max;
     eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
     if(m_singularValues.size()==0) return 0;
-    RealScalar premultiplied_threshold = m_singularValues.coeff(0) * threshold();
+    RealScalar premultiplied_threshold = (max)(m_singularValues.coeff(0) * threshold(), (std::numeric_limits::min)());
     Index i = m_nonzeroSingularValues-1;
     while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
     return i+1;
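[Editor's note] The rank() change above clamps the singular-value cutoff so it can never drop below the smallest normalized positive value, which keeps denormalized singular values from being counted as nonzero. A small standalone illustration of that thresholding rule; it is not Eigen's implementation, just the same arithmetic on a plain vector:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

// Rank = number of singular values above a cutoff that is relative to the
// largest singular value but never below the smallest normalized double,
// so denormalized values are treated as zero.
std::size_t rank_from_singular_values(const std::vector<double>& sv, double rel_threshold)
{
  if (sv.empty()) return 0;
  double cutoff = (std::max)(sv.front() * rel_threshold,
                             (std::numeric_limits<double>::min)());
  std::size_t r = 0;
  while (r < sv.size() && sv[r] >= cutoff) ++r;
  return r;
}

int main() {
  // Singular values sorted in decreasing order; 1e-310 is denormal.
  std::vector<double> sv;
  sv.push_back(1e-300);
  sv.push_back(1e-310);
  // Without the clamp the cutoff would be 1e-312 and both values would count;
  // with it the denormal is rejected and the computed rank is 1.
  std::cout << rank_from_singular_values(sv, 1e-12) << "\n";
  return 0;
}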
From abdbe8562e889a0ca0877d607cfd5c4cbf937e3a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 24 Mar 2015 10:45:46 -0700
Subject: [PATCH 83/83] Fixed the CUDA packet primitives

---
 Eigen/src/Core/arch/CUDA/PacketMath.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 19749c832..ceed1d1ef 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(cons
 }
 #endif
 
-template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) {
   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
 }
-template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) {
   return make_double2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
   to[stride*2] = from.z;
   to[stride*3] = from.w;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
 }
@@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a)
 }
 
 template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) {
-  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
}
 template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) {
-  return make_double2(abs(a.x), abs(a.y));
+  return make_double2(fabs(a.x), fabs(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
@@ -279,7 +279,7 @@ ptranspose(PacketBlock& kernel) {
   kernel.packet[3].z = tmp;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
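[Editor's note] For reference, the strided gather/scatter primitives that the last patch retouches have simple semantics. The sketch below mimics them on the host with plain arrays; it is a toy stand-in rather than Eigen's CUDA code, and Index is assumed here to be a signed integer type such as std::ptrdiff_t.

#include <cstddef>
#include <iostream>

typedef std::ptrdiff_t Index;

struct float4 { float x, y, z, w; };

// Gather 4 floats spaced 'stride' apart into one packet.
float4 pgather4(const float* from, Index stride) {
  float4 p;
  p.x = from[0 * stride];
  p.y = from[1 * stride];
  p.z = from[2 * stride];
  p.w = from[3 * stride];
  return p;
}

// Scatter the packet back out with the same spacing.
void pscatter4(float* to, const float4& from, Index stride) {
  to[stride * 0] = from.x;
  to[stride * 1] = from.y;
  to[stride * 2] = from.z;
  to[stride * 3] = from.w;
}

int main() {
  float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float dst[8] = {0};
  float4 p = pgather4(src, 2);            // picks 0, 2, 4, 6
  pscatter4(dst, p, 2);
  std::cout << dst[0] << " " << dst[2] << " "
            << dst[4] << " " << dst[6] << "\n";  // prints 0 2 4 6
  return 0;
}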