From 20cac72b8228de6c129caa983b25facddad0e009 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 17 Feb 2015 22:58:32 +0100
Subject: [PATCH 01/18] Packet must be passed by const reference and not by
 value to avoid alignment issue.

---
 Eigen/src/Core/util/BlasUtil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 3ec55fad2..9bfa45106 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -166,7 +166,7 @@ class BlasLinearMapper {
     return ploadt<HalfPacket, AlignmentType>(m_data + i);
   }
 
-  EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+  EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
     pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
   }
 

From 24d65ac0b0121ad70984ce1871cf2a8116b42201 Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg <chtz@informatik.uni-bremen.de>
Date: Wed, 18 Feb 2015 01:03:32 +0100
Subject: [PATCH 02/18] Removed redundant typedef which confused old gcc
 versions.

---
 test/sizeoverflow.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/sizeoverflow.cpp b/test/sizeoverflow.cpp
index 16d6f8d04..240d22294 100644
--- a/test/sizeoverflow.cpp
+++ b/test/sizeoverflow.cpp
@@ -18,8 +18,6 @@
     VERIFY(threw && "should have thrown bad_alloc: " #a);     \
   }
 
-typedef DenseIndex Index;
-
 template<typename MatrixType>
 void triggerMatrixBadAlloc(Index rows, Index cols)
 {

From d4eda014889541e3a22680bf236bf814a6fbc813 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 11:24:32 +0100
Subject: [PATCH 03/18] Bug 957, workaround MSVC/ICC compilation issue

---
 Eigen/src/SparseCore/SparseBlock.h      | 6 ++++--
 Eigen/src/SparseCore/SparseMatrixBase.h | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 40dc1a2bd..acd82e926 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -292,7 +292,8 @@ const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatri
   * is col-major (resp. row-major).
   */
 template<typename Derived>
-Block<Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
+typename SparseMatrixBase<Derived>::InnerVectorsReturnType
+SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
 {
   return Block<Derived,Dynamic,Dynamic,true>(derived(),
                                              IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
@@ -304,7 +305,8 @@ Block<Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Inde
   * is col-major (resp. row-major). Read-only.
   */
 template<typename Derived>
-const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
+const typename SparseMatrixBase<Derived>::ConstInnerVectorsReturnType
+SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
 {
   return Block<const Derived,Dynamic,Dynamic,true>(derived(),
                                                   IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 9039ebcec..d76dfa33d 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -321,8 +321,10 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     const ConstInnerVectorReturnType innerVector(Index outer) const;
 
     // set of inner-vectors
-    Block<Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize);
-    const Block<const Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize) const;
+    typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
+    typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
+    InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
+    const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
 
     DenseMatrixType toDense() const
     {

From dc7e6acc05f6d546db401545582bdd13c0331596 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 11:26:25 +0100
Subject: [PATCH 04/18] Fix possible usage of a null pointer in CholmodSupport

---
 Eigen/src/CholmodSupport/CholmodSupport.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index 8ef0fb3b5..d2b0fb282 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -277,6 +277,7 @@ class CholmodBase : public SparseSolverBase<Derived>
       if(!x_cd)
       {
         this->m_info = NumericalIssue;
+        return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
       dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
@@ -298,6 +299,7 @@ class CholmodBase : public SparseSolverBase<Derived>
       if(!x_cs)
       {
         this->m_info = NumericalIssue;
+        return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
       dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);

From eb563049f7041170976998273e4d48e96b11b08f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 11:26:48 +0100
Subject: [PATCH 05/18] Remove some dead stores.

---
 Eigen/src/Core/arch/SSE/MathFunctions.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 9ffba5b41..f86c0a39a 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -138,7 +138,6 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
 #ifdef EIGEN_VECTORIZE_SSE4_1
   fx = _mm_floor_ps(fx);
 #else
-  tmp = _mm_setzero_ps();
   emm0 = _mm_cvttps_epi32(fx);
   tmp  = _mm_cvtepi32_ps(emm0);
   /* if greater, substract 1 */
@@ -207,7 +206,6 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 #ifdef EIGEN_VECTORIZE_SSE4_1
   fx = _mm_floor_pd(fx);
 #else
-  tmp = _mm_setzero_pd();
   emm0 = _mm_cvttpd_epi32(fx);
   tmp  = _mm_cvtepi32_pd(emm0);
   /* if greater, substract 1 */

From 63464754ef747e0c3d16c5da6fd4d4228ab8dd7a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 11:29:54 +0100
Subject: [PATCH 06/18] Add an internal assertion in makeCompressed to catch a
 possible risk of null-pointer access.

---
 Eigen/src/SparseCore/SparseMatrix.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 4c8965802..4562f3df9 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -467,6 +467,8 @@ class SparseMatrix
       if(isCompressed())
         return;
       
+      eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
+      
       Index oldStart = m_outerIndex[1];
       m_outerIndex[1] = m_innerNonZeros[0];
       for(Index j=1; j<m_outerSize; ++j)

From 371d3bef3645193b9255a800ef69ddbb01eed9e4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 11:30:44 +0100
Subject: [PATCH 07/18] Workaround dead store warnings in unit tests.

---
 test/cholesky.cpp                |  7 +++++--
 test/eigensolver_complex.cpp     |  1 +
 test/eigensolver_generic.cpp     |  1 +
 test/eigensolver_selfadjoint.cpp | 10 ++++------
 test/inverse.cpp                 |  6 +++++-
 test/product_notemporary.cpp     |  5 +++--
 test/product_selfadjoint.cpp     |  9 ++++++++-
 test/product_syrk.cpp            |  5 +++--
 test/product_trmv.cpp            |  6 ++++--
 9 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index 33e32a322..9335270f4 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -380,10 +380,14 @@ void test_cholesky()
     CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) );
     CALL_SUBTEST_4( cholesky(Matrix3f()) );
     CALL_SUBTEST_5( cholesky(Matrix4d()) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+    
+    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);    
     CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
 
   CALL_SUBTEST_4( cholesky_verify_assert<Matrix3f>() );
@@ -395,6 +399,5 @@ void test_cholesky()
   CALL_SUBTEST_9( LLT<MatrixXf>(10) );
   CALL_SUBTEST_9( LDLT<MatrixXf>(10) );
   
-  TEST_SET_BUT_UNUSED_VARIABLE(s)
   TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries)
 }
diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp
index c9d8c0877..bf8d2deb0 100644
--- a/test/eigensolver_complex.cpp
+++ b/test/eigensolver_complex.cpp
@@ -108,6 +108,7 @@ void test_eigensolver_complex()
     CALL_SUBTEST_2( eigensolver(MatrixXcd(s,s)) );
     CALL_SUBTEST_3( eigensolver(Matrix<std::complex<float>, 1, 1>()) );
     CALL_SUBTEST_4( eigensolver(Matrix3f()) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
   CALL_SUBTEST_1( eigensolver_verify_assert(Matrix4cf()) );
   s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp
index 92d33f66a..c5441ac4e 100644
--- a/test/eigensolver_generic.cpp
+++ b/test/eigensolver_generic.cpp
@@ -93,6 +93,7 @@ void test_eigensolver_generic()
     CALL_SUBTEST_1( eigensolver(Matrix4f()) );
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_2( eigensolver(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
 
     // some trivial but implementation-wise tricky cases
     CALL_SUBTEST_2( eigensolver(MatrixXd(1,1)) );
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index 935736328..7b0077a6d 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -154,15 +154,13 @@ void test_eigensolver_selfadjoint()
     CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) );
     CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) );
     CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
-    CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
-    CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(s,s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
-    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(s,s)) );
     
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
+    CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) );
+    CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(s,s)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(s,s)) );
     CALL_SUBTEST_9( selfadjointeigensolver(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
 
     // some trivial but implementation-wise tricky cases
     CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) );
diff --git a/test/inverse.cpp b/test/inverse.cpp
index 1e7b20958..b09989aca 100644
--- a/test/inverse.cpp
+++ b/test/inverse.cpp
@@ -102,12 +102,16 @@ void test_inverse()
     CALL_SUBTEST_3( inverse(Matrix3f()) );
     CALL_SUBTEST_4( inverse(Matrix4f()) );
     CALL_SUBTEST_4( inverse(Matrix<float,4,4,DontAlign>()) );
+    
     s = internal::random<int>(50,320); 
     CALL_SUBTEST_5( inverse(MatrixXf(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(25,100);
     CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     CALL_SUBTEST_7( inverse(Matrix4d()) );
     CALL_SUBTEST_7( inverse(Matrix<double,4,4,DontAlign>()) );
   }
-  TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp
index 805cc8939..898f1d1cb 100644
--- a/test/product_notemporary.cpp
+++ b/test/product_notemporary.cpp
@@ -129,11 +129,12 @@ void test_product_notemporary()
   for(int i = 0; i < g_repeat; i++) {
     s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_1( product_notemporary(MatrixXf(s, s)) );
-    s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_2( product_notemporary(MatrixXd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_3( product_notemporary(MatrixXcf(s,s)) );
-    s = internal::random<int>(16,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( product_notemporary(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
 }
diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp
index 374e2393b..3d768aa7e 100644
--- a/test/product_selfadjoint.cpp
+++ b/test/product_selfadjoint.cpp
@@ -67,14 +67,21 @@ void test_product_selfadjoint()
     CALL_SUBTEST_1( product_selfadjoint(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( product_selfadjoint(Matrix<float, 2, 2>()) );
     CALL_SUBTEST_3( product_selfadjoint(Matrix3d()) );
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( product_selfadjoint(MatrixXcf(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_5( product_selfadjoint(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_6( product_selfadjoint(MatrixXd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_7( product_selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
-  TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp
index 73c95000c..e10f0f2f2 100644
--- a/test/product_syrk.cpp
+++ b/test/product_syrk.cpp
@@ -125,11 +125,12 @@ void test_product_syrk()
     int s;
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_1( syrk(MatrixXf(s, s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_2( syrk(MatrixXd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
 }
diff --git a/test/product_trmv.cpp b/test/product_trmv.cpp
index 4c3c435c2..57a202afc 100644
--- a/test/product_trmv.cpp
+++ b/test/product_trmv.cpp
@@ -78,12 +78,14 @@ void test_product_trmv()
     CALL_SUBTEST_1( trmv(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( trmv(Matrix<float, 2, 2>()) );
     CALL_SUBTEST_3( trmv(Matrix3d()) );
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_4( trmv(MatrixXcf(s,s)) );
-    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
     CALL_SUBTEST_5( trmv(MatrixXcd(s,s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
+    
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
     CALL_SUBTEST_6( trmv(Matrix<float,Dynamic,Dynamic,RowMajor>(s, s)) );
+    TEST_SET_BUT_UNUSED_VARIABLE(s)
   }
-  TEST_SET_BUT_UNUSED_VARIABLE(s);
 }

From 6f4adc9e9428c383236d60004a64fddfbeaf254f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 11:40:06 +0100
Subject: [PATCH 08/18] Add missing install directives for arch/CUDA

---
 Eigen/src/Core/arch/CMakeLists.txt      | 8 ++++++--
 Eigen/src/Core/arch/CUDA/CMakeLists.txt | 6 ++++++
 2 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 Eigen/src/Core/arch/CUDA/CMakeLists.txt

diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt
index 0db8c558d..42b0b486e 100644
--- a/Eigen/src/Core/arch/CMakeLists.txt
+++ b/Eigen/src/Core/arch/CMakeLists.txt
@@ -1,5 +1,9 @@
-ADD_SUBDIRECTORY(SSE)
 ADD_SUBDIRECTORY(AltiVec)
-ADD_SUBDIRECTORY(NEON)
 ADD_SUBDIRECTORY(AVX)
+ADD_SUBDIRECTORY(CUDA)
 ADD_SUBDIRECTORY(Default)
+ADD_SUBDIRECTORY(NEON)
+ADD_SUBDIRECTORY(SSE)
+
+
+
diff --git a/Eigen/src/Core/arch/CUDA/CMakeLists.txt b/Eigen/src/Core/arch/CUDA/CMakeLists.txt
new file mode 100644
index 000000000..7ba28da7c
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_CUDA_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_arch_CUDA_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/CUDA COMPONENT Devel
+)

From 548b7813805d9e314f97eb6f731d711df663a46b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 12:53:49 +0100
Subject: [PATCH 09/18] Fix bug #945: workaround MSVC warning

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 15bf04d1f..ce5494182 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1788,14 +1788,14 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
         for(; k<peeled_k; k+=PacketSize) {
           PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
           kernel.packet[0] = dm0.loadPacket(k);
-          kernel.packet[1] = dm1.loadPacket(k);
-          kernel.packet[2] = dm2.loadPacket(k);
-          kernel.packet[3] = dm3.loadPacket(k);
+          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
+          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
+          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
           ptranspose(kernel);
           pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
-          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
-          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2]));
-          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3]));
+          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
+          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
+          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
           count+=4*PacketSize;
         }
       }

From c7bb1e8ea8dfc984788d0cb77b82a90468393c2e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 15:19:23 +0100
Subject: [PATCH 10/18] Fix a regression when using OpenMP, and fix bug #714:
 the number of threads might be lower than the number of requested ones

---
 Eigen/src/Core/products/GeneralMatrixMatrix.h | 25 ++++++++++++++++---
 Eigen/src/Core/products/Parallelizer.h        | 19 ++++++++------
 test/product_large.cpp                        |  3 +--
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 44e44b986..c38c12c31 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -217,8 +217,9 @@ struct gemm_functor
     : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
   {}
 
-  void initParallelSession() const
+  void initParallelSession(Index num_threads) const
   {
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
     m_blocking.allocateA();
   }
 
@@ -276,7 +277,7 @@ class level3_blocking
 };
 
 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
   : public level3_blocking<
       typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
       typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
@@ -299,7 +300,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
@@ -307,6 +308,9 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
     }
+    
+    void initParallel(Index, Index, Index, Index)
+    {}
 
     inline void allocateA() {}
     inline void allocateB() {}
@@ -331,7 +335,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(Index rows, Index cols, Index depth, int num_threads, bool l3_blocking)
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
@@ -351,6 +355,19 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
       m_sizeA = this->m_mc * this->m_kc;
       m_sizeB = this->m_kc * this->m_nc;
     }
+    
+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+      
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);      
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }
 
     void allocateA()
     {
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 2b90abf8f..91d37a123 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -120,25 +120,28 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
     return func(0,rows, 0,cols);
 
   Eigen::initParallel();
-  func.initParallelSession();
+  func.initParallelSession(threads);
 
   if(transpose)
     std::swap(rows,cols);
-
-  Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads);
-  blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
   
   ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
-
+  
   #pragma omp parallel num_threads(threads)
   {
     Index i = omp_get_thread_num();
+    // Note that the actual number of threads might be lower than the number of requested ones.
+    Index actual_threads = omp_get_num_threads();
+    
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+  
     Index r0 = i*blockRows;
-    Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
 
     Index c0 = i*blockCols;
-    Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
 
     info[i].lhs_start = r0;
     info[i].lhs_length = actualBlockRows;
diff --git a/test/product_large.cpp b/test/product_large.cpp
index ffb8b7bf2..84c489580 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -64,8 +64,7 @@ void test_product_large()
 #endif
 
   // Regression test for bug 714:
-#ifdef EIGEN_HAS_OPENMP
-  std::cout << "Testing omp_set_dynamic(1)\n";
+#if defined EIGEN_HAS_OPENMP
   omp_set_dynamic(1);
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_6( product(Matrix<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );

From 4a3e6c8be1d4752b3172a7e26631c4669e28dde7 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Wed, 18 Feb 2015 09:43:55 -0500
Subject: [PATCH 11/18] bug #958 - Allow testing specific blocking sizes

This is only a debugging/testing patch. It allows testing specific
product blocking sizes, typically to study the impact on performance.

Example usage:

int testk, testm, testn;
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K testk
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M testm
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N testn
#include <Eigen/Core>
---
 .../src/Core/products/GeneralBlockPanelKernel.h  | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index ce5494182..f3fede441 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -84,6 +84,22 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
 void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  EIGEN_UNUSED_VARIABLE(num_threads); // not consulted when the sizes are forced by the macros below
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  enum {
+    kr = 16, // granularity used when rounding k down
+    mr = Traits::mr,
+    nr = Traits::nr
+  };
+  k = std::min<SizeType>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); // clamp k by its own requested size
+  if (k > kr) k -= k % kr; // round down to a multiple of kr
+  m = std::min<SizeType>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); // clamp m from m itself, not from n
+  if (m > mr) m -= m % mr; // round down to a multiple of mr
+  n = std::min<SizeType>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); // clamp n from n itself, not from the updated k
+  if (n > nr) n -= n % nr; // round down to a multiple of nr
+  return; // bypass the default size computation that follows
+#endif
   // Explanations:
   // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
   // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed

From fc5c3e85e2a59a3b366970793195538969462a64 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 15:47:01 +0100
Subject: [PATCH 12/18] Fix bug #961: eigen-doc.tgz included part of itself.

---
 doc/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 46e5fc9d7..4d01a0424 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -100,7 +100,8 @@ add_custom_target(doc ALL
   COMMAND ${CMAKE_COMMAND} -E copy ${Eigen_BINARY_DIR}/doc/html/group__TopicUnalignedArrayAssert.html ${Eigen_BINARY_DIR}/doc/html/TopicUnalignedArrayAssert.html
   COMMAND ${CMAKE_COMMAND} -E rename html eigen-doc
   COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz
-  COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc/eigen-doc.tgz eigen-doc
+  COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc.tgz eigen-doc
+  COMMAND ${CMAKE_COMMAND} -E rename eigen-doc.tgz eigen-doc/eigen-doc.tgz
   COMMAND ${CMAKE_COMMAND} -E rename eigen-doc html
   WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc)
 

From 63eb0f6fe6c7223c3bb2a2ea9495fddcc1e4b6f2 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 18 Feb 2015 15:49:05 +0100
Subject: [PATCH 13/18] Clean a bit computeProductBlockingSizes (use Index
 type, remove CEIL macro)

---
 .../Core/products/GeneralBlockPanelKernel.h   | 55 +++++++++----------
 Eigen/src/Core/util/Meta.h                    |  8 +++
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index ce5494182..dc679b3fe 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -79,18 +79,15 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
   * - the number of scalars that fit into a packet (when vectorization is enabled).
   *
   * \sa setCpuCacheSizes */
-#define CEIL(a, b) ((a)+(b)-1)/(b)
 
-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
   // Explanations:
-  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
-  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
-  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
-  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
-  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
-  // stay in L1 cache.
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
 
@@ -108,32 +105,32 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_
       nr = Traits::nr,
       nr_mask = (0xffffffff/nr)*nr
     };
-    SizeType k_cache = (l1-ksub)/kdiv;
+    Index k_cache = (l1-ksub)/kdiv;
     if (k_cache < k) {
       k = k_cache & k_mask;
-      eigen_assert(k > 0);
+      eigen_internal_assert(k > 0);
     }
 
-    SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-    SizeType n_per_thread = CEIL(n, num_threads);
+    Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
-      eigen_assert(n_cache >= static_cast<SizeType>(nr));
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
       n = n_cache & nr_mask;
-      eigen_assert(n > 0);
+      eigen_internal_assert(n > 0);
     } else {
-      n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
+      n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
     }
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-      SizeType m_per_thread = CEIL(m, num_threads);
-      if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
+      Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache & mr_mask;
-        eigen_assert(m > 0);
+        eigen_internal_assert(m > 0);
       } else {
-        m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
+        m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
       }
     }
   }
@@ -141,19 +138,19 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_
     // In unit tests we do not want to use extra large matrices,
     // so we reduce the block size to check the blocking strategy is not flawed
 #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-    n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-    m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
+    k = std::min<Index>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
+    n = std::min<Index>(n,3840/sizeof(RhsScalar));
+    m = std::min<Index>(m,3840/sizeof(RhsScalar));
 #else
-    k = std::min<SizeType>(k,24);
-    n = std::min<SizeType>(n,384/sizeof(RhsScalar));
-    m = std::min<SizeType>(m,384/sizeof(RhsScalar));
+    k = std::min<Index>(k,24);
+    n = std::min<Index>(n,384/sizeof(RhsScalar));
+    m = std::min<Index>(m,384/sizeof(RhsScalar));
 #endif
   }
 }
 
-template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
+template<typename LhsScalar, typename RhsScalar>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
   computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index f3bafd5af..3be9e6ca5 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -284,6 +284,14 @@ template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b =
 template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
 #endif
 
+// Integer division with rounding up, i.e., the smallest integer q such that q*b >= a.
+// T is assumed to be an integer type with a>=0, and b>0. No a+b-1 intermediate is formed, so large a cannot overflow.
+template<typename T>
+T div_ceil(const T &a, const T &b)
+{
+  return a / b + ((a % b) > 0 ? 1 : 0);
+}
+
 } // end namespace numext
 
 } // end namespace Eigen

From ee27d50633a676986fc0841dcf5d06bed0d4bd8b Mon Sep 17 00:00:00 2001
From: Hauke Heibel <hauke.heibel@gmail.com>
Date: Wed, 18 Feb 2015 18:51:08 +0100
Subject: [PATCH 14/18] Fixed template parameter.

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 12948a20c..c8a1dcced 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -91,11 +91,11 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
     mr = Traits::mr,
     nr = Traits::nr
   };
-  k = std::min<SizeType>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+  k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
   if (k > kr) k -= k % kr;
-  m = std::min<SizeType>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+  m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); // m is capped by the M limit
   if (m > mr) m -= m % mr;
-  n = std::min<SizeType>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+  n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); // n is capped by the N limit
   if (n > nr) n -= n % nr;
   return;
 #endif

From 9bd8a4bab58231a1d3afe0dd43a7c72f217dfec1 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Wed, 18 Feb 2015 15:03:35 -0500
Subject: [PATCH 15/18] bug #955 - Implement a rotating kernel alternative in
 the 3px4 gebp path

This is substantially faster on ARM, where it's important to minimize the number of loads.

This is specific to the case where all packet types are of size 4. I made my best attempt to minimize how dirty this is... opinions welcome.

Eventually one could have a generic rotated kernel, but it would take some work to get there. Also, on Sandy Bridge, in my experience, the rotating kernel is not beneficial (it is even about 1% slower).
---
 Eigen/src/Core/GenericPacketMath.h            | 15 ++++
 Eigen/src/Core/arch/NEON/PacketMath.h         | 25 ++++++
 Eigen/src/Core/arch/SSE/PacketMath.h          | 23 +++++
 .../Core/products/GeneralBlockPanelKernel.h   | 86 +++++++++++++++----
 Eigen/src/Core/util/StaticAssert.h            |  3 +-
 5 files changed, 133 insertions(+), 19 deletions(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 74e1174ae..967a07df5 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -287,6 +287,21 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
 
+template<size_t offset, typename Packet>
+struct protate_impl
+{
+  static Packet run(const Packet& a) { return a; } // generic fallback: returns a unrotated; actual rotations come from per-packet specializations
+};
+
+/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
+  * by the compile-time \a offset (must be < the packet size; 0 returns \a a as is), e.g. for offset == 1:
+  *     (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
+  */
+template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
+{
+  EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET);
+  return offset ? protate_impl<offset, Packet>::run(a) : a;
+}
 
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 8149aed7f..e9af45f22 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
   a_hi = vget_high_s32(a_r64);
   return vcombine_s32(a_hi, a_lo);
 }
+
+template<size_t offset>
+struct protate_impl<offset, Packet4f>
+{
+  static Packet4f run(const Packet4f& a) {
+    return vextq_f32(a, a, offset); // vext over the pair (a, a), starting at lane offset: a rotation of a's lanes
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i>
+{
+  static Packet4i run(const Packet4i& a) {
+    return vextq_s32(a, a, offset); // vext over the pair (a, a), starting at lane offset: a rotation of a's lanes
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
 
@@ -625,6 +642,14 @@ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { retu
 
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
 
+template<size_t offset>
+struct protate_impl<offset, Packet2d>
+{
+  static Packet2d run(const Packet2d& a) {
+    return vextq_f64(a, a, offset); // vext over the pair (a, a), starting at lane offset: a rotation of a's lanes
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b5a0ba2bc..3653783fd 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -462,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 { return _mm_shuffle_epi32(a,0x1B); }
 
+template<size_t offset>
+struct protate_impl<offset, Packet4f>
+{
+  static Packet4f run(const Packet4f& a) {
+    return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); // lane selectors start at offset and wrap around modulo 4
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i>
+{
+  static Packet4i run(const Packet4i& a) {
+    return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); // lane selectors start at offset and wrap around modulo 4
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet2d>
+{
+  static Packet2d run(const Packet2d& a) {
+    return vec2d_swizzle1(a, offset, (offset + 1) % 2); // lane selectors start at offset and wrap around modulo 2
+  }
+};
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c8a1dcced..6a16aa661 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -771,7 +771,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     const Index peeled_kc  = depth & ~(pk-1);
     const Index prefetch_res_offset = 32/sizeof(ResScalar);    
 //     const Index depth2     = depth & ~1;
-    
+
+#if EIGEN_ARCH_ARM
+    const bool PreferRotatingKernel = true;
+#else
+    const bool PreferRotatingKernel = false;
+#endif
+
+    const bool UseRotatingKernel =
+                 PreferRotatingKernel &&
+                 Traits::LhsPacketSize == 4 &&
+                 Traits::RhsPacketSize == 4 &&
+                 Traits::ResPacketSize == 4;
+
     //---------- Process 3 * LhsProgress rows at once ----------
     // This corresponds to 3*LhsProgress x nr register blocks.
     // Usually, make sense only with FMA
@@ -818,7 +830,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             RhsPacket B_0, T0;
             LhsPacket A2;
 
-#define EIGEN_GEBGP_ONESTEP(K) \
+#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
+            do { \
+              if (UseRotatingKernel) { \
+                if (N == 0) { \
+                  B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
+                } else { \
+                  EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
+                  B_0 = protate<1>(B_0); \
+                } \
+              } else { \
+                traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \
+              } \
+            } while (false)
+
+#define EIGEN_GEBP_ONESTEP(K) \
             do { \
               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
@@ -827,34 +853,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
               traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
               traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
+              EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \
               traits.madd(A0, B_0, C0, T0); \
               traits.madd(A1, B_0, C4, T0); \
               traits.madd(A2, B_0, C8, B_0); \
-              traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
+              EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
               traits.madd(A0, B_0, C1, T0); \
               traits.madd(A1, B_0, C5, T0); \
               traits.madd(A2, B_0, C9, B_0); \
-              traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
+              EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
               traits.madd(A0, B_0, C2,  T0); \
               traits.madd(A1, B_0, C6,  T0); \
               traits.madd(A2, B_0, C10, B_0); \
-              traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
+              EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
               traits.madd(A0, B_0, C3 , T0); \
               traits.madd(A1, B_0, C7,  T0); \
               traits.madd(A2, B_0, C11, B_0); \
               EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
             } while(false)
-        
+
             internal::prefetch(blB + 4 * pk * sizeof(RhsScalar)); /* Bug 953 */
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
+            EIGEN_GEBP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(1);
+            EIGEN_GEBP_ONESTEP(2);
+            EIGEN_GEBP_ONESTEP(3);
+            EIGEN_GEBP_ONESTEP(4);
+            EIGEN_GEBP_ONESTEP(5);
+            EIGEN_GEBP_ONESTEP(6);
+            EIGEN_GEBP_ONESTEP(7);
 
             blB += pk*4*RhsProgress;
             blA += pk*3*Traits::LhsProgress;
@@ -866,12 +892,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           {
             RhsPacket B_0, T0;
             LhsPacket A2;
-            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(0);
             blB += 4*RhsProgress;
             blA += 3*Traits::LhsProgress;
           }
-  #undef EIGEN_GEBGP_ONESTEP
-  
+#undef EIGEN_GEBP_ONESTEP
+
+          if (UseRotatingKernel) {
+            #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
+              do { \
+                PacketBlock<ResPacket> resblock; \
+                resblock.packet[0] = res0; \
+                resblock.packet[1] = res1; \
+                resblock.packet[2] = res2; \
+                resblock.packet[3] = res3; \
+                ptranspose(resblock); \
+                resblock.packet[3] = protate<1>(resblock.packet[3]); \
+                resblock.packet[2] = protate<2>(resblock.packet[2]); \
+                resblock.packet[1] = protate<3>(resblock.packet[1]); \
+                ptranspose(resblock); \
+                res0 = resblock.packet[0]; \
+                res1 = resblock.packet[1]; \
+                res2 = resblock.packet[2]; \
+                res3 = resblock.packet[3]; \
+              } while (false)
+            
+            EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
+            EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
+            EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
+          }
+
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 7538a0633..5e16b775b 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -93,7 +93,8 @@
         THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
         OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
         IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
-        STORAGE_LAYOUT_DOES_NOT_MATCH
+        STORAGE_LAYOUT_DOES_NOT_MATCH,
+        ROTATION_BY_ILLEGAL_OFFSET
       };
     };
 

From 0ed00d543884e6cdd5a406789cb19f1b453db809 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Wed, 18 Feb 2015 15:05:01 -0500
Subject: [PATCH 16/18] remove a newly introduced redundant typedef - sorry.

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 6a16aa661..15f3d869e 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -83,9 +83,10 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
 template<typename LhsScalar, typename RhsScalar, int KcFactor>
 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
   EIGEN_UNUSED_VARIABLE(num_threads);
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
   enum {
     kr = 16,
     mr = Traits::mr,
@@ -99,6 +100,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   if (n > nr) n -= n % nr;
   return;
 #endif
+
   // Explanations:
   // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
   // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
@@ -108,7 +110,6 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
 
   if (num_threads > 1) {
-    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
     typedef typename Traits::ResScalar ResScalar;
     enum {
       kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),

From db05f2d01e6e5d8bcd67643ed2836028071d967f Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Wed, 18 Feb 2015 15:43:52 -0500
Subject: [PATCH 17/18] rotating kernel: avoid compiling anything outside of
 ARM

---
 .../Core/products/GeneralBlockPanelKernel.h   | 40 ++++++++++++-------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 15f3d869e..ccd906540 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -773,18 +773,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     const Index prefetch_res_offset = 32/sizeof(ResScalar);    
 //     const Index depth2     = depth & ~1;
 
-#if EIGEN_ARCH_ARM
-    const bool PreferRotatingKernel = true;
-#else
-    const bool PreferRotatingKernel = false;
-#endif
-
-    const bool UseRotatingKernel =
-                 PreferRotatingKernel &&
-                 Traits::LhsPacketSize == 4 &&
-                 Traits::RhsPacketSize == 4 &&
-                 Traits::ResPacketSize == 4;
-
     //---------- Process 3 * LhsProgress rows at once ----------
     // This corresponds to 3*LhsProgress x nr register blocks.
     // Usually, make sense only with FMA
@@ -824,13 +812,26 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
           prefetch(&blB[0]);
           LhsPacket A0, A1;
-          
+
+#define EIGEN_ARCH_PREFERS_ROTATING_KERNEL EIGEN_ARCH_ARM
+
+#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
+          static const bool UseRotatingKernel =
+            Traits::LhsPacketSize == 4 &&
+            Traits::RhsPacketSize == 4 &&
+            Traits::ResPacketSize == 4;
+#endif
+
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
             RhsPacket B_0, T0;
             LhsPacket A2;
 
+#define EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) \
+            traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0);
+
+#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
 #define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
             do { \
               if (UseRotatingKernel) { \
@@ -841,9 +842,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
                   B_0 = protate<1>(B_0); \
                 } \
               } else { \
-                traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \
+                EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \
               } \
             } while (false)
+#else
+#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
+            EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N)
+#endif
 
 #define EIGEN_GEBP_ONESTEP(K) \
             do { \
@@ -897,8 +902,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             blB += 4*RhsProgress;
             blA += 3*Traits::LhsProgress;
           }
-#undef EIGEN_GEBP_ONESTEP
 
+#undef EIGEN_GEBP_ONESTEP
+#undef EIGEN_GEBP_ONESTEP_LOADRHS
+#undef EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING
+
+#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
           if (UseRotatingKernel) {
             #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
               do { \
@@ -922,6 +931,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
             EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
           }
+#endif
 
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);

From 829dddd0fdb1bfd4942ff35fe7dfd4ccdeddc97e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 19 Feb 2015 15:18:37 +0100
Subject: [PATCH 18/18] Add support for C++11 result_of/lambdas

---
 Eigen/src/Core/util/Macros.h |  5 +++++
 Eigen/src/Core/util/Meta.h   | 32 ++++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index e607cdd12..aaea9f035 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -382,6 +382,11 @@
   #define EIGEN_HAVE_RVALUE_REFERENCES
 #endif
 
+// Does the compiler support result_of?
+#if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))
+#define EIGEN_HAS_STD_RESULT_OF 1
+#endif
+
 // Does the compiler support variadic templates?
 #if __cplusplus > 199711L
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 3be9e6ca5..674cd8f97 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -165,6 +165,7 @@ template<typename T> struct result_of {};
 struct has_none {int a[1];};
 struct has_std_result_type {int a[2];};
 struct has_tr1_result {int a[3];};
+struct has_cxx_eleven_result {int a[4];};
 
 template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
 struct unary_result_of_select {typedef ArgType type;};
@@ -175,13 +176,22 @@ struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typed
 template<typename Func, typename ArgType>
 struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef typename Func::template result<Func(ArgType)>::type type;};
 
+#ifdef EIGEN_HAS_STD_RESULT_OF
+template<typename Func, typename ArgType>
+struct unary_result_of_select<Func, ArgType, sizeof(has_cxx_eleven_result)> {typedef typename std::result_of<Func(ArgType)>::type type;};
+#endif
+
 template<typename Func, typename ArgType>
 struct result_of<Func(ArgType)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
+#ifdef EIGEN_HAS_STD_RESULT_OF
+    template<typename T>
+    static has_cxx_eleven_result  testFunctor(T const *, typename std::result_of<T(ArgType)>::type const * = 0);
+#endif
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
@@ -199,13 +209,23 @@ template<typename Func, typename ArgType0, typename ArgType1>
 struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
 {typedef typename Func::template result<Func(ArgType0,ArgType1)>::type type;};
 
+#ifdef EIGEN_HAS_STD_RESULT_OF
+template<typename Func, typename ArgType0, typename ArgType1>
+struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_cxx_eleven_result)>
+{typedef typename std::result_of<Func(ArgType0, ArgType1)>::type type;};
+#endif
+
 template<typename Func, typename ArgType0, typename ArgType1>
 struct result_of<Func(ArgType0,ArgType1)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
+#ifdef EIGEN_HAS_STD_RESULT_OF
+    template<typename T>
+    static has_cxx_eleven_result  testFunctor(T const *, typename std::result_of<T(ArgType0, ArgType1)>::type const * = 0);
+#endif
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};