From a1f1e1e51df316d1b37733770f5e7ab17006113a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 6 Apr 2015 10:41:39 -0700
Subject: [PATCH 1/7] Fixed the order of 2 #includes

---
 unsupported/Eigen/CXX11/Tensor               |  2 +-
 unsupported/test/cxx11_tensor_index_list.cpp | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 200bcf966..ae6c3fe7e 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -49,8 +49,8 @@
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
index c4d4f244f..4ce5add32 100644
--- a/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -255,6 +255,17 @@ static void test_mixed_index_list()
   VERIFY_IS_APPROX(result3(0), expected);
 }
 
+// Checks that dimensions_match() reports a match for two identically-typed, identically-initialized IndexLists.
+static void test_dim_check()
+{
+  Eigen::IndexList<Eigen::type2index<1>, int> dim1;
+  dim1.set(1, 2);  // presumably stores the value 2 in the entry at position 1 -- confirm against IndexList::set
+  Eigen::IndexList<Eigen::type2index<1>, int> dim2;
+  dim2.set(1, 2);  // same arguments as for dim1, so both lists are configured the same way
+  VERIFY(dimensions_match(dim1, dim2));
+}
+
+
 #endif
 
 void test_cxx11_tensor_index_list()
@@ -264,5 +275,6 @@ void test_cxx11_tensor_index_list()
   CALL_SUBTEST(test_type2index_list());
   CALL_SUBTEST(test_dynamic_index_list());
   CALL_SUBTEST(test_mixed_index_list());
+  CALL_SUBTEST(test_dim_check());
 #endif
 }

From 1de49ef4c2d96acc1c96628fa52e2330cf54dc19 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 7 Apr 2015 10:44:13 -0700
Subject: [PATCH 2/7] Fixed a bug when chipping tensors laid out in row major
 order.

---
 .../Eigen/CXX11/src/Tensor/TensorChipping.h   |  6 ++-
 unsupported/test/cxx11_tensor_chipping.cpp    | 52 ++++++++++++++-----
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index dc9586cbc..3b99ef069 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -157,6 +157,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     eigen_assert(NumInputDims > m_dim.actualDim());
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
+
     int j = 0;
     for (int i = 0; i < NumInputDims; ++i) {
       if (i != m_dim.actualDim()) {
@@ -246,7 +248,9 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
     Scalar* result = m_impl.data();
-    if (m_dim.actualDim() == NumDims && result) {
+    if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
+         (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) &&
+        result) {
       return result + m_inputOffset;
     } else {
       return NULL;
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index d83417872..bfc2bad18 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -340,11 +340,9 @@ static void test_chip_as_lvalue()
   }
 }
 
-
-template<int DataLayout>
-static void test_chip_raw_data()
+static void test_chip_raw_data_col_major()
 {
-  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<float, 5, ColMajor> tensor(2,3,5,7,11);
   tensor.setRandom();
 
   typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4;
@@ -353,12 +351,7 @@ static void test_chip_raw_data()
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
         for (int l = 0; l < 7; ++l) {
-          int chip_index;
-          if (DataLayout == ColMajor) {
-            chip_index = i + 2 * (j + 3 * (k + 5 * l));
-          } else {
-            chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i)));
-          }
+          int chip_index = i + 2 * (j + 3 * (k + 5 * l));
           VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3));
         }
       }
@@ -382,6 +375,41 @@ static void test_chip_raw_data()
   VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
 }
 
+static void test_chip_raw_data_row_major()  // raw data() access on chips of a RowMajor tensor
+{
+  Tensor<float, 5, RowMajor> tensor(11,7,5,3,2);
+  tensor.setRandom();
+  // Chip at offset 3 along dimension 0: data() must be directly indexable and agree with tensor(3,i,j,k,l).
+  typedef TensorEvaluator<decltype(tensor.template chip<0>(3)), DefaultDevice> Evaluator0;
+  auto chip = Evaluator0(tensor.template chip<0>(3), DefaultDevice());
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          int chip_index = l + 2 * (k + 3 * (j + 5 * i));  // linear offset for extents (7,5,3,2), last index varying fastest
+          VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(3,i,j,k,l));
+        }
+      }
+    }
+  }
+  // Chips taken along dimensions 1 through 4 are each expected to return a null pointer from data().
+  typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1;
+  auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2;
+  auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3;
+  auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(tensor.template chip<4>(0)), DefaultDevice> Evaluator4;
+  auto chip4 = Evaluator4(tensor.template chip<4>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
+}
+
 void test_cxx11_tensor_chipping()
 {
   CALL_SUBTEST(test_simple_chip<ColMajor>());
@@ -392,6 +420,6 @@ void test_cxx11_tensor_chipping()
   CALL_SUBTEST(test_chip_in_expr<RowMajor>());
   CALL_SUBTEST(test_chip_as_lvalue<ColMajor>());
   CALL_SUBTEST(test_chip_as_lvalue<RowMajor>());
-  CALL_SUBTEST(test_chip_raw_data<ColMajor>());
-  CALL_SUBTEST(test_chip_raw_data<RowMajor>());
+  CALL_SUBTEST(test_chip_raw_data_col_major());
+  CALL_SUBTEST(test_chip_raw_data_row_major());
 }

From 0e9753c8dfffd8e20a445d3f150b8180c0815419 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Tue, 7 Apr 2015 14:03:21 -0400
Subject: [PATCH 3/7] Fix compiler flags on Android/ARM:  - generate
 position-independent code (PIE), a requirement to run binaries on Android
 5.0+ devices;  - correctly handle EIGEN_TEST_FMA + EIGEN_TEST_NEON to pass
 -mfpu=neon-vfpv4.

---
 CMakeLists.txt | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c1ae428e..a28ad07d8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,6 +168,11 @@ if(NOT MSVC)
   else()
     ei_add_cxx_compiler_flag("-ansi")
   endif()
+
+  if(ANDROID_NDK)
+    ei_add_cxx_compiler_flag("-pie")
+    ei_add_cxx_compiler_flag("-fPIE")
+  endif()
   
   set(CMAKE_REQUIRED_FLAGS "")
 
@@ -208,7 +213,7 @@ if(NOT MSVC)
   endif()
 
   option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
-  if(EIGEN_TEST_FMA)
+  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
     message(STATUS "Enabling FMA in tests/examples")
   endif()
@@ -227,7 +232,12 @@ if(NOT MSVC)
 
   option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
   if(EIGEN_TEST_NEON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp")
+    if(EIGEN_TEST_FMA)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4")
+    else()
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
+    endif()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp")
     message(STATUS "Enabling NEON in tests/examples")
   endif()
 

From d7f51feb0773cc3843ea8c29c605d4eea4bda4ac Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Tue, 7 Apr 2015 15:13:55 -0400
Subject: [PATCH 4/7] bug #992: don't select a 3p GEMM path with
 non-vectorizable scalar types, this hits unsupported paths in symm/triangular
 products code

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 428527820..2b4c1242f 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -380,11 +380,12 @@ public:
     nr = 4,
 
     // register block size along the M direction (currently, this one cannot be modified)
+    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
     // we assume 16 registers
-    mr = 3*LhsPacketSize,
+    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
 #else
-    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+    mr = default_mr,
 #endif
     
     LhsProgress = LhsPacketSize,

From 0eb220c00d9773c29c7d169ad0e20745b0ef21bb Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 8 Apr 2015 09:25:34 +0200
Subject: [PATCH 5/7] add a note on bug #992

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 2b4c1242f..24623963b 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -383,6 +383,8 @@ public:
     default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
     // we assume 16 registers
+    // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+    // then using 3*LhsPacketSize triggers unimplemented paths in syrk.
     mr = Vectorizable ? 3*LhsPacketSize : default_mr,
 #else
     mr = default_mr,

From 5401fbcc50747583b0d47e195f23f988f7dfac5e Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 9 Apr 2015 16:44:10 -0700
Subject: [PATCH 6/7] Improved the blocking strategy to speedup multithreaded
 tensor contractions.

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 24623963b..320f96a39 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -112,14 +112,18 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
       nr = Traits::nr,
       nr_mask = (0xffffffff/nr)*nr
     };
-    Index k_cache = (l1-ksub)/kdiv;
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
     if (k_cache < k) {
       k = k_cache & k_mask;
       eigen_internal_assert(k > 0);
     }
 
-    Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
-    Index n_per_thread = numext::div_ceil(n, num_threads);
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
     if (n_cache <= n_per_thread) {
       // Don't exceed the capacity of the l2 cache.
       eigen_internal_assert(n_cache >= static_cast<Index>(nr));
@@ -131,8 +135,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 
     if (l3 > l2) {
       // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
-      Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
-      Index m_per_thread = numext::div_ceil(m, num_threads);
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
       if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
         m = m_cache & mr_mask;
         eigen_internal_assert(m > 0);

From 761691f18d59790fde24174503c6bdf3d254831b Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg <chtz@informatik.uni-bremen.de>
Date: Mon, 13 Apr 2015 17:15:00 +0200
Subject: [PATCH 7/7] Make conversion from 0 to Scalar explicit (issue reported
 by Brad Bell)

---
 Eigen/src/SparseCore/TriangularSolver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h
index fd1a55bc6..8872012db 100644
--- a/Eigen/src/SparseCore/TriangularSolver.h
+++ b/Eigen/src/SparseCore/TriangularSolver.h
@@ -75,7 +75,7 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
       for(Index i=lhs.rows()-1 ; i>=0 ; --i)
       {
         Scalar tmp = other.coeff(i,col);
-        Scalar l_ii = 0;
+        Scalar l_ii(0);
         LhsIterator it(lhsEval, i);
         while(it && it.index()<i)
           ++it;