From f8fbb3f9a642371d3798e8d6e9638d878e00d945 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Feb 2015 22:22:37 -0800 Subject: [PATCH 01/18] Fixed several compilation warnings reported by clang --- .../CXX11/src/Tensor/TensorBroadcasting.h | 4 ++-- .../src/Tensor/TensorContractionThreadPool.h | 22 +++++++++---------- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 1 - .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 5790e19d6..055a7d407 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -249,7 +249,7 @@ struct TensorEvaluator, Device> innermostLoc = index; } else { if (internal::index_statically_eq()(0, 1)) { - eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0); + eigen_assert(index % m_impl.dimensions()[0] == 0); innermostLoc = 0; } else { innermostLoc = index % m_impl.dimensions()[0]; @@ -302,7 +302,7 @@ struct TensorEvaluator, Device> innermostLoc = index; } else { if (internal::index_statically_eq()(NumDims-1, 1)) { - eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0); + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); innermostLoc = 0; } else { innermostLoc = index % m_impl.dimensions()[NumDims-1]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 8b87f1045..9259c864e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -174,8 +174,6 @@ struct TensorEvaluatorm_device.numThreads(); Index mc = m; @@ -190,8 +188,8 @@ struct TensorEvaluator kernel_promises(num_kernel_promises); std::vector kernel_futures(num_kernel_promises); - for (int i = 0; i < kernel_promises.size(); ++i) { + for (std::size_t i = 0; i < kernel_promises.size(); ++i) { kernel_promises[i].set_value(); kernel_futures[i] = kernel_promises[i].get_future(); } @@ -239,16 +237,16 @@ struct TensorEvaluator 0); - int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; for (int i = 0; i < n_blocks; ++i) { - int future_id = (blockAId * n_blocks + i); + Index future_id = (blockAId * n_blocks + i); wait_until_ready(&kernel_futures[future_id]); kernel_promises[future_id] = Promise(); kernel_futures[future_id] = kernel_promises[future_id].get_future(); @@ -277,9 +275,9 @@ struct TensorEvaluator { } EIGEN_DEVICE_FUNC DSizes() { - for (int i = 0 ; i < NumDims; ++i) { + for (std::size_t i = 0 ; i < NumDims; ++i) { (*this)[i] = 0; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 05ac9bd2f..1eb37bf1c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -131,7 +131,6 @@ class TensorExecutor const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; - Index i = 0; std::vector results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 
a93f48ccb..01ba0a80f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -302,7 +302,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (int i = 0; i < internal::array_size::value; ++i) { + for (std::size_t i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); } From 27f3fb2bcc8ea06f3f471560fc1c87210cd0c62b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Feb 2015 22:54:20 -0800 Subject: [PATCH 02/18] Fixed another clang warning --- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 2714117ab..4b22c64de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -28,6 +28,21 @@ namespace Eigen { namespace internal { +namespace { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) + { +#ifndef __CUDA_ARCH__ + if (sizeof(T) == 8) { + return __builtin_clzl(val); + } + return __builtin_clz(val); +#else + return __clz(val); +#endif + } +}; + template struct TensorIntDivisor { public: @@ -44,11 +59,7 @@ struct TensorIntDivisor { eigen_assert(divider <= (1<<(N-1)) - 1); // fast ln2 -#ifndef __CUDA_ARCH__ - const int leading_zeros = __builtin_clz(divider); -#else - const int leading_zeros = __clz(divider); -#endif + const int leading_zeros = count_leading_zeros(divider); const int log_div = N - (leading_zeros+1); multiplier = (static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1; From bffb6bdf455097ac5a6f98eb4bb549166e318989 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Feb 2015 23:54:43 -0800 Subject: [PATCH 03/18] Made TensorIntDiv.h compile with MSVC --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 4b22c64de..535abd6fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -29,14 +29,16 @@ namespace Eigen { namespace internal { namespace { + // Note: result is undefined if val == 0 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) { #ifndef __CUDA_ARCH__ - if (sizeof(T) == 8) { - return __builtin_clzl(val); - } - return __builtin_clz(val); + return __builtin_clz(static_cast(val)); +#elif EIGEN_COMP_MSVC + DWORD leading_zero = 0; + _BitScanReverse( &leading_zero, value); + return 31 - leading_zero; #else return __clz(val); #endif From 400becc591359596b1236ee8d9f2c99c12961a42 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 15:57:06 +0100 Subject: [PATCH 04/18] Fix typos in block-size testing code, and set peeling on k to 8. 
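The idiom being fixed caps each dimension at its test-specific limit and
then rounds it down to a multiple of the matching register-block size; the
typos made m and n clamp against the wrong inputs (n and k respectively).
As a standalone sketch of the intended pattern (names are illustrative,
not Eigen's):

    #include <algorithm>
    #include <cstddef>

    // Cap size at the test-specific limit, then round it down to a
    // multiple of unit (kr/mr/nr), but never below unit itself.
    inline std::ptrdiff_t cap_and_round(std::ptrdiff_t size,
                                        std::ptrdiff_t cap,
                                        std::ptrdiff_t unit)
    {
      size = std::min(size, cap);
      if (size > unit) size -= size % unit;
      return size;
    }

Written this way, k, m and n each go through the same helper, which makes
the copy-paste mistake fixed below much harder to reintroduce.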
--- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 0890bc690..eb095966b 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -88,15 +88,15 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES EIGEN_UNUSED_VARIABLE(num_threads); enum { - kr = 16, + kr = 8, mr = Traits::mr, nr = Traits::nr }; k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); if (k > kr) k -= k % kr; - m = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); if (m > mr) m -= m % mr; - n = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); if (n > nr) n -= n % nr; return; #endif From a8ad8887bf80869eb216f6f937a559290984a3f7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 16:04:35 +0100 Subject: [PATCH 05/18] Implement a more generic blocking-size selection algorithm. See explanations inlines. It performs extremely well on Haswell. The main issue is to reliably and quickly find the actual cache size to be used for our 2nd level of blocking, that is: max(l2,l3/nb_core_sharing_l3) --- .../Core/products/GeneralBlockPanelKernel.h | 88 ++++++++++++++++++- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index eb095966b..65310e637 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -155,9 +155,91 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // In unit tests we do not want to use extra large matrices, // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + // Early return for small problems because the computation below are time consuming for small problems. + // Perhaps it would make more sense to consider k*n*m?? + // Note that for very tiny problem, this function should be bypassed anyway + // because we use the coefficient-based implementation for them. + if(std::max(k,std::max(m,n))<48) + return; + + typedef typename Traits::ResScalar ResScalar; + enum { + k_peeling = 8, + k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) + }; + + // ---- 1st level of blocking on L1, yields kc ---- + + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel + // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. + // We also include a register-level block of the result (mx x nr). 
+ // (In an ideal world only the lhs panel would stay in L1) + // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: + const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1)); + const Index old_k = k; + if(k>max_kc) + { + // We are really blocking on the third dimension: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the result. + k = (k%max_kc)==0 ? max_kc + : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); + + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); + } + + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- + + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: + // actual_l2 = max(l2, l3/nb_core_sharing_l3) + // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) + // For instance, it corresponds to 6MB of L3 shared among 4 cores. + const Index actual_l2 = 1572864; // == 1.5 MB + + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. + // The second half is implicitly reserved to access the result and lhs coefficients. + // When k(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + if(n>nc) + { + // We are really blocking over the columns: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the packed lhs. + // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1" + n = (n%nc)==0 ? nc + : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1)))); + } + else if(old_k==k) + { + // So far, no blocking at all, i.e., kc==k, and nc==n. + // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + Index problem_size = k*n*sizeof(LhsScalar); + Index actual_lm = actual_l2; + Index max_mc = m; + if(problem_size<=1024) + { + // problem is small enough to keep in L1 + // Let's choose m such that lhs's block fit in 1/3 of L1 + actual_lm = l1; + } + else if(l3!=0 && problem_size<=32768) + { + // we have both L2 and L3, and problem is small enough to be kept in L2 + // Let's choose m such that lhs's block fit in 1/3 of L2 + actual_lm = l2; + max_mc = 576; + } + + Index mc = (std::min)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + if (mc > Traits::mr) mc -= mc % Traits::mr; + + m = (m%mc)==0 ? 
mc + : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); + } #else k = std::min(k,24); n = std::min(n,384/sizeof(RhsScalar)); From a46061ab7bf290bd918716e56bf365d6939ff081 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 16:12:58 +0100 Subject: [PATCH 06/18] Make perf_monitoring/gemm script more flexible: - skip existing dataset - add a "-up" option to recompute the dataset (see script header) - allow to specify a filename prefix --- bench/perf_monitoring/gemm/run_gemm.sh | 97 +++++++++++++++++++++----- 1 file changed, 78 insertions(+), 19 deletions(-) diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh index 4d87ff656..d3a9fadc9 100755 --- a/bench/perf_monitoring/gemm/run_gemm.sh +++ b/bench/perf_monitoring/gemm/run_gemm.sh @@ -1,5 +1,25 @@ #!/bin/bash +# Examples of environment variables to be set: +# PREFIX="haswell-fma-" +# CXX_FLAGS="-mfma" + +# Options: +# -up : enforce the recomputation of existing data, and keep best results as a merging strategy + + +if echo "$*" | grep '\-up' > /dev/null; then + update=true +else + update=false +fi + +if [ $update == true ]; then + echo "(Re-)Compute all changesets and keep bests" +else + echo "Skip previously computed changesets" +fi + if [ ! -d "eigen_src" ]; then hg clone https://bitbucket.org/eigen/eigen eigen_src fi @@ -8,9 +28,32 @@ if [ ! -z '$CXX' ]; then CXX=g++ fi -rm sgemm.out -rm dgemm.out -rm cgemm.out +function make_backup +{ + if [ -f "$1.out" ]; then + mv "$1.out" "$1.backup" + fi +} + +function merge +{ + count1=`echo $1 | wc -w` + count2=`echo $2 | wc -w` + + if [ $count1 == $count2 ]; then + a=( $1 ); b=( $2 ) + res="" + for (( i=0 ; i<$count1 ; i++ )); do + ai=${a[$i]}; bi=${b[$i]} + tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l` + res="$res $tmp" + done + echo $res + + else + echo $1 + fi +} function test_current { @@ -18,16 +61,32 @@ function test_current scalar=$2 name=$3 - if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then - res=`./$name` - echo $res - echo "$rev $res" >> $name.out + prev=`grep $rev "$name.backup" | cut -c 14-` + res=$prev + count_rev=`echo $prev | wc -w` + count_ref=`cat "settings.txt" | wc -l` + if [ $update == true ] || [ $count_rev != $count_ref ]; then + if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then + curr=`./$name` + echo merge $prev + echo with $curr + res=`merge "$curr" "$prev"` + echo $res + echo "$rev $res" >> $name.out + else + echo "Compilation failed, skip rev $rev" + fi else - echo "Compilation failed, skip rev $rev" + echo "Skip existing results for $rev / $name" + echo "$rev $res" >> $name.out fi } -while read rev +make_backup $PREFIX"sgemm" +make_backup $PREFIX"dgemm" +make_backup $PREFIX"cgemm" + +cut -f1 -d"#" < changesets.txt | while read rev do if [ ! -z '$rev' ]; then echo "Testing rev $rev" @@ -36,27 +95,27 @@ do actual_rev=`hg identify | cut -f1 -d' '` cd .. 
- test_current $actual_rev float sgemm - test_current $actual_rev double dgemm - test_current $actual_rev "std::complex" cgemm + test_current $actual_rev float $PREFIX"sgemm" + test_current $actual_rev double $PREFIX"dgemm" + test_current $actual_rev "std::complex" $PREFIX"cgemm" fi -done < changesets.txt +done echo "Float:" -cat sgemm.out +cat $PREFIX"sgemm.out" echo "" echo "Double:" -cat dgemm.out +cat $PREFIX"dgemm.out" echo "" echo "Complex:" -cat cgemm.out +cat $PREFIX"cgemm.out" echo "" -./make_plot.sh sgemm -./make_plot.sh dgemm -./make_plot.sh cgemm +./make_plot.sh $PREFIX"sgemm" +./make_plot.sh $PREFIX"dgemm" +./make_plot.sh $PREFIX"cgemm" From 2e9cb06a87213138618487e62d7893ad3a9b4546 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 16:13:33 +0100 Subject: [PATCH 07/18] Update changeset list to be checked by perf_monitoring/gemm. --- bench/perf_monitoring/gemm/changesets.txt | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index 291fd55b4..f19b4287d 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -18,8 +18,22 @@ before-evaluators 6334:f6a45e5b8b7c 6639:c9121c60b5c7 6655:06f163b5221f -6677:700e023044e7 +6677:700e023044e7 # FMA has been wrongly disabled 6681:11d31dafb0e3 -6844:039efd86b75c -6845:7333ed40c6ef -6911:6192dd812d84 \ No newline at end of file +6699:5e6e8e10aad1 # merge default to tensors +6726:ff2d2388e7b9 # merge default to tensors +6742:0cbd6195e829 # merge default to tensors +6747:853d2bafeb8f # Generalized the gebp apis +6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation +6781:9cc5a931b2c6 # generalized gemv +6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product +6844:039efd86b75c # merge tensor +6845:7333ed40c6ef # change prefetching in gebp +6856:b5be5e10eb7f # merge index conversion +6893:c3a64aba7c70 # clean blocking size computation +6898:6fb31ebe6492 # rotating kernel for ARM +6899:877facace746 # rotating kernel for ARM only +6904:c250623ae9fa # result_of +6921:915f1b1fc158 # fix prefetching change for ARM +6923:9ff25f6dacc6 # prefetching +6933:52572e60b5d3 # blocking size strategy \ No newline at end of file From 4ec3f04b3ab42a595b99bb2902758f7d748ab80c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 17:00:36 +0100 Subject: [PATCH 08/18] Make sure that the block size computation is tested by our unit test. 
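Instead of clamping k/m/n directly (which bypassed most of the selection
logic added in the previous commits), pretend the machine has tiny caches
so that the full code path runs even for the small matrices used in the
unit tests. A rough sketch of the kind of invariant this makes testable
(the helper below and the exact template signature are assumptions for
illustration, not the actual unit test):

    // Build with -DEIGEN_DEBUG_SMALL_PRODUCT_BLOCKS so the reduced
    // cache sizes are picked up by the blocking computation.
    #include <Eigen/Core>
    #include <cassert>
    #include <cstddef>

    template <typename Scalar>
    void check_blocking(std::ptrdiff_t k, std::ptrdiff_t m, std::ptrdiff_t n)
    {
      const std::ptrdiff_t k0 = k, m0 = m, n0 = n;
      Eigen::internal::computeProductBlockingSizes<Scalar, Scalar>(
          k, m, n, std::ptrdiff_t(1));
      // Blocking may shrink each dimension but must never enlarge it,
      // and must always leave something to compute.
      assert(k >= 1 && k <= k0);
      assert(m >= 1 && m <= m0);
      assert(n >= 1 && n <= n0);
    }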
--- .../Core/products/GeneralBlockPanelKernel.h | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 65310e637..06c7f362e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -153,8 +153,13 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads } else { // In unit tests we do not want to use extra large matrices, - // so we reduce the block size to check the blocking strategy is not flawed -#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + // so we reduce the cache size to check the blocking strategy is not flawed +#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + l1 = 4*1024; + l2 = 32*1024; + l3 = 512*1024; +#endif + // Early return for small problems because the computation below are time consuming for small problems. // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway @@ -195,7 +200,13 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // actual_l2 = max(l2, l3/nb_core_sharing_l3) // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) // For instance, it corresponds to 6MB of L3 shared among 4 cores. + #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + const Index actual_l2 = l3; + #else const Index actual_l2 = 1572864; // == 1.5 MB + #endif + + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. @@ -240,11 +251,6 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads m = (m%mc)==0 ? mc : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); } -#else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); -#endif } } From bcf9bb5c1f1834487f10721abc909be4a62574d1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 17:01:33 +0100 Subject: [PATCH 09/18] Avoid packing rhs multiple-times when blocking on the lhs only. --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index c38c12c31..c76f48154 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -164,6 +164,8 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); + + const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols; // For each horizontal panel of the rhs, and corresponding panel of the lhs... 
for(Index i2=0; i2 Date: Thu, 26 Feb 2015 09:27:53 -0800 Subject: [PATCH 10/18] Added missing copy constructor --- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index c119b30e2..054ecf7b5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -85,6 +85,15 @@ class TensorLayoutSwapOp : public TensorBase, WriteA const typename internal::remove_all::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) From 57154fdb32a4853bff458f8014b037d5e41b9858 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 26 Feb 2015 11:13:42 -0800 Subject: [PATCH 11/18] Can now use the tensor 'reverse' operation as a lvalue --- .../Eigen/CXX11/src/Tensor/TensorBase.h | 5 + .../Eigen/CXX11/src/Tensor/TensorReverse.h | 93 ++++++++++++++++--- unsupported/test/cxx11_tensor_reverse.cpp | 35 +++++-- 3 files changed, 110 insertions(+), 23 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index cfcf18e8e..201b0fc9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -549,6 +549,11 @@ class TensorBase : public TensorBase(derived(), offset, dim); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp shuffle(const Shuffle& shuffle) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index ad21e966b..16bef2ad3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -49,12 +49,9 @@ struct nested, 1, } // end namespace internal - - - template class TensorReverseOp : public TensorBase, ReadOnlyAccessors> + XprType>, WriteAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -67,8 +64,8 @@ class TensorReverseOp : public TensorBase::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, - const ReverseDimensions& reverse_dims) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp( + const XprType& expr, const ReverseDimensions& reverse_dims) : m_xpr(expr), m_reverse_dims(reverse_dims) {} EIGEN_DEVICE_FUNC @@ -78,12 +75,30 @@ class TensorReverseOp : public TensorBase::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + protected: typename 
XprType::Nested m_xpr; const ReverseDimensions m_reverse_dims; }; - // Eval as rvalue template struct TensorEvaluator, Device> @@ -134,8 +149,8 @@ struct TensorEvaluator, Device m_impl.cleanup(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex( + Index index) const { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -152,7 +167,6 @@ struct TensorEvaluator, Device } else { inputIndex += index; } - return m_impl.coeff(inputIndex); } else { for (int i = 0; i < NumDims - 1; ++i) { Index idx = index / m_strides[i]; @@ -167,8 +181,13 @@ struct TensorEvaluator, Device } else { inputIndex += index; } - return m_impl.coeff(inputIndex); } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( + Index index) const { + return m_impl.coeff(reverseIndex(index)); } template @@ -199,9 +218,57 @@ struct TensorEvaluator, Device ReverseDimensions m_reverse; }; +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, + Device> { + typedef TensorEvaluator, + Device> Base; + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_impl.coeffRef(this->reverseIndex(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + +}; - -} // end namespace Eigen +} // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp index 4c0be35da..f96c21fa3 100644 --- a/unsupported/test/cxx11_tensor_reverse.cpp +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -94,7 +94,7 @@ static void test_simple_reverse() template -static void test_expr_reverse() +static void test_expr_reverse(bool LValue) { Tensor tensor(2,3,5,7); tensor.setRandom(); @@ -105,9 +105,12 @@ static void test_expr_reverse() dim_rev[2] = false; dim_rev[3] = true; - - Tensor expected; - expected = tensor.reverse(dim_rev); + Tensor expected(2, 3, 5, 7); + if (LValue) { + expected.reverse(dim_rev) = tensor; + } else { + expected = tensor.reverse(dim_rev); + } Tensor result(2,3,5,7); @@ -117,8 +120,13 @@ static void test_expr_reverse() array dst_slice_start{{0,0,0,0}}; for (int i = 0; i < 5; 
++i) { - result.slice(dst_slice_start, dst_slice_dim) = - tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + if (LValue) { + result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) = + tensor.slice(src_slice_start, src_slice_dim); + } else { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + } src_slice_start[2] += 1; dst_slice_start[2] += 1; } @@ -141,8 +149,13 @@ static void test_expr_reverse() dst_slice_start[2] = 0; result.setRandom(); for (int i = 0; i < 5; ++i) { - result.slice(dst_slice_start, dst_slice_dim) = - tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + if (LValue) { + result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) = + tensor.slice(dst_slice_start, dst_slice_dim); + } else { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } dst_slice_start[2] += 1; } @@ -162,6 +175,8 @@ void test_cxx11_tensor_reverse() { CALL_SUBTEST(test_simple_reverse()); CALL_SUBTEST(test_simple_reverse()); - CALL_SUBTEST(test_expr_reverse()); - CALL_SUBTEST(test_expr_reverse()); + CALL_SUBTEST(test_expr_reverse(true)); + CALL_SUBTEST(test_expr_reverse(true)); + CALL_SUBTEST(test_expr_reverse(false)); + CALL_SUBTEST(test_expr_reverse(false)); } From f074bb4b5f6ddf0a67348365499a5c5c335a5e17 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 26 Feb 2015 11:14:23 -0800 Subject: [PATCH 12/18] Fixed another compilation problem with TensorIntDiv.h --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 535abd6fd..11c7ce443 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -33,17 +33,17 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) { -#ifndef __CUDA_ARCH__ - return __builtin_clz(static_cast(val)); +#ifdef __CUDA_ARCH__ + return __clz(val); #elif EIGEN_COMP_MSVC DWORD leading_zero = 0; _BitScanReverse( &leading_zero, value); return 31 - leading_zero; #else - return __clz(val); + return __builtin_clz(static_cast(val)); #endif } -}; +} template struct TensorIntDivisor { From b7fc8746e07556404672b4da2d53dd323c441634 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 10:01:59 -0500 Subject: [PATCH 13/18] Replace a static assert by a runtime one, fixes the build of unit tests on ARM Also safely assert in the non-implemented path that should never be taken in practice, and would return wrong results. 
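The general failure mode: a static assertion fires for every
instantiation, including instantiations that only exist behind a runtime
guard and are never executed, so otherwise-dead code can break the whole
build. A minimal generic illustration (toy code, not Eigen's types):

    #include <cassert>

    template <int Offset>
    int rotated_lane(int lane)
    {
      // A compile-time assertion on Offset here would reject the program
      // as soon as, e.g., rotated_lane<4> is instantiated, even if every
      // call site guards it with a runtime condition. The runtime assert
      // below only fires if the invalid path is actually executed.
      assert(Offset < 4 && "unimplemented rotation offset");
      return (lane + Offset) % 4;
    }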
--- Eigen/src/Core/GenericPacketMath.h | 7 +++++-- Eigen/src/Core/util/StaticAssert.h | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 967a07df5..afdd6f71e 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -290,7 +290,11 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& template struct protate_impl { - static Packet run(const Packet& a) { return a; } + static Packet run(const Packet& a) + { + eigen_assert(false && "unimplemented"); + return a; + } }; /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, @@ -299,7 +303,6 @@ struct protate_impl */ template EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a) { - EIGEN_STATIC_ASSERT(offset < unpacket_traits::size, ROTATION_BY_ILLEGAL_OFFSET); return offset ? protate_impl::run(a) : a; } diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 5e16b775b..7538a0633 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -93,8 +93,7 @@ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - ROTATION_BY_ILLEGAL_OFFSET + STORAGE_LAYOUT_DOES_NOT_MATCH }; }; From f5ff4d826fa5728f6319dcc7159b1337a2461880 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 10:56:50 -0500 Subject: [PATCH 14/18] Fix NEON build flags: in the current NDK, at least with the clang-3.5 toolchain, -mfpu=neon is not enough to activate NEON, since it's incompatible with the default float ABI, and I have to pass -mfloat-abi=softfp (which is what everyone does in practice). In fact, it would be a good idea to pass -mfloat-abi=softfp all the time, regardless of NEON. Also removing the -mcpu=cortex-a8, as 1) it's not needed and 2) if we really wanted to pass a specific -mcpu flag, that would presumably to tune performance for benchmarks, and it would then not really make sense to tune for the very old cortex-a8 (it reflects ARM CPUs from 5 years ago). --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 13f9c8f9d..2c1ae428e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,7 +227,7 @@ if(NOT MSVC) option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp") message(STATUS "Enabling NEON in tests/examples") endif() From 33669348c490372e598f839754257f3710195337 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 11:35:37 -0500 Subject: [PATCH 15/18] Disable Packet2f/2i halfpacket support in NEON. I believe that it was erroneously turned on, since Packet2f/2i intrinsics are unimplemented, and code trying to use halfpackets just fails to compile on NEON, as it tries to use the default implementation of pload/pstore and the types don't match. 
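Schematically, generic code consumes the trait like this (a simplified
model of a call site, not Eigen's actual code), which shows why a half
typedef without matching packet specializations cannot compile once any
half-packet path is instantiated:

    // With half == Packet4f this only instantiates intrinsics that
    // exist. With half == Packet2f, pload/pstore fall back to their
    // unspecialized forms and the pointer types no longer match.
    // (Alignment requirements are ignored for brevity.)
    template <typename Scalar>
    void copy_half_packet(Scalar* dst, const Scalar* src)
    {
      typedef typename Eigen::internal::packet_traits<Scalar>::half HalfPacket;
      HalfPacket p = Eigen::internal::pload<HalfPacket>(src);
      Eigen::internal::pstore(dst, p);
    }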
--- Eigen/src/Core/arch/NEON/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e9af45f22..12198d79c 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -76,12 +76,12 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits : default_packet_traits { typedef Packet4f type; - typedef Packet2f half; + typedef Packet4f half; // Packet2f intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4, - HasHalfPacket=1, + HasHalfPacket=0, // Packet2f intrinsics not implemented yet HasDiv = 1, // FIXME check the Has* @@ -95,12 +95,12 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; - typedef Packet2i half; + typedef Packet4i half; // Packet2i intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, - HasHalfPacket=1 + HasHalfPacket=0, // Packet2i intrinsics not implemented yet // FIXME check the Has* }; }; From 2fc3b484d71486bad6a43488a182fa7945d65f45 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 11:37:45 -0500 Subject: [PATCH 16/18] remove trailing comma --- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 12198d79c..8dd1e1370 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -100,7 +100,7 @@ template<> struct packet_traits : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, size=4, - HasHalfPacket=0, // Packet2i intrinsics not implemented yet + HasHalfPacket=0 // Packet2i intrinsics not implemented yet // FIXME check the Has* }; }; From 90f4e90f1d8ff6efd3ee695cbb25b3d6d6ae4fee Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 27 Feb 2015 09:22:37 -0800 Subject: [PATCH 17/18] Fixed off-by-one error that prevented the evaluation of small tensor expressions from being vectorized --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 1eb37bf1c..bb2f8b977 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -97,7 +97,7 @@ struct EvalRange { Index i = first; static const int PacketSize = unpacket_traits::size; - if (last - first > PacketSize) { + if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index lastPacket = last - (last % PacketSize); for (; i < lastPacket; i += PacketSize) { From 6466fa63be51b32c493401bb7cf6aa4b2e1ef385 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 15:30:10 -0500 Subject: [PATCH 18/18] Reimplement the selection between rotating and non-rotating kernels using templates instead of macros and if()'s. That was needed to fix the build of unit tests on ARM, which I had broken. My bad for not testing earlier. 
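The pattern, distilled: hoist the configuration predicate into a bool
template parameter so that each variant is a distinct type, and rely on
the fact that member functions of a (partially specialized) class
template are only instantiated when used. A generic sketch (toy types,
not the actual helper):

    // Default: plain, portable path.
    template <typename Packet, bool UseRotation>
    struct LoadHelper
    {
      static Packet load(const Packet* p) { return p[0]; }
    };

    // Partial specialization: the rotating path. Because it is still a
    // template, its member body is only type-checked when actually
    // instantiated, so it may use operations that exist only in the
    // configuration that selects it.
    template <typename Packet>
    struct LoadHelper<Packet, true>
    {
      static Packet load(const Packet* p) { return p[1]; } // stand-in for protate<1>
    };

    template <typename Packet, bool UseRotation>
    Packet micro_kernel(const Packet* data)
    {
      return LoadHelper<Packet, UseRotation>::load(data);
    }

This removes the #define / if() mix below without changing which code the
selected configuration executes.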
--- Eigen/src/Core/GenericPacketMath.h | 7 +- .../Core/products/GeneralBlockPanelKernel.h | 155 +++++++++++------- 2 files changed, 95 insertions(+), 67 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index afdd6f71e..e48c064ce 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -290,11 +290,8 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& template struct protate_impl { - static Packet run(const Packet& a) - { - eigen_assert(false && "unimplemented"); - return a; - } + // Empty so attempts to use this unimplemented path will fail to compile. + // Only specializations of this template should be used. }; /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 06c7f362e..9061fd936 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -800,6 +800,80 @@ protected: conj_helper cj; }; +// helper for the rotating kernel below +template +struct PossiblyRotatingKernelHelper +{ + // default implementation, not rotating + + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + + template + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to); + } + + void unrotateResult(AccPacket&, + AccPacket&, + AccPacket&, + AccPacket&) + { + } +}; + +// rotating implementation +template +struct PossiblyRotatingKernelHelper +{ + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + template + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + if (Index == 0) { + to = pload(from + 4*K*Traits::RhsProgress); + } else { + EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); + to = protate<1>(to); + } + } + + void unrotateResult(AccPacket& res0, + AccPacket& res1, + AccPacket& res2, + AccPacket& res3) + { + PacketBlock resblock; + resblock.packet[0] = res0; + resblock.packet[1] = res1; + resblock.packet[2] = res2; + resblock.packet[3] = res3; + ptranspose(resblock); + resblock.packet[3] = protate<1>(resblock.packet[3]); + resblock.packet[2] = protate<2>(resblock.packet[2]); + resblock.packet[1] = protate<3>(resblock.packet[1]); + ptranspose(resblock); + res0 = resblock.packet[0]; + res1 = resblock.packet[1]; + res2 = resblock.packet[2]; + res3 = resblock.packet[3]; + } +}; + /* optimized GEneral packed Block * packed Panel product kernel * * Mixing type logic: C += A * B @@ -833,6 +907,16 @@ struct gebp_kernel ResPacketSize = Traits::ResPacketSize }; + + static const bool UseRotatingKernel = + EIGEN_ARCH_ARM && + internal::is_same::value && + internal::is_same::value && + internal::is_same::value && + Traits::LhsPacketSize == 4 && + Traits::RhsPacketSize == 4 && + Traits::ResPacketSize == 4; + EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index 
depth, Index cols, ResScalar alpha, @@ -866,6 +950,8 @@ void gebp_kernel=3*Traits::LhsProgress) { + PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); + // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) for(Index i=0; i(&blB[(0+4*K)*RhsProgress]); \ - } else { \ - EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \ - B_0 = protate<1>(B_0); \ - } \ - } else { \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \ - } \ - } while (false) -#else -#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) -#endif - #define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ @@ -947,19 +1002,19 @@ void gebp_kernel(B_0, blB); \ traits.madd(A0, B_0, C0, T0); \ traits.madd(A1, B_0, C4, T0); \ traits.madd(A2, B_0, C8, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs(B_0, blB); \ traits.madd(A0, B_0, C1, T0); \ traits.madd(A1, B_0, C5, T0); \ traits.madd(A2, B_0, C9, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs(B_0, blB); \ traits.madd(A0, B_0, C2, T0); \ traits.madd(A1, B_0, C6, T0); \ traits.madd(A2, B_0, C10, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs(B_0, blB); \ traits.madd(A0, B_0, C3 , T0); \ traits.madd(A1, B_0, C7, T0); \ traits.madd(A2, B_0, C11, B_0); \ @@ -992,34 +1047,10 @@ void gebp_kernel resblock; \ - resblock.packet[0] = res0; \ - resblock.packet[1] = res1; \ - resblock.packet[2] = res2; \ - resblock.packet[3] = res3; \ - ptranspose(resblock); \ - resblock.packet[3] = protate<1>(resblock.packet[3]); \ - resblock.packet[2] = protate<2>(resblock.packet[2]); \ - resblock.packet[1] = protate<3>(resblock.packet[1]); \ - ptranspose(resblock); \ - res0 = resblock.packet[0]; \ - res1 = resblock.packet[1]; \ - res2 = resblock.packet[2]; \ - res3 = resblock.packet[3]; \ - } while (false) - - EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3); - EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7); - EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11); - } -#endif + possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3); + possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7); + possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11); ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha);
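For intuition about the rotation bookkeeping in the kernel above, here is
a scalar model of a 4-lane packet rotation (the direction is chosen to
match an ARM vext-style rotation and is illustrative, not a statement of
Eigen's exact lane convention):

    #include <cstdio>

    // Scalar stand-in for protate<K> on a 4-lane packet: lane i takes
    // the old lane (i + K) % 4, so rotating {0,1,2,3} by 1 yields
    // {1,2,3,0}.
    template <int K>
    void rotate4(float (&v)[4])
    {
      float tmp[4];
      for (int i = 0; i < 4; ++i) tmp[i] = v[(i + K) % 4];
      for (int i = 0; i < 4; ++i) v[i] = tmp[i];
    }

    int main()
    {
      float v[4] = {0.f, 1.f, 2.f, 3.f};
      rotate4<1>(v); // {1, 2, 3, 0}
      rotate4<3>(v); // a further rotation by 3 is a full turn: identity
      std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]); // 0 1 2 3
      return 0;
    }

Rotations by k and 4-k compose to the identity, which is, roughly, the
property unrotateResult exploits when it applies complementary rotations
of 1, 2 and 3 across the transposed accumulator block.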