From d6568425f8b384ef36da1f16be76a3977fb90846 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 31 Jul 2018 08:13:43 +0000 Subject: [PATCH 01/10] Close branch tiling_3. From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 31 Jul 2018 15:56:31 -0700 Subject: [PATCH 02/10] Add block evaluation support to TensorOps --- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 14 +- .../CXX11/src/Tensor/TensorBroadcasting.h | 333 +++++++++++- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 144 +++++- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 218 +++++++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 333 ++++++++++-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 437 +++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 237 ++++++++- unsupported/test/cxx11_tensor_executor.cpp | 480 ++++++++++++++++-- 8 files changed, 2047 insertions(+), 149 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 84cf6d216..33c4ef5b7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -214,7 +214,7 @@ class TensorBlockIO { num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1); const StorageIndex block_dim_for_tensor_stride1_dim = NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim]; - size_t block_inner_dim_size = + Index block_inner_dim_size = NumDims == 0 ? 1 : block.block_sizes()[block_dim_for_tensor_stride1_dim]; for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { @@ -745,16 +745,15 @@ class TensorBlockMapper { if (block_shape == TensorBlockShapeType::kUniformAllDims) { // Tensor will not fit within 'min_target_size' budget: calculate tensor // block dimension sizes based on "square" dimension size target. - const size_t dim_size_target = static_cast( + const Index dim_size_target = static_cast( std::pow(static_cast(min_target_size), 1.0 / static_cast(block_dim_sizes.rank()))); - for (size_t i = 0; i < block_dim_sizes.rank(); ++i) { + for (Index i = 0; i < block_dim_sizes.rank(); ++i) { // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it // a multiple of the packet size. Note that reducing // 'block_dim_size' in this manner can increase the number of // blocks, and so will amplify any per-block overhead. - block_dim_sizes[i] = numext::mini( - dim_size_target, static_cast(tensor_dims[i])); + block_dim_sizes[i] = numext::mini(dim_size_target, tensor_dims[i]); } // Add any un-allocated coefficients to inner dimension(s). 
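A sketch of the kUniformAllDims heuristic changed in this hunk: aim for roughly equal ("square") block extents whose product is about min_target_size, clamp by the real tensor extents, then grow inner dimensions with whatever budget is left over. Plain std::vector stands in for Eigen's DSizes, and the helper name is illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Choose per-dimension block sizes: a "square" target per dimension,
// clamped by the tensor extents, then leftover budget goes to inner dims.
std::vector<int64_t> UniformBlockSizes(const std::vector<int64_t>& dims,
                                       int64_t min_target_size) {
  const int rank = static_cast<int>(dims.size());
  if (rank == 0) return {};
  const int64_t target = static_cast<int64_t>(std::pow(
      static_cast<double>(min_target_size), 1.0 / static_cast<double>(rank)));
  std::vector<int64_t> block(rank);
  for (int i = 0; i < rank; ++i) {
    block[i] = std::min(std::max<int64_t>(target, 1), dims[i]);
  }
  // Hand any un-allocated budget to the inner (fastest-varying) dims first.
  for (int i = 0; i < rank; ++i) {
    int64_t total = 1;
    for (int64_t b : block) total *= b;
    if (total >= min_target_size) break;
    const int64_t budget = min_target_size / (total / block[i]);
    block[i] = std::min(dims[i], std::max(block[i], budget));
  }
  return block;
}

With this sketch, a 64x64x64 tensor and a 2048-coefficient budget first get 12x12x12 blocks, and the leftover pass then widens only the innermost dimension.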
StorageIndex total_size = block_dim_sizes.TotalSize(); @@ -789,9 +788,8 @@ class TensorBlockMapper { } } - eigen_assert( - block_dim_sizes.TotalSize() >= - numext::mini(min_target_size, tensor_dims.TotalSize())); + eigen_assert(block_dim_sizes.TotalSize() >= + numext::mini(min_target_size, tensor_dims.TotalSize())); return block_dim_sizes; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 343ab6269..a851e7f55 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,16 +108,29 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, + IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_broadcast(op.broadcast()),m_impl(op.expression(), device) - { + using ScalarNoConst = typename internal::remove_const::type; + + // Block based access to the XprType (input) tensor. + using TensorBlock = internal::TensorBlock; + using TensorBlockReader = internal::TensorBlockReader; + // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. + using BroadcastDimensions = DSizes; + using BroadcastTensorBlock = internal::TensorBlock; + using BroadcastTensorBlockReader = internal::TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_broadcast(op.broadcast()), + m_impl(op.expression(), device) { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. @@ -216,8 +229,7 @@ struct TensorEvaluator, Device> } // TODO: attempt to speed this up. 
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; @@ -243,11 +255,15 @@ struct TensorEvaluator, Device> inputIndex += (index % m_impl.dimensions()[0]); } } - return m_impl.coeff(inputIndex); + return inputIndex; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; @@ -263,17 +279,22 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); inputIndex += index; } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); } } - return m_impl.coeff(inputIndex); + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + return m_impl.coeff(indexRowMajor(index)); } template @@ -553,13 +574,291 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); + Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + block_total_size_max)); + + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = m_impl.coeff(0); + return; + } + + // Because we only support kSkewedInnerDims blocking, block size should be + // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size + // for the first outer dim, and 1 for other outer dims. This is guaranteed + // by MergeResourceRequirements() in TensorBlock.h. + const auto& output_block_sizes = output_block->block_sizes(); + const auto& output_block_strides = output_block->block_strides(); + + // Find where outer dims start. + int outer_dim_start = 0; + Index outer_dim_size = 1, inner_dim_size = 1; + for (int i = 0; i < NumDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? 
i + : NumDims - i - 1; + if (i > outer_dim_start) { + eigen_assert(output_block_sizes[dim] == 1); + } else if (output_block_sizes[dim] != m_dimensions[dim]) { + eigen_assert(output_block_sizes[dim] < m_dimensions[dim]); + outer_dim_size = output_block_sizes[dim]; + } else { + inner_dim_size *= output_block_sizes[dim]; + ++outer_dim_start; + } + } + + if (inner_dim_size == 0 || outer_dim_size == 0) { + return; + } + + const auto& input_dims = m_impl.dimensions(); + + // Pre-fill input_block_sizes, broadcast_block_sizes, + // broadcast_block_strides, and broadcast_tensor_strides. Later on we will + // only modify the outer_dim_start-th dimension on these arrays. + + // Calculate the input block size for looking into the input. + Dimensions input_block_sizes; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < outer_dim_start; ++i) { + input_block_sizes[i] = input_dims[i]; + } + for (int i = outer_dim_start; i < NumDims; ++i) { + input_block_sizes[i] = 1; + } + } else { + for (int i = 0; i < outer_dim_start; ++i) { + input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1]; + } + for (int i = outer_dim_start; i < NumDims; ++i) { + input_block_sizes[NumDims - i - 1] = 1; + } + } + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...]. + // + // - broadcast_block_strides is [output_block_strides[0], + // output_block_strides[0] * d_0, + // output_block_strides[1], + // output_block_strides[1] * d_1, + // ...]. + // + // - broadcast_tensor_strides is [output_block_strides[0], + // 0, + // output_block_strides[1], + // 0, + // ...]. + BroadcastDimensions broadcast_block_sizes, broadcast_block_strides, + broadcast_tensor_strides; + + for (int i = 0; i < outer_dim_start; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumDims - i - 1; + const int copy_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 2 * i + : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = + static_cast(Layout) == static_cast(ColMajor) ? copy_dim + 1 + : copy_dim - 1; + broadcast_block_sizes[copy_dim] = input_dims[dim]; + broadcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + broadcast_block_strides[copy_dim] = output_block_strides[dim]; + broadcast_block_strides[broadcast_dim] = + output_block_strides[dim] * input_dims[dim]; + broadcast_tensor_strides[copy_dim] = m_inputStrides[dim]; + broadcast_tensor_strides[broadcast_dim] = 0; + } + for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : 2 * NumDims - i - 1; + broadcast_block_sizes[dim] = 1; + broadcast_block_strides[dim] = 0; + broadcast_tensor_strides[dim] = 0; + } + + const int outer_dim = static_cast(Layout) == static_cast(ColMajor) + ? outer_dim_start + : NumDims - outer_dim_start - 1; + + if (outer_dim_size == 1) { + // We just need one block read using the ready-set values above. + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } else if (input_dims[outer_dim] == 1) { + // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size. + const int broadcast_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } else { + // The general case. Let's denote the output block as x[..., + // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on + // the outer_dim_start-th dimension (< NumDims). We need to split the + // a:a+outer_dim_size into possibly 3 sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[outer_dim_start] in [a, a+outer_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start] + // in [a, a+outer_dim_size]. + // + // (3) c:a+outer_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index outer_dim_left_index = + output_block->first_coeff_index() / m_outputStrides[outer_dim]; + + // Find b and c. + const Index input_outer_dim_size = input_dims[outer_dim]; + + // First multiple after a. This is b when <= outer_dim_left_index + + // outer_dim_size. + const Index first_multiple = + divup(outer_dim_left_index, input_outer_dim_size) * + input_outer_dim_size; + + if (first_multiple <= outer_dim_left_index + outer_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = (outer_dim_left_index + outer_dim_size) / + input_outer_dim_size * input_outer_dim_size; + const int copy_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + const int broadcast_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + if (first_multiple > outer_dim_left_index) { + const Index head_size = first_multiple - outer_dim_left_index; + input_block_sizes[outer_dim] = head_size; + broadcast_block_sizes[copy_outer_dim] = head_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = 1; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } + if (first_multiple < last_multiple) { + input_block_sizes[outer_dim] = input_outer_dim_size; + broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = + (last_multiple - first_multiple) / input_outer_dim_size; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (first_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, + offset, output_block); + } + if (last_multiple < outer_dim_left_index + outer_dim_size) { + const Index tail_size = + outer_dim_left_index + outer_dim_size - last_multiple; + input_block_sizes[outer_dim] = tail_size; + broadcast_block_sizes[copy_outer_dim] = tail_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = 1; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (last_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, + offset, output_block); + } + } else { + // b and c do not exist. + const int copy_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + input_block_sizes[outer_dim] = outer_dim_size; + broadcast_block_sizes[copy_outer_dim] = outer_dim_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } + } + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock( + const Dimensions& input_block_sizes, + const BroadcastDimensions& broadcast_block_sizes, + const BroadcastDimensions& broadcast_block_strides, + const BroadcastDimensions& broadcast_tensor_strides, Index offset, + TensorBlock* output_block) const { + TensorBlock input_view_block( + static_cast(Layout) == static_cast(ColMajor) + ? indexColMajor(output_block->first_coeff_index() + offset) + : indexRowMajor(output_block->first_coeff_index() + offset), + input_block_sizes, Dimensions(m_inputStrides), + Dimensions(m_inputStrides), NULL); + + internal::TensorBlockView input_block(m_device, m_impl, + input_view_block); + BroadcastTensorBlock broadcast_block( + 0, broadcast_block_sizes, broadcast_block_strides, + broadcast_tensor_strides, output_block->data() + offset); + + BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data()); + } + protected: + const Device& m_device; const Broadcast m_broadcast; Dimensions m_dimensions; array m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 085c05f3d..a0d039e64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,14 +144,19 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. 
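The broadcasting block() above relies on the "2x tensor rank, zero input stride" trick described in its comments: each input dimension contributes a copy dimension plus a broadcast dimension whose source stride is 0. A self-contained illustration of why that works, using plain std::vector descriptors and a generic strided copy instead of Eigen's TensorBlockReader; StridedCopy and the concrete sizes below are illustrative, not part of the patch.

#include <cstdio>
#include <vector>

// Generic strided copy: walks an N-dimensional index space and copies
// src[sum(idx[i] * src_stride[i])] into dst[sum(idx[i] * dst_stride[i])].
// Broadcasting falls out of giving a dimension a zero source stride.
void StridedCopy(const std::vector<int>& sizes,
                 const std::vector<int>& dst_strides,
                 const std::vector<int>& src_strides,
                 const float* src, float* dst) {
  const int rank = static_cast<int>(sizes.size());
  std::vector<int> idx(rank, 0);
  while (true) {
    int s = 0, d = 0;
    for (int i = 0; i < rank; ++i) {
      s += idx[i] * src_strides[i];
      d += idx[i] * dst_strides[i];
    }
    dst[d] = src[s];
    int i = 0;
    for (; i < rank; ++i) {
      if (++idx[i] < sizes[i]) break;  // column-major order: dim 0 fastest
      idx[i] = 0;
    }
    if (i == rank) return;
  }
}

int main() {
  // Broadcast a length-3 input 4 times: express the length-12 output as a
  // rank-2 iteration space [copy_dim = 3, broadcast_dim = 4] where the
  // broadcast dimension has source stride 0.
  const float in[3] = {1.f, 2.f, 3.f};
  float out[12];
  StridedCopy({3, 4}, {1, 3}, {1, 0}, in, out);
  for (float v : out) std::printf("%g ", v);  // 1 2 3 1 2 3 1 2 3 1 2 3
  std::printf("\n");
}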
- IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) { @@ -184,6 +189,23 @@ struct TensorEvaluator, Device> } m_inputStride *= input_dims[m_dim.actualDim()]; m_inputOffset = m_stride * op.offset(); + + if (BlockAccess) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -266,6 +288,60 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function (experiment with + // using a fixed block size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate input block sizes. + const DSizes& output_block_sizes = + output_block->block_sizes(); + const DSizes& output_block_strides = + output_block->block_strides(); + const Index chip_dim = m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Instantiate and read input block from input tensor. 
+ InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, input_block_strides, + m_inputStrides, output_block->data()); + m_impl.block(&input_block); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || @@ -316,6 +392,8 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; + Index m_block_total_size_max; + DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; const Device& m_device; @@ -342,12 +420,18 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -395,6 +479,50 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const OutputTensorBlock& output_block) { + // Calculate input block sizes. + const DSizes& output_block_sizes = + output_block.block_sizes(); + const DSizes& output_block_strides = + output_block.block_strides(); + const Index chip_dim = this->m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Write input block. 
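Both block() and writeBlock() above build the input block with the same size/stride mapping: keep the output block's dimensions on either side of the chipped dimension, insert a size-1 dimension at chip_dim, and give it a stride consistent with its column-major neighbour. A reduced sketch with std::vector in place of DSizes; BlockDesc and ChipToInputBlock are illustrative names.

#include <vector>

struct BlockDesc {
  std::vector<long> sizes;
  std::vector<long> strides;
};

// Map an (N-1)-dimensional chip block onto the N-dimensional input tensor,
// assuming column-major layout: insert a size-1 dimension at chip_dim and
// give it a stride consistent with the block's own (dense) strides.
BlockDesc ChipToInputBlock(const BlockDesc& output_block, int chip_dim) {
  const int input_rank = static_cast<int>(output_block.sizes.size()) + 1;
  BlockDesc input{std::vector<long>(input_rank), std::vector<long>(input_rank)};
  for (int i = 0; i < input_rank; ++i) {
    if (i < chip_dim) {
      input.sizes[i] = output_block.sizes[i];
      input.strides[i] = output_block.strides[i];
    } else if (i > chip_dim) {
      input.sizes[i] = output_block.sizes[i - 1];
      input.strides[i] = output_block.strides[i - 1];
    } else {
      input.sizes[i] = 1;
    }
  }
  // The inserted dimension has extent 1, so any consistent stride works;
  // derive it from the previous dimension as the patch does.
  input.strides[chip_dim] =
      chip_dim == 0 ? 1
                    : input.strides[chip_dim - 1] * input.sizes[chip_dim - 1];
  return input;
}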
+ this->m_impl.writeBlock(InputTensorBlock( + this->srcCoeff(output_block.first_coeff_index()), input_block_sizes, + input_block_strides, this->m_inputStrides, + const_cast(output_block.data()))); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72cb2d15f..4987b898b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -54,6 +54,66 @@ struct nested, 1, typename eval type; }; +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = + (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + } // end namespace internal template @@ -184,15 +244,17 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + BlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - #ifdef __SYCL_DEVICE_ONLY__ + using OutputTensorBlock = internal::TensorBlock; + +#ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) @@ -342,6 +404,9 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } + 
+ m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -484,6 +549,146 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + using ImagePatchCopyOp = internal::ImagePatchCopyOp; + using ImagePatchPaddingOp = internal::ImagePatchPaddingOp; + + // Calculate loop limits and various input/output dim sizes. + const DSizes& block_sizes = output_block->block_sizes(); + const bool col_major = + static_cast(Layout) == static_cast(ColMajor); + const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1]; + const Index output_depth_dim_size = + m_dimensions[col_major ? 0 : NumDims - 1]; + const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2]; + const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2]; + const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3]; + const Index block_col_stride = row_dim_size * depth_dim_size; + const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4]; + const Index outer_dim_size = + block_sizes.TotalSize() / + (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size); + + const Index patch_size = row_dim_size * col_dim_size * depth_dim_size; + const Index batch_size = patch_size * patch_index_dim_size; + + Index output_index = output_block->first_coeff_index(); + + // Loop through outer dimensions. + for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size; + ++outer_dim_index) { + const Index outer_output_base_index = outer_dim_index * batch_size; + // Find the offset of the element wrt the location of the first element. + const Index patchIndexStart = output_index / m_fastPatchStride; + const Index patchOffset = + (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth; + const Index colOffsetStart = patchOffset / m_fastColStride; + // Other ways to index this element. + const Index otherIndex = + (NumDims == 4) ? 0 : output_index / m_fastOtherStride; + const Index patch2DIndexStart = + (NumDims == 4) + ? 0 + : (output_index - otherIndex * m_otherStride) / m_fastPatchStride; + // Calculate starting depth index. + const Index depth = output_index - (output_index / m_fastOutputDepth) * + output_depth_dim_size; + const Index patch_input_base_index = + depth + otherIndex * m_patchInputStride; + + // Loop through patches. + for (Index patch_index_dim_index = 0; + patch_index_dim_index < patch_index_dim_size; + ++patch_index_dim_index) { + const Index patch_output_base_index = + outer_output_base_index + patch_index_dim_index * patch_size; + // Patch index corresponding to the passed in index. + const Index patchIndex = patchIndexStart + patch_index_dim_index; + const Index patch2DIndex = + (NumDims == 4) ? patchIndex + : patch2DIndexStart + patch_index_dim_index; + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index input_col_base = colIndex * m_col_strides; + const Index row_offset_base = + (patch2DIndex - colIndex * m_outputRows) * m_row_strides - + m_rowPaddingTop; + + // Loop through columns. 
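The copy helpers defined earlier in this file (ImagePatchCopyOp and ImagePatchPaddingOp) both follow the usual vectorized-main-loop-plus-scalar-tail pattern along the depth run. A standalone sketch of that pattern, with a fixed 4-wide chunk standing in for a SIMD packet and a single copy-or-pad function instead of two separate ops; names and the chunk width are illustrative.

#include <cstddef>

// Process a contiguous depth run a "packet" at a time, then finish the
// remainder one scalar at a time. In the real code the inner chunk is a
// pload/pstore of the packet type instead of this plain loop.
constexpr std::size_t kPacketSize = 4;

void CopyOrPadDepthRun(float* dst, const float* src, std::size_t n,
                       bool pad, float padding_value) {
  const std::size_t vectorized = (n / kPacketSize) * kPacketSize;
  for (std::size_t i = 0; i < vectorized; i += kPacketSize) {
    for (std::size_t j = 0; j < kPacketSize; ++j) {
      dst[i + j] = pad ? padding_value : src[i + j];
    }
  }
  for (std::size_t i = vectorized; i < n; ++i) {
    dst[i] = pad ? padding_value : src[i];
  }
}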
+ for (Index col_dim_index = 0; col_dim_index < col_dim_size; + ++col_dim_index) { + const Index col_output_base_index = + patch_output_base_index + col_dim_index * block_col_stride; + + // Calculate col index in the input original tensor. + Index colOffset = colOffsetStart + col_dim_index; + Index inputCol = + input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft; + Index origInputCol = + (m_col_inflate_strides == 1) + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + + bool pad_column = false; + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && + (inputCol != origInputCol * m_col_inflate_strides))) { + pad_column = true; + } + + const Index col_input_base_index = + patch_input_base_index + origInputCol * m_colInputStride; + const Index input_row_base = + row_offset_base + + ((patchOffset + col_dim_index * output_row_dim_size) - + colOffset * m_colStride) * + m_in_row_strides; + // Loop through rows. + for (Index row_dim_index = 0; row_dim_index < row_dim_size; + ++row_dim_index) { + const Index output_base_index = + col_output_base_index + row_dim_index * depth_dim_size; + bool pad_row = false; + Index inputIndex; + if (!pad_column) { + Index inputRow = + input_row_base + row_dim_index * m_in_row_strides; + Index origInputRow = + (m_row_inflate_strides == 1) + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) + : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && + (inputRow != origInputRow * m_row_inflate_strides))) { + pad_row = true; + } else { + inputIndex = + col_input_base_index + origInputRow * m_rowInputStride; + } + } + // Copy (or pad) along depth dimension. + if (pad_column || pad_row) { + ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue), + output_base_index, output_block->data()); + } else { + ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index, + output_block->data(), inputIndex); + } + } + } + } + output_index += m_otherStride; + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { @@ -538,6 +743,7 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; + Index m_block_total_size_max; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 498488649..3747bff9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -102,27 +102,64 @@ struct TensorEvaluator, Device> typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static const int NumOutputDims = internal::array_size::value; + static const int NumInputDims = internal::array_size::Dimensions>::value; + enum { - IsAligned = TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + // TODO(andydavis, wuke) Enable BlockAccess for the general case when the + // performance issue with block-based reshape is resolved. 
+ BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + using OutputTensorBlockReader = internal::TensorBlockReader; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size // of the input tensor. eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); - } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; + if (BlockAccess) { + const typename TensorEvaluator::Dimensions& input_dims = + m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -148,6 +185,140 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + if (m_impl.data() != NULL) { + OutputTensorBlockReader::Run(output_block, m_impl.data()); + return; + } + + // Calculate output block unit-stride inner dimension length. + const DSizes& output_block_sizes = + output_block->block_sizes(); + Index output_inner_dim_size = 1; + Index output_outer_dim_start = NumOutputDims; + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast(Layout) == static_cast(ColMajor) + ? i : NumOutputDims - i - 1; + output_inner_dim_size *= output_block_sizes[dim]; + if (output_block_sizes[dim] < m_dimensions[dim]) { + output_outer_dim_start = i + 1; + break; + } + } + + // Initialize output block iterator state. + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + array block_iter_state; + + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast(Layout) == static_cast(ColMajor) + ? 
i : NumOutputDims - i - 1; + block_iter_state[i].size = output_block_sizes[dim]; + block_iter_state[i].stride = m_outputStrides[dim]; + block_iter_state[i].span = + block_iter_state[i].stride * (block_iter_state[i].size - 1); + block_iter_state[i].count = 0; + } + + const Index output_outer_dim_size = output_block_sizes.TotalSize() / + output_inner_dim_size; + const typename TensorEvaluator::Dimensions& input_dims = + m_impl.dimensions(); + + Index index = output_block->first_coeff_index(); + for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) { + Index inner_idx = 0; + while (inner_idx < output_inner_dim_size) { + // Calculate input coords based on 'index'. + array input_coords; + Index idx = index; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumInputDims - 1; i > 0; --i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[0] = idx; + } else { + for (int i = 0; i < NumInputDims - 1; ++i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[NumInputDims - 1] = idx; + } + + // Calculate target input block shape, using at most + // 'output_inner_dim_size' coefficients along the input block's inner + // dimensions. + DSizes input_block_sizes; + Index num_to_allocate = output_inner_dim_size - inner_idx; + for (Index i = 0; i < NumInputDims; ++i) { + const Index dim = + static_cast(Layout) == static_cast(ColMajor) + ? i : NumInputDims - i - 1; + input_block_sizes[dim] = numext::mini( + num_to_allocate, (static_cast(input_dims[dim]) - + input_coords[dim])); + if (input_coords[dim] == 0) { + num_to_allocate /= input_block_sizes[dim]; + } else { + num_to_allocate = 1; + } + } + + // Calculate input block strides. + DSizes input_block_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_block_strides[i] = input_block_strides[i - 1] * + input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_block_strides[i] = input_block_strides[i + 1] * + input_block_sizes[i + 1]; + } + } + + // Instantiate and read input block from input tensor. + InputTensorBlock input_block(index, input_block_sizes, + input_block_strides, m_inputStrides, + output_block->data() + outer_idx * + output_inner_dim_size + inner_idx); + + m_impl.block(&input_block); + + const Index input_block_total_size = input_block_sizes.TotalSize(); + index += input_block_total_size; + inner_idx += input_block_total_size; + } + eigen_assert(inner_idx == output_inner_dim_size); + index -= output_inner_dim_size; + // Update index. 
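The input_coords computation above is the standard flat-index-to-coordinates decomposition over column-major strides. A minimal version with std::vector in place of DSizes; both helper names are illustrative.

#include <cstddef>
#include <vector>

// Column-major strides: stride[0] = 1, stride[i] = stride[i-1] * dims[i-1].
std::vector<long> ColMajorStrides(const std::vector<long>& dims) {
  std::vector<long> strides(dims.size());
  long s = 1;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    strides[i] = s;
    s *= dims[i];
  }
  return strides;
}

// Peel off coordinates from the outermost dimension down, keeping the
// remainder as the coordinate of dimension 0.
std::vector<long> IndexToCoordsColMajor(long index,
                                        const std::vector<long>& strides) {
  const int rank = static_cast<int>(strides.size());
  std::vector<long> coords(rank, 0);
  for (int i = rank - 1; i > 0; --i) {
    coords[i] = index / strides[i];
    index -= coords[i] * strides[i];
  }
  if (rank > 0) coords[0] = index;
  return coords;
}

For dims {4, 3, 2} the strides are {1, 4, 12}, and index 17 decomposes to coords {1, 1, 1}.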
+ for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) { + if (++block_iter_state[i].count < block_iter_state[i].size) { + index += block_iter_state[i].stride; + break; + } + block_iter_state[i].count = 0; + index -= block_iter_state[i].span; + } + } + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return const_cast(m_impl.data()); } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } @@ -155,6 +326,8 @@ struct TensorEvaluator, Device> protected: TensorEvaluator m_impl; NewDimensions m_dimensions; + DSizes m_outputStrides; + DSizes m_inputStrides; }; @@ -322,17 +495,27 @@ struct TensorEvaluator, Devi typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, + IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { @@ -340,6 +523,16 @@ struct TensorEvaluator, Devi eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); } + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= + op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || + op.startIndices()[i] != 0) { + m_is_identity = false; + } + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); if (static_cast(Layout) == static_cast(ColMajor)) { @@ -367,13 +560,10 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -417,7 +607,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } template @@ -427,6 +621,10 @@ struct TensorEvaluator, Devi EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + if (m_is_identity) { + return m_impl.template packet(index); + } + Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize 
- 1}; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -469,9 +667,26 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + output_block->block_sizes(), + output_block->block_strides(), + Dimensions(m_inputStrides), + output_block->data()); + m_impl.block(&input_block); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { Scalar* result = m_impl.data(); @@ -544,7 +759,9 @@ struct TensorEvaluator, Devi TensorEvaluator m_impl; const Device& m_device; Dimensions m_dimensions; + bool m_is_identity; const StartIndices m_offsets; + Index m_block_total_size_max; }; @@ -557,33 +774,46 @@ struct TensorEvaluator, Device> typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; - enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return this->m_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; @@ -623,6 +853,14 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + this->m_impl.writeBlock(TensorBlock( + this->srcCoeff(block.first_coeff_index()), block.block_sizes(), + block.block_strides(), Dimensions(this->m_inputStrides), + const_cast(block.data()))); + } }; @@ -739,7 +977,13 @@ 
struct TensorEvaluator startIndicesClamped, stopIndicesClamped; + m_is_identity = true; for (size_t i = 0; i < internal::array_size::value; ++i) { + if (m_strides[i] != 1 || op.startIndices()[i] != 0 || + op.stopIndices()[i] != (m_impl.dimensions()[i] - 1)) { + m_is_identity = false; + } + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); @@ -822,11 +1066,15 @@ struct TensorEvaluator::PointerType data() const { @@ -873,6 +1121,7 @@ struct TensorEvaluator m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; + bool m_is_identity; TensorEvaluator m_impl; const Device& m_device; DSizes m_startIndices; // clamped startIndices @@ -916,7 +1165,11 @@ struct TensorEvaluatorm_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..05c0990dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -356,6 +356,70 @@ template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalize(accum_); } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. 
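BlockReducer above adapts a reducer functor (initialize/reduce/finalize) to contiguous runs of already-materialized block data. A stripped-down sketch of the same shape, with a hand-written SumOp standing in for Eigen's reducer functors and float hard-coded as the scalar type; all names here are illustrative.

#include <cstddef>

// Stand-in for a reducer functor: provides initialize/reduce/finalize.
struct SumOp {
  float initialize() const { return 0.f; }
  void reduce(float v, float* accum) const { *accum += v; }
  float finalize(float accum) const { return accum; }
};

// Accumulate over contiguous runs of block data, finalize at the end.
template <typename Op>
class SimpleBlockReducer {
 public:
  explicit SimpleBlockReducer(const Op& op)
      : op_(op), accum_(op.initialize()) {}
  void Reduce(std::size_t index, std::size_t n, const float* data) {
    for (std::size_t i = 0; i < n; ++i) op_.reduce(data[index + i], &accum_);
  }
  float Finalize() const { return op_.finalize(accum_); }

 private:
  Op op_;
  float accum_;
};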
+ return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + static const Index PacketSize = + internal::unpacket_traits::size; + + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const Index vectorized_size = + (num_values_to_reduce / PacketSize) * PacketSize; + for (Index i = 0; i < vectorized_size; i += PacketSize) { + op_.reducePacket( + internal::ploadt(&data[index + i]), + &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); } + PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + } // end namespace internal @@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase class MakePointer_, typename Device> struct TensorEvaluator, Device> { + typedef internal::reducer_traits ReducerTraits; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -410,14 +475,19 @@ struct TensorEvaluator, static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using OutputTensorBlock = internal::TensorBlock; + using InputTensorBlock = internal::TensorBlock; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); @@ -451,11 +521,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -483,6 +555,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -492,6 +565,16 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? 
m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -686,6 +769,265 @@ struct TensorEvaluator, } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'PacketSize' + // reductions. + DSizes target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumOutputDims - 1; + const Index first_preserved_dim_input_index = + m_output_to_input_dim_map[first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = + first_preserved_dim_input_index == + (static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumInputDims - 1) | + PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = + output_block->block_sizes()[first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = + target_input_block_sizes[first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. + DSizes tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. 
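The shard bookkeeping that follows splits the output block's inner preserved dimension into output_shard_size pieces so that each shard's set of reducers stays within the L1 budget. A small sketch of just that split; the sizes are made up for illustration and divup mirrors the helper used in the patch.

#include <algorithm>
#include <cstdio>

// Ceiling division, as used to compute num_output_shards above.
long divup(long x, long y) { return (x + y - 1) / y; }

int main() {
  const long inner_dim_size = 10;  // output_block_inner_dim_size
  const long shard_size = 4;       // output_shard_size
  const long num_shards = divup(inner_dim_size, shard_size);
  for (long s = 0; s < num_shards; ++s) {
    const long begin = s * shard_size;
    const long extent = std::min(shard_size, inner_dim_size - begin);
    std::printf("shard %ld: outputs [%ld, %ld)\n", s, begin, begin + extent);
  }
}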
+ const Index first_preserved_dim_offset_start = + tensor_slice_offsets[first_preserved_dim_input_index]; + + array block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + 1 + : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNoConst* input_block_data = static_cast( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = + (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess); + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (output_shard_size / PacketSize + output_shard_size % PacketSize + + PacketSize) + : output_shard_size; + typedef internal::BlockReducer BlockReducer; + BlockReducer* reducers = static_cast( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. + for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + using TensorSliceBlockMapper = + internal::TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList()); + + const Index num_outputs_to_update = + tensor_slice_extents[first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) + ? num_outputs_to_update / PacketSize + : 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved + ? preserved_dim_vector_reducer_count * PacketSize + : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (preserved_dim_vector_reducer_count + + num_outputs_to_update % PacketSize) + : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = + block_mapper.GetBlockForIndex(b, input_block_data); + // Read. 
+ m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = + input_outer_dim_index * num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * PacketSize, + PacketSize, input_block.data()); + } + const Index scalar_reducer_base = + input_outer_dim_base + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. + const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_MAX + typename internal::remove_const::type + values[PacketSize]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * PacketSize; + internal::pstore( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < PacketSize; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = + output_base_index + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[scalar_reducer_base + i - + preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. + m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } #if defined(EIGEN_USE_SYCL) @@ -722,6 +1064,12 @@ struct TensorEvaluator, template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -764,16 +1112,90 @@ struct TensorEvaluator, return startInput; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes& input_slice_sizes, + DSizes* target_input_block_sizes) const { + typedef typename internal::packet_traits::type Packet; + typedef internal::BlockReducer BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. + const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi( + static_cast(1), coeff_to_allocate / reducer_overhead); + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], alloc_size); + coeff_to_allocate = numext::maxi( + static_cast(1), + coeff_to_allocate / + ((*target_input_block_sizes)[dim] * reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + // Bitmap indicating if an input dimension is reduced or not. array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; + array, NumOutputDims> m_fastOutputStrides; // Subset of strides of the input tensor for the non-reduced dimensions. // Indexed by output dimensions. static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. 
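// GetInputCoordsForOutputIndex above is a standard stride-based
// delinearization: divide the flat index by each dimension's stride (from
// the outer-most dimension inward for column-major storage) and keep the
// remainder, then route each coordinate through the output-to-input
// dimension map. The sketch below shows only the delinearization core for a
// fixed 3-D column-major case; the dimensions and function names are
// illustrative.

#include <array>
#include <cassert>

inline std::array<long, 3> sketch_index_to_coords(
    long index, const std::array<long, 3>& dims) {
  // Column-major strides: stride[0] = 1, stride[i] = stride[i-1] * dims[i-1].
  const std::array<long, 3> strides = {1, dims[0], dims[0] * dims[1]};
  std::array<long, 3> coords = {0, 0, 0};
  for (int i = 2; i > 0; --i) {
    coords[i] = index / strides[i];
    index -= coords[i] * strides[i];
  }
  coords[0] = index;
  return coords;
}

inline void sketch_index_to_coords_check() {
  // For dims {2, 3, 4}: flat index 17 == 1 + 2 * 2 + 2 * 6 -> coords {1, 2, 2}.
  const std::array<long, 3> c = sketch_index_to_coords(17, {2, 3, 4});
  assert(c[0] == 1 && c[1] == 2 && c[2] == 2);
}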
@@ -782,6 +1204,9 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; + // Block size for tiled (aka TensorBlock) evaluation. + Index m_block_total_size_max; + // Evaluator for the input expression. TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 6b54f40ad..3add9dac0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -100,6 +100,7 @@ class TensorShufflingOp : public TensorBase template struct TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Self; typedef TensorShufflingOp XprType; typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; @@ -110,44 +111,60 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + using TensorBlockReader = internal::TensorBlockReader; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; for (int i = 0; i < NumDims; ++i) { m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } } - array inputStrides; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputStrides[0] = 1; + m_unshuffledInputStrides[0] = 1; m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - inputStrides[NumDims - 1] = 1; + m_unshuffledInputStrides[NumDims - 1] = 1; m_outputStrides[NumDims - 1] = 1; for (int i = NumDims - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[shuffle[i]]; + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } + + m_block_total_size_max = + numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -162,29 +179,151 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } 
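// The shuffle evaluator's constructor records both the permutation (via
// m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]) and its inverse
// (m_inverseShuffle[shuffle[i]] = i), which the block path later needs to
// map dimensions in both directions. A tiny standalone sketch of that
// relationship; the 3-D permutation and the function name are arbitrary
// examples, not values used by Eigen.

#include <array>
#include <cassert>

inline void sketch_inverse_shuffle() {
  // Output dimension i reads input dimension shuffle[i].
  const std::array<int, 3> shuffle = {2, 0, 1};
  std::array<int, 3> inverse = {0, 0, 0};
  for (int i = 0; i < 3; ++i) inverse[shuffle[i]] = i;
  // `inverse` maps an input dimension back to the output dimension it feeds,
  // so composing the two permutations in either order yields the identity.
  for (int i = 0; i < 3; ++i) {
    assert(inverse[shuffle[i]] == i);
    assert(shuffle[inverse[i]] == i);
  }
}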
+ template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet(index); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + }; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader::PacketAccess>::Run(*this, index); + } - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kUniformAllDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (m_impl.data() != NULL) { + // Fast path: we have direct access to the data, so shuffle as we read. + TensorBlockReader::Run(output_block, + srcCoeff(output_block->first_coeff_index()), + m_inverseShuffle, + m_unshuffledInputStrides, + m_impl.data()); + return; + } + + // Slow path: read unshuffled block from the input and shuffle in-place. + // Initialize input block sizes using input-to-output shuffle map. + DSizes input_block_sizes; + for (Index i = 0; i < NumDims; ++i) { + input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]]; + } + + // Calculate input block strides. + DSizes input_block_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + input_block_strides[i] = + input_block_strides[i - 1] * input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + input_block_strides[i] = + input_block_strides[i + 1] * input_block_sizes[i + 1]; + } + } + + // Read input block. + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + Dimensions(m_unshuffledInputStrides), + output_block->data()); + + m_impl.block(&input_block); + + // Naive In-place shuffle: random IO but block size is O(L1 cache size). + // TODO(andydavis) Improve the performance of this in-place shuffle. 
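// The "naive in-place shuffle" implemented below is the classic
// cycle-following permutation: walk each permutation cycle once, carrying
// the evicted value forward and marking visited slots in a bitmap so that
// every cycle is applied exactly once and every element is moved exactly
// once. A self-contained sketch on a plain array; `perm` plays the role of
// GetBlockOutputIndex and is an arbitrary illustrative permutation.

#include <cassert>
#include <cstddef>
#include <vector>

inline void sketch_cycle_shuffle() {
  const std::vector<std::size_t> perm = {2, 0, 3, 1};  // element i moves to perm[i]
  std::vector<char> data = {'a', 'b', 'c', 'd'};
  std::vector<bool> visited(data.size(), false);
  for (std::size_t start = 0; start < data.size(); ++start) {
    if (visited[start]) continue;
    std::size_t dst = perm[start];
    if (dst == start) { visited[start] = true; continue; }
    char carried = data[start];
    do {
      const char evicted = data[dst];
      data[dst] = carried;      // drop the carried value into its final slot
      carried = evicted;        // keep following the cycle with the evictee
      visited[dst] = true;
      dst = perm[dst];
    } while (dst != start);
    data[start] = carried;      // closing the cycle
    visited[start] = true;
  }
  // 'a' -> slot 2, 'b' -> slot 0, 'c' -> slot 3, 'd' -> slot 1.
  assert(data[0] == 'b' && data[1] == 'd' && data[2] == 'a' && data[3] == 'c');
}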
+ const Index total_size = input_block_sizes.TotalSize(); + std::vector bitmap(total_size, false); + ScalarNoConst* data = const_cast(output_block->data()); + const DSizes& output_block_strides = + output_block->block_strides(); + for (Index input_index = 0; input_index < total_size; ++input_index) { + if (bitmap[input_index]) { + // Coefficient at this index has already been shuffled. + continue; + } + + Index output_index = GetBlockOutputIndex(input_index, input_block_strides, + output_block_strides); + if (output_index == input_index) { + // Coefficient already in place. + bitmap[output_index] = true; + continue; + } + + // The following loop starts at 'input_index', and shuffles + // coefficients into their shuffled location at 'output_index'. + // It skips through the array shuffling coefficients by following + // the shuffle cycle starting and ending a 'start_index'. + ScalarNoConst evicted_value; + ScalarNoConst shuffled_value = data[input_index]; + do { + evicted_value = data[output_index]; + data[output_index] = shuffled_value; + shuffled_value = evicted_value; + bitmap[output_index] = true; + output_index = GetBlockOutputIndex(output_index, input_block_strides, + output_block_strides); + } while (output_index != input_index); + + data[output_index] = shuffled_value; + bitmap[output_index] = true; } - PacketReturnType rslt = internal::pload(values); - return rslt; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + const double compute_cost = m_is_identity ? TensorOpCost::AddCost() : + NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); } EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } @@ -195,27 +334,57 @@ struct TensorEvaluator, Device> EIGEN_STRONG_INLINE const TensorEvaluator& impl() const {return m_impl;} protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( + Index input_index, + const DSizes& input_block_strides, + const DSizes& output_block_strides) const { + Index output_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[0]; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; + 
const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[NumDims - 1]; } } + Dimensions m_dimensions; + bool m_is_identity; + array m_inverseShuffle; array m_outputStrides; + array, NumDims> m_fastOutputStrides; array m_inputStrides; + array m_unshuffledInputStrides; + Index m_block_total_size_max; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; @@ -239,12 +408,18 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + using TensorBlockWriter = internal::TensorBlockWriter; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -265,6 +440,14 @@ struct TensorEvaluator, Device> this->coeffRef(index+i) = values[i]; } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()), + this->m_inverseShuffle, + this->m_unshuffledInputStrides, this->m_impl.data()); + } }; diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 274f901ce..448f47f1d 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -18,22 +18,30 @@ using Eigen::RowMajor; using Eigen::ColMajor; // A set of tests to verify that different TensorExecutor strategies yields the -// same results for all the ops, supporting tiled execution. +// same results for all the ops, supporting tiled evaluation. + +template +static array RandomDims(int min_dim = 1, int max_dim = 20) { + array dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random(min_dim, max_dim); + } + return dims; +}; + +template +static void test_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; -template -static void test_execute_binary_expr(Device d) { // Pick a large enough tensor size to bypass small tensor block evaluation // optimization. 
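// Every test in this file follows the same shape: evaluate the expression
// once with the default, already-trusted evaluation path to produce a
// "golden" tensor, then evaluate it again under the executor configuration
// being tested and compare coefficient by coefficient. The sketch below
// shows that pattern using only the public device() API; the real tests
// instead instantiate internal::TensorExecutor directly so that the
// vectorized/tiled code paths can be pinned explicitly.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

template <typename Device>
void sketch_golden_comparison(Device d) {
  Eigen::Tensor<float, 3> lhs(8, 9, 10), rhs(8, 9, 10);
  lhs.setRandom();
  rhs.setRandom();

  Eigen::Tensor<float, 3> golden = lhs + rhs;  // trusted default evaluation

  Eigen::Tensor<float, 3> dst(8, 9, 10);
  dst.device(d) = lhs + rhs;                   // evaluation path under test

  for (Eigen::Index i = 0; i < dst.size(); ++i) {
    assert(dst.coeff(i) == golden.coeff(i));
  }
}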
- int d0 = internal::random(100, 200); - int d1 = internal::random(100, 200); - int d2 = internal::random(100, 200); + auto dims = RandomDims(50 / NumDims, 100 / NumDims); - static constexpr int Options = 0; - using IndexType = int; - - Tensor lhs(d0, d1, d2); - Tensor rhs(d0, d1, d2); - Tensor dst(d0, d1, d2); + Tensor lhs(dims); + Tensor rhs(dims); + Tensor dst(dims); lhs.setRandom(); rhs.setRandom(); @@ -46,33 +54,389 @@ static void test_execute_binary_expr(Device d) { Executor::run(Assign(dst, expr), d); - for (int i = 0; i < d0; ++i) { - for (int j = 0; j < d1; ++j) { - for (int k = 0; k < d2; ++k) { - float sum = lhs(i, j, k) + rhs(i, j, k); - VERIFY_IS_EQUAL(sum, dst(i, j, k)); - } - } + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); } } -#define CALL_SUBTEST_COMBINATIONS(NAME) \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(default_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))); \ - CALL_SUBTEST((NAME(tp_device))) +template +static void test_execute_broadcasting(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(1, 10); + Tensor src(dims); + src.setRandom(); + + const auto broadcasts = RandomDims(1, 7); + const auto expr = src.broadcast(broadcasts); + + // We assume that broadcasting on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. + Tensor dst(golden.dimensions()); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +}; + +template +static void test_execute_chipping_rvalue(Device d) { + auto dims = RandomDims(1, 10); + Tensor src(dims); + src.setRandom(); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + const auto offset = internal::random(0, dims[(CHIP_DIM)] - 1); \ + const auto expr = src.template chip<(CHIP_DIM)>(offset); \ + \ + Tensor golden; \ + golden = expr; \ + \ + Tensor dst(golden.dimensions()); \ + \ + using Assign = TensorAssignOp; \ + using Executor = internal::TensorExecutor; \ + \ + Executor::run(Assign(dst, expr), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +}; + +template +static void test_execute_chipping_lvalue(Device d) { + auto dims = RandomDims(1, 10); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + /* Generate random data that we'll assign to the chipped tensor dim. */ \ + array src_dims; \ + for (int i = 0; i < NumDims - 1; ++i) { \ + int dim = i < (CHIP_DIM) ? 
i : i + 1; \ + src_dims[i] = dims[dim]; \ + } \ + \ + Tensor src(src_dims); \ + src.setRandom(); \ + \ + const auto offset = internal::random(0, dims[(CHIP_DIM)] - 1); \ + \ + /* Generate random data to fill non-chipped dimensions*/ \ + Tensor random(dims); \ + random.setRandom(); \ + \ + Tensor golden(dims); \ + golden = random; \ + golden.template chip<(CHIP_DIM)>(offset) = src; \ + \ + Tensor dst(dims); \ + dst = random; \ + auto expr = dst.template chip<(CHIP_DIM)>(offset); \ + \ + using Assign = TensorAssignOp; \ + using Executor = internal::TensorExecutor; \ + \ + Executor::run(Assign(expr, src), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +}; + +template +static void test_execute_shuffle_rvalue(Device d) { + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(1, 10); + Tensor src(dims); + src.setRandom(); + + // Create a random dimension re-ordering/shuffle. + std::vector shuffle; + for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + + const auto expr = src.shuffle(shuffle); + + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden; + golden = expr; + + // Now do the shuffling using configured tensor executor. + Tensor dst(golden.dimensions()); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_shuffle_lvalue(Device d) { + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Create a random dimension re-ordering/shuffle. + std::vector shuffle; + for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + + array shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; + + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden(shuffled_dims); + golden.shuffle(shuffle) = src; + + // Now do the shuffling using configured tensor executor. + Tensor dst(shuffled_dims); + + auto expr = dst.shuffle(shuffle); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(expr, src), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_reduction(Device d) +{ + static_assert(NumDims >= 2); + + static constexpr int ReducedDims = NumDims - 2; + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Pick two random and unique reduction dimensions. 
+ int reduction0 = internal::random(0, NumDims - 1); + int reduction1 = internal::random(0, NumDims - 1); + while (reduction0 == reduction1) { + reduction1 = internal::random(0, NumDims - 1); + } + + DSizes reduction_axis; + reduction_axis[0] = reduction0; + reduction_axis[1] = reduction1; + + Tensor golden = src.sum(reduction_axis); + + // Now do the reduction using configured tensor executor. + Tensor dst(golden.dimensions()); + + auto expr = src.sum(reduction_axis); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_reshape(Device d) +{ + static_assert(NumDims >= 2); + + static constexpr int ReshapedDims = NumDims - 1; + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Multiple 0th dimension and then shuffle. + std::vector shuffle; + for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + + DSizes reshaped_dims; + reshaped_dims[shuffle[0]] = dims[0] * dims[1]; + for (int i = 2; i < NumDims; ++i) reshaped_dims[shuffle[i]] = dims[i]; + + Tensor golden = src.reshape(reshaped_dims); + + // Now reshape using configured tensor executor. + Tensor dst(golden.dimensions()); + + auto expr = src.reshape(reshaped_dims); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_slice_rvalue(Device d) +{ + static_assert(NumDims >= 2); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes(RandomDims()); + auto slice_size = DSizes(RandomDims()); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor golden = + src.slice(slice_start, slice_size); + + // Now reshape using configured tensor executor. + Tensor dst(golden.dimensions()); + + auto expr = src.slice(slice_start, slice_size); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template +static void test_execute_slice_lvalue(Device d) +{ + static_assert(NumDims >= 2); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims(5, 10); + Tensor src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes(RandomDims(1, 10)); + auto slice_size = DSizes(RandomDims(1, 10)); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor slice(slice_size); + slice.setRandom(); + + // Asign a slice using default executor. 
+ Tensor golden = src; + golden.slice(slice_start, slice_size) = slice; + + // And using configured execution strategy. + Tensor dst = src; + auto expr = dst.slice(slice_start, slice_size); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(expr, slice), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +#define CALL_SUBTEST_COMBINATIONS(NAME, T, NUM_DIMS) \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))) EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::DefaultDevice default_device; @@ -81,7 +445,49 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); - CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_broadcasting, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_chipping_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_shuffle_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_reduction, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_reshape, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 3); + 
CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_slice_lvalue, float, 5); } #undef CALL_SUBTEST_COMBINATIONS From 64abdf1d7eb17174f571751346dd0cbadcf3bc52 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 12:35:19 -0700 Subject: [PATCH 03/10] Fix typo + get rid of redundant member variables for block sizes --- .../CXX11/src/Tensor/TensorBroadcasting.h | 6 +++--- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 17 ++++++++--------- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 11 +++++------ .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 +++--------- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +++------- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 18 ++++++++++-------- 6 files changed, 32 insertions(+), 42 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6dbe5a22..cca14aafd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> // Block based access to the XprType (input) tensor. using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. using BroadcastDimensions = DSizes; using BroadcastTensorBlock = internal::TensorBlock; @@ -589,8 +589,8 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); - Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7579ab507..aca2ead12 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -202,9 +202,6 @@ struct TensorEvaluator, Device> m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } } @@ -290,9 +287,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -370,13 +369,14 @@ struct TensorEvaluator, Device> { Index inputIndex; if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer + // division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -392,7 +392,6 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; - Index m_block_total_size_max; DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 39759b6c3..a8247be90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) #endif - : m_impl(op.expression(), device) + : m_device(device), m_impl(op.expression(), device) #ifdef EIGEN_USE_SYCL , m_op(op) #endif @@ -404,9 +404,6 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -551,9 +548,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -743,8 +742,8 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; - Index m_block_total_size_max; + const Device& m_device; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL // Required for SYCL in order to construct the expression tree on the device diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2630311b8..6ddded0bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -560,9 +560,6 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -672,9 +669,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - 
m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -761,7 +760,6 @@ struct TensorEvaluator, Devi Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; - Index m_block_total_size_max; }; @@ -1047,9 +1045,6 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); } } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -1128,7 +1123,6 @@ struct TensorEvaluator m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - std::size_t m_block_total_size_max; //use by sycl const StartIndices m_exprStartIndices; //use by sycl diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c41783106..73675e7dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -572,9 +572,6 @@ struct TensorEvaluator, : (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -771,9 +768,11 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -1204,9 +1203,6 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; - // Block size for tiled (aka TensorBlock) evaluation. - Index m_block_total_size_max; - // Evaluator for the input expression. 
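// The getResourceRequirements() overrides rewritten in this patch all follow
// the same recipe: turn a cache size reported by the device into a
// coefficient budget, advertise it together with a preferred block shape,
// and forward the call to the wrapped evaluator. A stripped-down sketch of
// just the budget arithmetic; the 16 KiB figure and the helper name are
// illustrative, not values Eigen guarantees.

#include <algorithm>
#include <cstddef>

template <typename Scalar>
std::ptrdiff_t sketch_block_total_size_max(std::ptrdiff_t cache_size_bytes) {
  // Clamp to at least one coefficient, even if sizeof(Scalar) exceeds the
  // reported cache size.
  return std::max<std::ptrdiff_t>(
      1, cache_size_bytes / static_cast<std::ptrdiff_t>(sizeof(Scalar)));
}
// e.g. sketch_block_total_size_max<float>(16 * 1024) == 4096 coefficients.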
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 77f47bf64..f94c1380d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -124,8 +124,11 @@ struct TensorEvaluator, Device> using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_impl(op.expression(), device), + m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -162,9 +165,6 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumDims; ++i) { m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } - - m_block_total_size_max = - numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -226,9 +226,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, - m_block_total_size_max)); + internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -384,7 +385,8 @@ struct TensorEvaluator, Device> array, NumDims> m_fastOutputStrides; array m_inputStrides; array m_unshuffledInputStrides; - Index m_block_total_size_max; + + const Device& m_device; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; From 1b0373ae10687ecc51ad9a0bfd46aa4ee116ade1 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 15:55:46 -0700 Subject: [PATCH 04/10] Replace all using declarations with typedefs in Tensor ops --- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 20 +++++----- .../CXX11/src/Tensor/TensorBroadcasting.h | 17 +++++--- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 16 +++++--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 40 ++++++++++--------- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 7 ++-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 22 ++++++---- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 14 ++++--- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 16 +++++--- 8 files changed, 88 insertions(+), 64 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 33c4ef5b7..1db8d6124 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -152,11 +152,11 @@ struct TensorBlockCopyOp { const Scalar* src_base = &src_data[src_index]; Scalar* dst_base = &dst_data[dst_index]; - using Src = const Eigen::Array; - using Dst = Eigen::Array; + typedef const Eigen::Array Src; + typedef Eigen::Array Dst; - using SrcMap = Eigen::Map>; - using DstMap = Eigen::Map>; + typedef Eigen::Map> SrcMap; + typedef Eigen::Map> DstMap; const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride)); DstMap dst(dst_base, num_coeff_to_copy, 
InnerStride<>(dst_stride)); @@ -401,13 +401,13 @@ struct TensorBlockCwiseBinaryOp { const StorageIndex left_stride, const LeftScalar* left_data, const StorageIndex right_index, const StorageIndex right_stride, const RightScalar* right_data) { - using Lhs = const Eigen::Array; - using Rhs = const Eigen::Array; - using Out = Eigen::Array; + typedef const Eigen::Array Lhs; + typedef const Eigen::Array Rhs; + typedef Eigen::Array Out; - using LhsMap = Eigen::Map>; - using RhsMap = Eigen::Map>; - using OutMap = Eigen::Map>; + typedef Eigen::Map> LhsMap; + typedef Eigen::Map> RhsMap; + typedef Eigen::Map> OutMap; const LeftScalar* lhs_base = &left_data[left_index]; const RightScalar* rhs_base = &right_data[right_index]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index cca14aafd..a4d750885 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -115,16 +115,21 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; // Block based access to the XprType (input) tensor. - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. - using BroadcastDimensions = DSizes; - using BroadcastTensorBlock = internal::TensorBlock; - using BroadcastTensorBlockReader = internal::TensorBlockReader; + typedef DSizes BroadcastDimensions; + typedef internal::TensorBlock + BroadcastTensorBlock; + typedef internal::TensorBlockReader + BroadcastTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index aca2ead12..b4c4162ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,10 +152,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) @@ -426,10 +428,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 17008917a..f11241f83 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -36,7 +36,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -60,7 +60,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -102,21 +102,22 @@ template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); Index total_size = array_prod(evaluator.dimensions()); @@ -221,7 +222,7 @@ struct EvalRange { template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { @@ -249,20 +250,21 @@ class TensorExecutor { template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = - TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); StorageIndex total_size = array_prod(evaluator.dimensions()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index a8247be90..2b2f4a650 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -252,7 +252,8 @@ struct TensorEvaluator, Device> RawAccess = false }; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; #ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) @@ -557,8 +558,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( OutputTensorBlock* output_block) const { - using ImagePatchCopyOp = internal::ImagePatchCopyOp; - using ImagePatchPaddingOp = internal::ImagePatchPaddingOp; + typedef internal::ImagePatchCopyOp ImagePatchCopyOp; + typedef internal::ImagePatchPaddingOp ImagePatchPaddingOp; // Calculate 
loop limits and various input/output dim sizes. const DSizes& block_sizes = output_block->block_sizes(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddded0bd..d5b0c1237 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -123,11 +123,15 @@ struct TensorEvaluator, Device> RawAccess = TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; - using OutputTensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlockReader + OutputTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) @@ -512,9 +516,10 @@ struct TensorEvaluator, Devi RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) @@ -787,9 +792,10 @@ struct TensorEvaluator, Device> RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 73675e7dd..80f179ba4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -483,10 +483,12 @@ struct TensorEvaluator, RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using OutputTensorBlock = internal::TensorBlock; - using InputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlock + InputTensorBlock; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; @@ -901,9 +903,9 @@ struct TensorEvaluator, new (&reducers[i]) BlockReducer(m_reducer); } - using TensorSliceBlockMapper = - internal::TensorSliceBlockMapper; + typedef internal::TensorSliceBlockMapper + TensorSliceBlockMapper; // TODO(andydavis) Consider removing 'input_block_stride_order' if we // find that scattered reads are not worth supporting in diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f94c1380d..fbe69aabc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -119,10 +119,12 @@ struct TensorEvaluator, Device> 
RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -417,10 +419,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockWriter = internal::TensorBlockWriter; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockWriter + TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) From cfaedb38cd662def3b5684a20965b3bc1b0d6a3f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 9 Aug 2018 09:44:07 -0700 Subject: [PATCH 05/10] Fix bug in a test + compilation errors --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 6 ++++-- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 8 ++++---- unsupported/test/cxx11_tensor_executor.cpp | 2 +- unsupported/test/cxx11_tensor_shuffling.cpp | 8 ++++---- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f11241f83..4bc18bd97 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -115,6 +115,7 @@ class TensorExecutor TensorBlock; + typedef typename TensorBlock::Dimensions TensorBlockDimensions; typedef TensorBlockMapper TensorBlockMapper; @@ -141,8 +142,9 @@ class TensorExecutor( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d5b0c1237..b25c1eabc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -520,6 +520,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlock TensorBlock; + typedef typename TensorBlock::Dimensions TensorBlockDimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) @@ -687,7 +688,7 @@ struct TensorEvaluator, Devi TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), output_block->block_sizes(), output_block->block_strides(), - Dimensions(m_inputStrides), + TensorBlockDimensions(m_inputStrides), output_block->data()); m_impl.block(&input_block); } @@ -796,6 +797,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlock TensorBlock; + typedef typename TensorBlock::Dimensions TensorBlockDimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) @@ -862,13 +864,11 @@ struct TensorEvaluator, Device> const TensorBlock& block) { this->m_impl.writeBlock(TensorBlock( this->srcCoeff(block.first_coeff_index()), block.block_sizes(), - block.block_strides(), Dimensions(this->m_inputStrides), + block.block_strides(), TensorBlockDimensions(this->m_inputStrides), const_cast(block.data()))); } }; - - namespace internal { template struct traits > : public traits diff 
--git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 448f47f1d..d16ae4d11 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -317,7 +317,7 @@ static void test_execute_reshape(Device d) DSizes reshaped_dims; reshaped_dims[shuffle[0]] = dims[0] * dims[1]; - for (int i = 2; i < NumDims; ++i) reshaped_dims[shuffle[i]] = dims[i]; + for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1]; Tensor golden = src.reshape(reshaped_dims); diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index ab19b6e6b..467df39c7 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -83,10 +83,10 @@ static void test_expr_shuffling() Tensor result(5,7,3,2); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{1,7,3,2}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim{{2,3,1,7}}; + array src_slice_start{{0,0,0,0}}; + array dst_slice_dim{{1,7,3,2}}; + array dst_slice_start{{0,0,0,0}}; for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = From f2209d06e428e0691de71f30fc2db4cb29191cd2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 10 Aug 2018 16:53:36 -0700 Subject: [PATCH 06/10] Add block evaluation to CwiseUnaryOp and add PreferBlockAccess enum to all evaluators --- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 + .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorAssign.h | 18 ++- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 141 ++++++++++++++++++ .../CXX11/src/Tensor/TensorBroadcasting.h | 11 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 13 +- .../CXX11/src/Tensor/TensorConcatenation.h | 2 + .../CXX11/src/Tensor/TensorContraction.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + .../CXX11/src/Tensor/TensorConvolution.h | 2 + .../CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 + .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 +++++++--- .../Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 1 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + .../src/Tensor/TensorForwardDeclarations.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 1 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 46 +++--- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + .../Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorRef.h | 3 + .../Eigen/CXX11/src/Tensor/TensorReverse.h | 2 + .../Eigen/CXX11/src/Tensor/TensorScan.h | 1 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 24 +-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 + .../Eigen/CXX11/src/Tensor/TensorTrace.h | 1 + .../CXX11/src/Tensor/TensorVolumePatch.h | 1 + .../test/cxx11_tensor_block_access.cpp | 111 ++++++++++++++ unsupported/test/cxx11_tensor_executor.cpp | 31 ++++ 35 files changed, 454 insertions(+), 82 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index c0f33ba2d..ea3ab329d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++
b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -87,6 +87,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -220,6 +221,7 @@ struct TensorEvaluator, Devi IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 442639868..5110e99ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Sy IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index f1f877c16..9ec1ec726 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -102,14 +102,16 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::TensorBlock< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 877603421..4a3e1ac17 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -381,6 +381,147 @@ class TensorBlockWriter : public TensorBlockIO + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex input_index, + const StorageIndex input_stride, const InputScalar* input_data) { + typedef const Eigen::Array Input; + typedef Eigen::Array Output; + + typedef Eigen::Map> InputMap; + typedef Eigen::Map> OutputMap; + + const InputScalar* input_base = &input_data[input_index]; + OutputScalar* output_base = &output_data[output_index]; + + const InputMap input(input_base, num_coeff, InnerStride<>(input_stride)); + OutputMap output(output_base, num_coeff, InnerStride<>(output_stride)); + + output = Eigen::CwiseUnaryOp(input, functor); + } +}; + +/** + * \class TensorBlockCwiseUnaryIO + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block IO class for carrying out cwise unary ops. 
+ * + * This class carries out the unary op on given blocks. + */ +template +struct TensorBlockCwiseUnaryIO { + typedef typename internal::TensorBlock::Dimensions Dimensions; + + struct BlockIteratorState { + StorageIndex output_stride, output_span; + StorageIndex input_stride, input_span; + StorageIndex size, count; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const Dimensions& block_sizes, + const Dimensions& block_strides, OutputScalar* output_data, + const array& input_strides, + const InputScalar* input_data) { + // Find the innermost dimension whose size is not 1. This is the effective + // inner dim. If all dimensions are of size 1, fallback to using the actual + // innermost dim to avoid out-of-bound access. + int num_size_one_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + if (block_sizes[dim] != 1) { + num_size_one_inner_dims = i; + break; + } + } + // Calculate strides and dimensions. + const int inner_dim = + NumDims == 0 ? 1 + : cond()(num_size_one_inner_dims, + NumDims - num_size_one_inner_dims - 1); + StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; + for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + // Merge multiple inner dims into one for larger inner dim size (i.e. + // fewer calls to TensorBlockCwiseUnaryOp::Run()). + if (inner_dim_size == block_strides[dim] && + block_strides[dim] == input_strides[dim]) { + inner_dim_size *= block_sizes[dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + StorageIndex output_index = 0, input_index = 0; + + const StorageIndex output_stride = + NumDims == 0 ? 1 : block_strides[inner_dim]; + const StorageIndex input_stride = + NumDims == 0 ? 1 : input_strides[inner_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array block_iter_state; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int num_squeezed_dims = 0; + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dim = cond()(i + 1, NumDims - i - 2); + const StorageIndex size = block_sizes[dim]; + if (size == 1) { + continue; + } + BlockIteratorState& state = block_iter_state[num_squeezed_dims]; + state.output_stride = block_strides[dim]; + state.input_stride = input_strides[dim]; + state.size = size; + state.output_span = state.output_stride * (size - 1); + state.input_span = state.input_stride * (size - 1); + state.count = 0; + ++num_squeezed_dims; + } + + // Compute cwise unary op. + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block_sizes.TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { + TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index, + output_stride, output_data, input_index, + input_stride, input_data); + // Update index. 
+ for (int j = 0; j < num_squeezed_dims; ++j) { + auto& state = block_iter_state[j]; + if (++state.count < state.size) { + output_index += state.output_stride; + input_index += state.input_stride; + break; + } + state.count = 0; + output_index -= state.output_span; + input_index -= state.input_span; + } + } + } +}; + /** * \class TensorBlockCwiseBinaryOp * \ingroup CXX11_Tensor_Module diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a4d750885..5e812b04d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,11 +108,12 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b4c4162ef..76fab39e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,12 +144,13 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 27c92d8f6..3863ee8c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -123,6 +123,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -308,6 +309,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e604456e8..c459fc649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -241,6 +241,7 @@ struct TensorContractionEvaluatorBase IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a7751eee1..1f613d3c7 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -196,6 +196,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a07e32db0..2d0e6599f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -308,6 +308,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -780,6 +781,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index d301d0c01..e79958fc9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -243,6 +243,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 47b5a5a5e..d71b2e34b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 256d499f2..554ee5f59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,6 +108,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 028902fea..e30f9ad8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -195,6 +196,7 @@ struct TensorEvaluator IsAligned = 
Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -288,6 +290,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -351,27 +354,34 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef internal::TensorBlock + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -399,6 +409,29 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_argImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView arg_block(m_device, m_argImpl, + *output_block); + internal::TensorBlockCwiseUnaryIO::Run(m_functor, + output_block->block_sizes(), + output_block + ->block_strides(), + output_block->data(), + arg_block.block_strides(), + arg_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor @@ -408,6 +441,7 @@ struct TensorEvaluator, Device> private: + const Device& m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -421,16 +455,18 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = 
TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -501,7 +537,7 @@ struct TensorEvaluatordata()[0] = coeff(0); + output_block->data()[0] = coeff(Index(0)); return; } internal::TensorBlockView left_block( @@ -543,6 +579,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -648,6 +685,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6ab4d997..480cf1f39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -136,6 +136,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 1342e47a6..71ba56773 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,6 +42,7 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 2778bf5ec..edf9f85d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -99,6 +99,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 0dd524a30..b8a57ab99 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -132,7 +132,12 @@ struct IsVectorizable { template struct IsTileable { - static const bool value = TensorEvaluator::BlockAccess; + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). 
+ static const bool value = + TensorEvaluator::BlockAccess & + TensorEvaluator::PreferBlockAccess; }; template , Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2b2f4a650..1826d7022 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -244,12 +244,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef internal::TensorBlock diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 64f2ad81f..e28565009 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,6 +91,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index e3165fa10..998757d14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -120,6 +120,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -183,6 +184,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index b25c1eabc..6f9294ccf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -111,16 +111,17 @@ struct TensorEvaluator, Device> static const int NumInputDims = internal::array_size::Dimensions>::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, // TODO(andydavis, wuke) Enable BlockAccess for the general case when the // performance issue with block-based reshape is resolved. 
- BlockAccess = TensorEvaluator::BlockAccess && - TensorEvaluator::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -349,6 +350,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -508,12 +510,13 @@ struct TensorEvaluator, Devi enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -785,12 +788,13 @@ struct TensorEvaluator, Device> typedef Sizes Dimensions; enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -972,6 +976,7 @@ struct TensorEvaluator::Layout, RawAccess = false }; @@ -1148,6 +1153,7 @@ struct TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index aa1db3c73..59c1704ed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a0a1ad8f4..4292fe0c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 80f179ba4..3d534eaa2 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -475,12 +475,13 @@ struct TensorEvaluator, static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index a6cade50f..6e15e75f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -137,6 +137,7 @@ template class TensorRef : public TensorBase, Device> IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -414,6 +416,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index b1135f297..641366d9d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> { IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 98f125408..a5b541a68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -111,12 +111,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -412,11 +413,12 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = 
TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 4b69072f2..221dc96c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -275,6 +276,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index ea53bb04b..9fc54a4c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -96,6 +96,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 3c7d8bbc0..c1b7a58ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -200,6 +200,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 6feeff231..746f62511 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -517,6 +517,114 @@ static void test_block_io_squeeze_ones() { } } +template +static void test_block_cwise_unary_io_basic() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes = RandomDims(); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. + T* input_data = GenerateRandomData(total_size); + + T* output_data = new T[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_squeeze_ones() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes(1, 2, 1, 3, 1); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. 
+ auto* input_data = GenerateRandomData(total_size); + + auto* output_data = new float[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_zero_strides() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes rnd_dims = RandomDims<5>(); + + DSizes input_sizes = rnd_dims; + input_sizes[0] = 1; + input_sizes[2] = 1; + input_sizes[4] = 1; + + DSizes input_strides(ComputeStrides(input_sizes)); + input_strides[0] = 0; + input_strides[2] = 0; + input_strides[4] = 0; + + // Generate random data. + auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + + DSizes output_sizes = rnd_dims; + DSizes output_strides(ComputeStrides(output_sizes)); + + const auto output_total_size = output_sizes.TotalSize(); + auto* output_data = new float[output_total_size]; + + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, + output_data, input_strides, input_data); + for (int i = 0; i < rnd_dims[0]; ++i) { + for (int j = 0; j < rnd_dims[1]; ++j) { + for (int k = 0; k < rnd_dims[2]; ++k) { + for (int l = 0; l < rnd_dims[3]; ++l) { + for (int m = 0; m < rnd_dims[4]; ++m) { + Index output_index = i * output_strides[0] + j * output_strides[1] + + k * output_strides[2] + l * output_strides[3] + + m * output_strides[4]; + Index input_index = i * input_strides[0] + j * input_strides[1] + + k * input_strides[2] + l * input_strides[3] + + m * input_strides[4]; + VERIFY_IS_EQUAL(output_data[output_index], + functor(input_data[input_index])); + } + } + } + } + } + + delete[] input_data; + delete[] output_data; +} + template static void test_block_cwise_binary_io_basic() { typedef internal::scalar_sum_op BinaryFunctor; @@ -982,6 +1090,9 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); TEST_LAYOUTS(test_block_io_zero_stride); TEST_LAYOUTS(test_block_io_squeeze_ones); + TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic); + TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones); + TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides); TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index d16ae4d11..1dc18220c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -29,6 +29,33 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { return dims; }; +template +static void test_execute_unary_expr(Device d) { + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
+ auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor src(dims); + Tensor dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + template static void test_execute_binary_expr(Device d) @@ -445,6 +472,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 5); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5); From 35d90e89600ff2524ec8bdd4ef4b95dd7c78b656 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 10 Aug 2018 17:37:58 -0700 Subject: [PATCH 07/10] Fix BlockAccess enum in CwiseUnaryOp evaluator --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index e30f9ad8e..4ca6b3d8c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -357,7 +357,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b8a57ab99..b7a0193fe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -135,9 +135,8 @@ struct IsTileable { // Check that block evaluation is supported and it's a preferred option (at // least one sub-expression has much faster block evaluation, e.g. // broadcasting). 
- static const bool value = - TensorEvaluator::BlockAccess & - TensorEvaluator::PreferBlockAccess; + static const bool value = TensorEvaluator::BlockAccess && + TensorEvaluator::PreferBlockAccess; }; template Date: Tue, 11 Sep 2018 13:32:32 -0700 Subject: [PATCH 08/10] Fix compilation of tiled evaluation code with c++03 --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 6 +++--- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +++++------ .../Eigen/CXX11/src/Tensor/TensorChipping.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 5 ++--- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 ++-- unsupported/test/cxx11_tensor_block_access.cpp | 14 +++++++------- unsupported/test/cxx11_tensor_shuffling.cpp | 10 +++++----- 9 files changed, 30 insertions(+), 35 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 6d90af2d3..13da36257 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -396,8 +396,8 @@ struct TensorBlockCwiseUnaryOp { typedef const Eigen::Array Input; typedef Eigen::Array Output; - typedef Eigen::Map> InputMap; - typedef Eigen::Map> OutputMap; + typedef Eigen::Map > InputMap; + typedef Eigen::Map > OutputMap; const InputScalar* input_base = &input_data[input_index]; OutputScalar* output_base = &output_data[output_index]; @@ -502,7 +502,7 @@ struct TensorBlockCwiseUnaryIO { input_stride, input_data); // Update index. for (int j = 0; j < num_squeezed_dims; ++j) { - auto& state = block_iter_state[j]; + BlockIteratorState& state = block_iter_state[j]; if (++state.count < state.size) { output_index += state.output_stride; input_index += state.input_stride; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 02d061a9c..e5cf93ab0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -596,12 +596,11 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -617,8 +616,8 @@ struct TensorEvaluator, Device> // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size // for the first outer dim, and 1 for other outer dims. This is guaranteed // by MergeResourceRequirements() in TensorBlock.h. - const auto& output_block_sizes = output_block->block_sizes(); - const auto& output_block_strides = output_block->block_strides(); + const Dimensions& output_block_sizes = output_block->block_sizes(); + const Dimensions& output_block_strides = output_block->block_strides(); // Find where outer dims start. 
int outer_dim_start = 0; @@ -642,7 +641,7 @@ struct TensorEvaluator, Device> return; } - const auto& input_dims = m_impl.dimensions(); + const Dimensions& input_dims = m_impl.dimensions(); // Pre-fill input_block_sizes, broadcast_block_sizes, // broadcast_block_strides, and broadcast_tensor_strides. Later on we will diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 76fab39e2..b47fa9e8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -290,11 +290,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 1826d7022..965bd8f1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -550,11 +550,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2f765acb7..16dc74afe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -677,11 +677,10 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 3d534eaa2..eeb2578fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -771,11 +771,10 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); 
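
Aside from the enum rename, the getResourceRequirements() hunks in this series all follow one pattern: derive a per-block coefficient budget from a device cache size, push a TensorOpResourceRequirements entry naming the preferred block shape, and forward the call to the nested evaluator so the executor can merge the requirements of the whole expression tree. Below is a minimal sketch of that pattern as a free function, assuming the internal names used in these hunks; the Evaluator and Device template parameters are placeholders, not concrete Eigen types.

#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

// Sketch only: mirrors the shape of the getResourceRequirements() bodies
// touched by this patch series.
template <typename Scalar, typename Evaluator, typename Device>
void get_block_requirements_sketch(
    const Device& device, const Evaluator& nested,
    std::vector<Eigen::internal::TensorOpResourceRequirements>* resources) {
  // Cap the number of coefficients per block by what fits in the last-level
  // cache (explicit Eigen::Index instead of 'auto', as the C++03 fix requires).
  const Eigen::Index block_total_size_max = Eigen::numext::maxi<Eigen::Index>(
      1, device.lastLevelCacheSize() / sizeof(Scalar));
  // Prefer blocks with a large inner dimension for contiguous memory access.
  resources->push_back(Eigen::internal::TensorOpResourceRequirements(
      Eigen::internal::kSkewedInnerDims, block_total_size_max));
  // Forward to the nested expression so the executor can merge requirements.
  nested.getResourceRequirements(resources);
}

In the hunks above, chipping, image patch, slicing and reduction target the last-level cache with kSkewedInnerDims blocks, while broadcasting and shuffling target the first-level cache, and shuffling alone asks for kUniformAllDims blocks.
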
m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index a5b541a68..e018d0ab2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -229,10 +229,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); + internal::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index eec282ba7..ad12ae557 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -535,7 +535,7 @@ static void test_block_cwise_unary_io_basic() { DSizes block_sizes = RandomDims(); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. T* input_data = GenerateRandomData(total_size); @@ -562,12 +562,12 @@ static void test_block_cwise_unary_io_squeeze_ones() { DSizes block_sizes(1, 2, 1, 3, 1); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. - auto* input_data = GenerateRandomData(total_size); + float* input_data = GenerateRandomData(total_size); - auto* output_data = new float[total_size]; + float* output_data = new float[total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, strides, input_data); @@ -599,13 +599,13 @@ static void test_block_cwise_unary_io_zero_strides() { input_strides[4] = 0; // Generate random data. 
- auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + float* input_data = GenerateRandomData(input_sizes.TotalSize()); DSizes output_sizes = rnd_dims; DSizes output_strides(ComputeStrides(output_sizes)); - const auto output_total_size = output_sizes.TotalSize(); - auto* output_data = new float[output_total_size]; + const Index output_total_size = output_sizes.TotalSize(); + float* output_data = new float[output_total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 467df39c7..062dd1c0f 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -81,12 +81,12 @@ static void test_expr_shuffling() Tensor expected; expected = tensor.shuffle(shuffles); - Tensor result(5,7,3,2); + Tensor result(5, 7, 3, 2); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{1,7,3,2}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim({2, 3, 1, 7}); + array src_slice_start({0, 0, 0, 0}); + array dst_slice_dim({1, 7, 3, 2}); + array dst_slice_start({0, 0, 0, 0}); for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = From d138fe341dab4b7c1a7b7d012a3e13e1796aad8f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 11 Sep 2018 17:23:18 -0700 Subject: [PATCH 09/10] Fix static_assert in test to conform to the C++11 standard --- unsupported/test/cxx11_tensor_executor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 1dc18220c..8639e7b38 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -286,7 +286,7 @@ template static void test_execute_reduction(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int ReducedDims = NumDims - 2; static constexpr int Options = 0 | Layout; @@ -328,7 +328,7 @@ template static void test_execute_reshape(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int ReshapedDims = NumDims - 1; static constexpr int Options = 0 | Layout; @@ -368,7 +368,7 @@ template static void test_execute_slice_rvalue(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); @@ -408,7 +408,7 @@ template static void test_execute_slice_lvalue(Device d) { - static_assert(NumDims >= 2); + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); From 01197e44527941c95f9a63e4f60ab3a989f12cbe Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 13 Sep 2018 15:03:36 -0700 Subject: [PATCH 10/10] Fix warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/test/cxx11_tensor_executor.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eeb2578fd..261dbffa8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -1140,7 +1140,6 @@ struct
TensorEvaluator, const Index max_coeff_count, const DSizes& input_slice_sizes, DSizes* target_input_block_sizes) const { - typedef typename internal::packet_traits::type Packet; typedef internal::BlockReducer BlockReducer; // TODO(andydavis) Compute reducer overhead correctly for the case where // we are preserving the inner most dimension, and a single reducer @@ -1168,7 +1167,6 @@ struct TensorEvaluator, } else if (!first_preserved_dim_allocated) { // TODO(andydavis) Include output block size in this L1 working set // calculation. - const Index allocated = max_coeff_count - coeff_to_allocate; const Index alloc_size = numext::maxi( static_cast(1), coeff_to_allocate / reducer_overhead); (*target_input_block_sizes)[dim] = diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 8639e7b38..1bb99854c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -27,7 +27,7 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { dims[i] = internal::random(min_dim, max_dim); } return dims; -}; +} template @@ -117,7 +117,7 @@ static void test_execute_broadcasting(Device d) for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } -}; +} template @@ -155,7 +155,7 @@ static void test_execute_chipping_rvalue(Device d) { TEST_CHIPPING(5) #undef TEST_CHIPPING -}; +} template @@ -207,7 +207,7 @@ static void test_execute_chipping_lvalue(Device d) { TEST_CHIPPING(5) #undef TEST_CHIPPING -}; +} template
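
Taken together, the series wires block evaluation from the individual evaluators up to the executor: IsTileable, as fixed in patch 07, only enables the tiled path when the expression both supports block access and prefers it. The test_execute_unary_expr test added in patch 06 drives that path directly; the following is a self-contained sketch of the same usage against the public Tensor API, assuming a build of the unsupported Tensor module that contains these patches (the tensor sizes are arbitrary).

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  using namespace Eigen;

  // Large enough that the executor does not take the small-tensor shortcut.
  Tensor<float, 3> src(17, 19, 23);
  Tensor<float, 3> dst(17, 19, 23);
  src.setRandom();

  // Build the expression lazily, then run the assignment through the
  // block-based (Tileable) TensorExecutor specialization, as the test does.
  const auto expr = src.square();
  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using Executor = internal::TensorExecutor<const Assign, DefaultDevice,
                                            /*Vectorizable=*/true,
                                            /*Tileable=*/true>;
  DefaultDevice device;
  Executor::run(Assign(dst, expr), device);

  // The block path must produce exactly the coefficient-wise result.
  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
    assert(dst.coeff(i) == src.coeff(i) * src.coeff(i));
  }
  return 0;
}

With Tileable set to false, the same assignment goes through the coefficient and packet based loop of the default TensorExecutor specialization instead of the block mapper.
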