Remove legacy block evaluation support

parent 71aa53dd6d
commit 13c3327f5c
@@ -88,7 +88,6 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -230,7 +229,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
    Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,

@@ -108,7 +108,6 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
  enum {
    IsAligned = false,
    PacketAccess = false,
    BlockAccess = false,
    BlockAccessV2 = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,

@@ -108,8 +108,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
                TensorEvaluator<RightArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
                   TensorEvaluator<RightArgType, Device>::PacketAccess,
    BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
                  TensorEvaluator<RightArgType, Device>::BlockAccess,
    BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
                    TensorEvaluator<RightArgType, Device>::BlockAccessV2,
    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
@@ -216,19 +214,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
    m_rightImpl.getResourceRequirements(resources);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
        m_leftImpl.data() != NULL) {
      TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
                             block->tensor_strides(), block->tensor_strides(),
                             m_leftImpl.data() + block->first_coeff_index());
      m_rightImpl.block(&left_block);
    } else {
      m_rightImpl.block(block);
      m_leftImpl.writeBlock(*block);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
@@ -142,782 +142,6 @@ class TensorBlock {
  Scalar* m_data;  // Not owned.
};

template <typename Scalar, typename StorageIndex>
struct TensorBlockCopyOp {

  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
      const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const StorageIndex src_index, const StorageIndex src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_index];
    Scalar* dst = &dst_data[dst_index];

    if (!Vectorizable) {
      for (Index i = 0; i < num_coeff_to_copy; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    if (src_stride == 1) {
      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
      if (dst_stride == 1) {
        // LINEAR
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = ploadu<Packet>(src + i);
          pstoreu<Scalar, Packet>(dst + i, p);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i] = src[i];
        }
      } else {
        // SCATTER
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = ploadu<Packet>(src + i);
          pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i * dst_stride] = src[i];
        }
      }
    } else if (src_stride == 0) {
      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
      if (dst_stride == 1) {
        // LINEAR
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = pload1<Packet>(src);
          pstoreu<Scalar, Packet>(dst + i, p);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i] = *src;
        }
      } else {
        // SCATTER
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = pload1<Packet>(src);
          pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i * dst_stride] = *src;
        }
      }
    } else {
      if (dst_stride == 1) {
        // GATHER
        const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
          Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
          pstoreu<Scalar, Packet>(dst + i, p);
        }
        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
          dst[i] = src[i * src_stride];
        }
      } else {
        // RANDOM
        for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) {
          dst[i * dst_stride] = src[i * src_stride];
        }
      }
    }
  }
};
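For orientation only (not part of the patch): the removed kernel above is essentially a strided copy that picks one of four regimes from the source and destination strides. A minimal scalar-only sketch of that dispatch, without the ploadu/pscatter/pgather packet fast paths, would look like this; the function name and types are hypothetical.

// Hypothetical scalar analogue of TensorBlockCopyOp::Run: same four stride
// regimes (LINEAR, SCATTER, GATHER, RANDOM) plus the src_stride == 0 broadcast
// case, but without the packet intrinsics used by the removed code.
template <typename Scalar, typename Index>
void strided_copy(Index n, Index dst_index, Index dst_stride, Scalar* dst_data,
                  Index src_index, Index src_stride, const Scalar* src_data) {
  const Scalar* src = &src_data[src_index];
  Scalar* dst = &dst_data[dst_index];
  if (src_stride == 1 && dst_stride == 1) {   // LINEAR: contiguous copy
    for (Index i = 0; i < n; ++i) dst[i] = src[i];
  } else if (src_stride == 1) {               // SCATTER: strided writes
    for (Index i = 0; i < n; ++i) dst[i * dst_stride] = src[i];
  } else if (src_stride == 0) {               // broadcast a single value
    for (Index i = 0; i < n; ++i) dst[i * dst_stride] = *src;
  } else if (dst_stride == 1) {               // GATHER: strided reads
    for (Index i = 0; i < n; ++i) dst[i] = src[i * src_stride];
  } else {                                    // RANDOM: both sides strided
    for (Index i = 0; i < n; ++i) dst[i * dst_stride] = src[i * src_stride];
  }
}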
/**
 * \class TensorBlockIO
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block IO class.
 *
 * This class is responsible for copying data between a tensor and a tensor
 * block.
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
          bool BlockRead>
class TensorBlockIO {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef TensorBlockCopyOp<Scalar, StorageIndex> BlockCopyOp;

 protected:
  typedef array<StorageIndex, NumDims> Dimensions;

  struct BlockIteratorState {
    StorageIndex input_stride;
    StorageIndex output_stride;
    StorageIndex input_span;
    StorageIndex output_span;
    StorageIndex size;
    StorageIndex count;
    BlockIteratorState()
        : input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0),
          size(0),
          count(0) {}
  };
  // Compute how many inner dimensions can be squeezed when doing IO
  // between a tensor and a block. It is safe to squeeze inner dimensions
  // only if they are not reordered.
  static int NumSqueezableInnerDims(const Dimensions& tensor_to_block_dim_map) {
    int num_squeezable_dims = 0;
    if (Layout == ColMajor) {
      for (int i = 0; i < NumDims; ++i) {
        if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
        else break;
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
        else break;
      }
    }
    return num_squeezable_dims;
  }
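A small self-contained illustration of the rule just stated (assumed numbers, not from the patch): with a column-major dimension map {0, 1, 3, 2}, the first two tensor dimensions map to themselves and the third does not, so only the two innermost dimensions may be merged.

// Hypothetical stand-alone version of the ColMajor scan above.
int squeezable_inner_dims_example() {
  const int map[4] = {0, 1, 3, 2};  // tensor-to-block dimension map
  int n = 0;
  for (int i = 0; i < 4; ++i) {
    if (map[i] == i) ++n;   // identity-mapped inner dim: safe to squeeze
    else break;             // first reordered dim stops the scan
  }
  return n;  // returns 2 for this map
}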
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
      const Block& block, StorageIndex first_coeff_index,
      const Dimensions& tensor_to_block_dim_map,
      const Dimensions& tensor_strides,
      const Scalar* src_data,
      Scalar* dst_data) {
    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(tensor_to_block_dim_map);

    // Find the innermost tensor dimension whose size is not 1. This is the
    // effective inner dim. If all dimensions are of size 1, then fallback to
    // using the actual innermost dim to avoid out-of-bound access.
    StorageIndex num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
        num_size_one_inner_dims = i;
        break;
      }
    }

    // Calculate strides and dimensions.
    const StorageIndex tensor_stride1_dim = cond<Layout>()(
        num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
    const StorageIndex block_dim_for_tensor_stride1_dim =
        NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
    StorageIndex block_inner_dim_size =
        NumDims == 0 ? 1
                     : block.block_sizes()[block_dim_for_tensor_stride1_dim];

    // Squeeze multiple inner dims into one for larger inner dim size.
    for (Index i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const Index dim = cond<Layout>()(i, NumDims - i - 1);
      const StorageIndex block_stride =
          block.block_strides()[tensor_to_block_dim_map[dim]];
      if (block_inner_dim_size == block_stride &&
          block_stride == tensor_strides[dim]) {
        block_inner_dim_size *=
            block.block_sizes()[tensor_to_block_dim_map[dim]];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    StorageIndex inputIndex;
    StorageIndex outputIndex;
    StorageIndex input_stride;
    StorageIndex output_stride;

    // Setup strides to read/write along the tensor's stride1 dimension.
    if (BlockRead) {
      inputIndex = first_coeff_index;
      outputIndex = 0;
      input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
      output_stride =
          NumDims == 0
              ? 1
              : block.block_strides()[block_dim_for_tensor_stride1_dim];
    } else {
      inputIndex = 0;
      outputIndex = first_coeff_index;
      input_stride =
          NumDims == 0
              ? 1
              : block.block_strides()[block_dim_for_tensor_stride1_dim];
      output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
    }

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> block_iter_state;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    Index num_squeezed_dims = 0;
    for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const Index dim = cond<Layout>()(i + 1, NumDims - i - 2);
      const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
      if (size == 1) {
        continue;
      }
      block_iter_state[num_squeezed_dims].size = size;
      if (BlockRead) {
        block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
        block_iter_state[num_squeezed_dims].output_stride =
            block.block_strides()[tensor_to_block_dim_map[dim]];
      } else {
        block_iter_state[num_squeezed_dims].input_stride =
            block.block_strides()[tensor_to_block_dim_map[dim]];
        block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
      }
      block_iter_state[num_squeezed_dims].input_span =
          block_iter_state[num_squeezed_dims].input_stride *
          (block_iter_state[num_squeezed_dims].size - 1);
      block_iter_state[num_squeezed_dims].output_span =
          block_iter_state[num_squeezed_dims].output_stride *
          (block_iter_state[num_squeezed_dims].size - 1);
      ++num_squeezed_dims;
    }

    // Iterate copying data from src to dst.
    const StorageIndex block_total_size =
        NumDims == 0 ? 1 : block.block_sizes().TotalSize();
    for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
      BlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
                       dst_data, inputIndex, input_stride, src_data);
      // Update index.
      for (int j = 0; j < num_squeezed_dims; ++j) {
        if (++block_iter_state[j].count < block_iter_state[j].size) {
          inputIndex += block_iter_state[j].input_stride;
          outputIndex += block_iter_state[j].output_stride;
          break;
        }
        block_iter_state[j].count = 0;
        inputIndex -= block_iter_state[j].input_span;
        outputIndex -= block_iter_state[j].output_span;
      }
    }
  }
};

/**
 * \class TensorBlockReader
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block reader class.
 *
 * This class is responsible for reading a tensor block.
 *
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
                                               Layout, /*BlockRead=*/true> {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true> Base;

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      Block* block, const Scalar* src_data) {
    array<StorageIndex, NumDims> tensor_to_block_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      tensor_to_block_dim_map[i] = i;
    }
    Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
               block->tensor_strides(), src_data, block->data());
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      Block* block, StorageIndex first_coeff_index,
      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
      const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
    Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
               tensor_strides, src_data, block->data());
  }
};

/**
 * \class TensorBlockWriter
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block writer class.
 *
 * This class is responsible for writing a tensor block.
 *
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
                                               Layout, /*BlockRead=*/false> {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false> Base;

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Block& block, Scalar* dst_data) {
    array<StorageIndex, NumDims> tensor_to_block_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      tensor_to_block_dim_map[i] = i;
    }
    Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
               block.tensor_strides(), block.data(), dst_data);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Block& block, StorageIndex first_coeff_index,
      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
      const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
    Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
               tensor_strides, block.data(), dst_data);
  }
};
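A hedged usage sketch (not from the patch) of how this reader/writer pair was typically driven, based only on the signatures and the TensorBlock constructor visible in this diff. These are internal classes that this commit deletes, so the sketch only documents the old API; `roundtrip`, `shape`, `src`, `dst`, and `block_storage` are illustrative names.

// Copy a block out of a source tensor buffer and write it back elsewhere,
// using the legacy (now removed) Eigen::internal block reader/writer.
using namespace Eigen;
typedef internal::TensorBlock<float, Eigen::Index, 3, ColMajor> Block;
typedef internal::TensorBlockReader<float, Eigen::Index, 3, ColMajor> Reader;
typedef internal::TensorBlockWriter<float, Eigen::Index, 3, ColMajor> Writer;

void roundtrip(const Block& shape, const float* src, float* dst,
               float* block_storage) {
  // Materialize a block with the same geometry as `shape`, backed by scratch
  // memory, then fill it from the source tensor...
  Block block(shape.first_coeff_index(), shape.block_sizes(),
              shape.block_strides(), shape.tensor_strides(), block_storage);
  Reader::Run(&block, src);
  // ...and write the block back into the destination tensor buffer.
  Writer::Run(block, dst);
}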
/**
 * \class TensorBlockCwiseUnaryOp
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Carries out a cwise unary op on a number of coefficients.
 *
 * This class reads strided input from the argument, and writes the
 * result of the cwise unary op to the strided output array.
 *
 */
template <bool Vectorizable>
struct TensorBlockCwiseUnaryOp {
  template <typename StorageIndex, typename UnaryFunctor,
            typename OutputScalar, typename InputScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const UnaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex input_index,
      const StorageIndex input_stride, const InputScalar* input_data) {
    typedef const Array<InputScalar, Dynamic, 1> Input;
    typedef Array<OutputScalar, Dynamic, 1> Output;

    typedef Map<Input, 0, InnerStride<> > InputMap;
    typedef Map<Output, 0, InnerStride<> > OutputMap;

    const InputScalar* input_base = &input_data[input_index];
    OutputScalar* output_base = &output_data[output_index];

    const InputMap input(input_base, num_coeff, InnerStride<>(input_stride));
    OutputMap output(output_base, num_coeff, InnerStride<>(output_stride));

    output = CwiseUnaryOp<UnaryFunctor, InputMap>(input, functor);
  }
};

template <>
struct TensorBlockCwiseUnaryOp<true> {
  template <typename StorageIndex, typename UnaryFunctor,
            typename OutputScalar, typename InputScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const UnaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex input_index,
      const StorageIndex input_stride, const InputScalar* input_data) {
    if (input_stride == 1 && output_stride == 1) {
      typedef const Array<InputScalar, Dynamic, 1> Input;
      typedef Array<OutputScalar, Dynamic, 1> Output;

      const Map<Input> input(&input_data[input_index], num_coeff);
      Map<Output> output(&output_data[output_index], num_coeff);

      output = CwiseUnaryOp<UnaryFunctor, Map<Input> >(input, functor);
    } else {
      TensorBlockCwiseUnaryOp<false>::Run(
          functor, num_coeff, output_index, output_stride, output_data,
          input_index, input_stride, input_data);
    }
  }
};

/**
 * \class TensorBlockCwiseUnaryIO
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block IO class for carrying out cwise unary ops.
 *
 * This class carries out the unary op on given blocks.
 */
template <typename UnaryFunctor, typename StorageIndex, typename OutputScalar,
          int NumDims, int Layout>
struct TensorBlockCwiseUnaryIO {
  typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims,
                               Layout>::Dimensions Dimensions;

  typedef TensorBlockCwiseUnaryOp<
      packet_traits<OutputScalar>::Vectorizable &&
      functor_traits<UnaryFunctor>::PacketAccess>
      TensorBlockCwiseUnaryOpImpl;

  struct BlockIteratorState {
    StorageIndex output_stride, output_span;
    StorageIndex input_stride, input_span;
    StorageIndex size, count;
  };

  template <typename InputScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const UnaryFunctor& functor, const Dimensions& block_sizes,
      const Dimensions& block_strides, OutputScalar* output_data,
      const array<StorageIndex, NumDims>& input_strides,
      const InputScalar* input_data) {
    // Find the innermost dimension whose size is not 1. This is the effective
    // inner dim. If all dimensions are of size 1, fallback to using the actual
    // innermost dim to avoid out-of-bound access.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      if (block_sizes[dim] != 1) {
        num_size_one_inner_dims = i;
        break;
      }
    }
    // Calculate strides and dimensions.
    const int inner_dim =
        NumDims == 0 ? 1
                     : cond<Layout>()(num_size_one_inner_dims,
                                      NumDims - num_size_one_inner_dims - 1);
    StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
    for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      // Merge multiple inner dims into one for larger inner dim size (i.e.
      // fewer calls to TensorBlockCwiseUnaryOp::Run()).
      if (inner_dim_size == block_strides[dim] &&
          block_strides[dim] == input_strides[dim]) {
        inner_dim_size *= block_sizes[dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    StorageIndex output_index = 0, input_index = 0;

    const StorageIndex output_stride =
        NumDims == 0 ? 1 : block_strides[inner_dim];
    const StorageIndex input_stride =
        NumDims == 0 ? 1 : input_strides[inner_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> block_iter_state;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int num_squeezed_dims = 0;
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
      const StorageIndex size = block_sizes[dim];
      if (size == 1) {
        continue;
      }
      BlockIteratorState& state = block_iter_state[num_squeezed_dims];
      state.output_stride = block_strides[dim];
      state.input_stride = input_strides[dim];
      state.size = size;
      state.output_span = state.output_stride * (size - 1);
      state.input_span = state.input_stride * (size - 1);
      state.count = 0;
      ++num_squeezed_dims;
    }

    // Compute cwise unary op.
    const StorageIndex block_total_size =
        NumDims == 0 ? 1 : block_sizes.TotalSize();
    for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
      TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index,
                                       output_stride, output_data, input_index,
                                       input_stride, input_data);
      // Update index.
      for (int j = 0; j < num_squeezed_dims; ++j) {
        BlockIteratorState& state = block_iter_state[j];
        if (++state.count < state.size) {
          output_index += state.output_stride;
          input_index += state.input_stride;
          break;
        }
        state.count = 0;
        output_index -= state.output_span;
        input_index -= state.input_span;
      }
    }
  }
};

/**
 * \class TensorBlockCwiseBinaryOp
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Carries out a cwise binary op on a number of coefficients.
 *
 * This class reads strided inputs from left and right operands, and writes the
 * result of the cwise binary op to the strided output array.
 *
 */
template <bool Vectorizable>
struct TensorBlockCwiseBinaryOp {
  template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
            typename LeftScalar, typename RightScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const BinaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex left_index,
      const StorageIndex left_stride, const LeftScalar* left_data,
      const StorageIndex right_index, const StorageIndex right_stride,
      const RightScalar* right_data) {
    typedef const Array<LeftScalar, Dynamic, 1> Lhs;
    typedef const Array<RightScalar, Dynamic, 1> Rhs;
    typedef Array<OutputScalar, Dynamic, 1> Out;

    typedef Map<Lhs, 0, InnerStride<> > LhsMap;
    typedef Map<Rhs, 0, InnerStride<> > RhsMap;
    typedef Map<Out, 0, InnerStride<> > OutMap;

    const LeftScalar* lhs_base = &left_data[left_index];
    const RightScalar* rhs_base = &right_data[right_index];
    OutputScalar* out_base = &output_data[output_index];

    const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
    const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
    OutMap out(out_base, num_coeff, InnerStride<>(output_stride));

    out = CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
  }
};

template <>
struct TensorBlockCwiseBinaryOp<true> {
  template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
            typename LeftScalar, typename RightScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const BinaryFunctor& functor, const StorageIndex num_coeff,
      const StorageIndex output_index, const StorageIndex output_stride,
      OutputScalar* output_data, const StorageIndex left_index,
      const StorageIndex left_stride, const LeftScalar* left_data,
      const StorageIndex right_index, const StorageIndex right_stride,
      const RightScalar* right_data) {
    if (left_stride == 1 && right_stride == 1 && output_stride == 1) {
      typedef const Array<LeftScalar, Dynamic, 1> Lhs;
      typedef const Array<RightScalar, Dynamic, 1> Rhs;
      typedef Array<OutputScalar, Dynamic, 1> Out;

      const LeftScalar* lhs_base = &left_data[left_index];
      const RightScalar* rhs_base = &right_data[right_index];
      OutputScalar* out_base = &output_data[output_index];

      const Map<Lhs> lhs(lhs_base, num_coeff);
      const Map<Rhs> rhs(rhs_base, num_coeff);
      Map<Out> out(out_base, num_coeff);

      out = CwiseBinaryOp<BinaryFunctor, Map<Lhs>, Map<Rhs> >(lhs, rhs, functor);
    } else {
      TensorBlockCwiseBinaryOp<false>::Run(
          functor, num_coeff, output_index, output_stride, output_data,
          left_index, left_stride, left_data, right_index, right_stride,
          right_data);
    }
  }
};
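The kernels above all rely on the same public-Eigen device: viewing strided raw memory through Map with an InnerStride so a cwise functor can be applied without an explicit loop. A standalone sketch of that idea, using only the public API (the function and its parameters are illustrative, not part of the patch):

// Apply a cwise binary expression to strided float data via Map + InnerStride.
#include <Eigen/Core>

void scaled_sum(const float* lhs, int lhs_stride,
                const float* rhs, int rhs_stride,
                float* out, int out_stride, int n) {
  using Eigen::Array;
  using Eigen::Dynamic;
  using Eigen::InnerStride;
  using Eigen::Map;
  typedef Map<const Array<float, Dynamic, 1>, 0, InnerStride<> > ConstMap;
  typedef Map<Array<float, Dynamic, 1>, 0, InnerStride<> > MutableMap;
  ConstMap a(lhs, n, InnerStride<>(lhs_stride));   // strided left operand
  ConstMap b(rhs, n, InnerStride<>(rhs_stride));   // strided right operand
  MutableMap o(out, n, InnerStride<>(out_stride)); // strided destination
  o = a + 2.0f * b;  // any cwise unary/binary expression works the same way
}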
/**
 * \class TensorBlockCwiseBinaryIO
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor block IO class for carrying out cwise binary ops.
 *
 * This class carries out the binary op on given blocks.
 *
 */
template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
          int NumDims, int Layout>
struct TensorBlockCwiseBinaryIO {
  typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;

  typedef TensorBlockCwiseBinaryOp<
      packet_traits<OutputScalar>::Vectorizable &&
      functor_traits<BinaryFunctor>::PacketAccess>
      TensorBlockCwiseBinaryOpImpl;

  struct BlockIteratorState {
    StorageIndex output_stride, output_span;
    StorageIndex left_stride, left_span;
    StorageIndex right_stride, right_span;
    StorageIndex size, count;
  };

  template <typename LeftScalar, typename RightScalar>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const BinaryFunctor& functor, const Dimensions& block_sizes,
      const Dimensions& block_strides, OutputScalar* output_data,
      const array<StorageIndex, NumDims>& left_strides,
      const LeftScalar* left_data,
      const array<StorageIndex, NumDims>& right_strides,
      const RightScalar* right_data) {
    // Find the innermost dimension whose size is not 1. This is the effective
    // inner dim. If all dimensions are of size 1, fallback to using the actual
    // innermost dim to avoid out-of-bound access.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      if (block_sizes[dim] != 1) {
        num_size_one_inner_dims = i;
        break;
      }
    }
    // Calculate strides and dimensions.
    const int inner_dim =
        NumDims == 0 ? 1
                     : cond<Layout>()(num_size_one_inner_dims,
                                      NumDims - num_size_one_inner_dims - 1);
    StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
    for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
      const int dim = cond<Layout>()(i, NumDims - i - 1);
      // Merge multiple inner dims into one for larger inner dim size (i.e.
      // fewer calls to TensorBlockCwiseBinaryOp::Run()).
      if (inner_dim_size == block_strides[dim] &&
          block_strides[dim] == left_strides[dim] &&
          block_strides[dim] == right_strides[dim]) {
        inner_dim_size *= block_sizes[dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    StorageIndex output_index = 0, left_index = 0, right_index = 0;
    const StorageIndex output_stride =
        NumDims == 0 ? 1 : block_strides[inner_dim];
    const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
    const StorageIndex right_stride =
        NumDims == 0 ? 1 : right_strides[inner_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> block_iter_state;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int num_squeezed_dims = 0;
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
      const StorageIndex size = block_sizes[dim];
      if (size == 1) {
        continue;
      }
      BlockIteratorState& state = block_iter_state[num_squeezed_dims];
      state.output_stride = block_strides[dim];
      state.left_stride = left_strides[dim];
      state.right_stride = right_strides[dim];
      state.size = size;
      state.output_span = state.output_stride * (size - 1);
      state.left_span = state.left_stride * (size - 1);
      state.right_span = state.right_stride * (size - 1);
      state.count = 0;
      ++num_squeezed_dims;
    }

    // Compute cwise binary op.
    const StorageIndex block_total_size =
        NumDims == 0 ? 1 : block_sizes.TotalSize();
    for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
      TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index,
                                        output_stride, output_data, left_index,
                                        left_stride, left_data, right_index,
                                        right_stride, right_data);
      // Update index.
      for (int j = 0; j < num_squeezed_dims; ++j) {
        BlockIteratorState& state = block_iter_state[j];
        if (++state.count < state.size) {
          output_index += state.output_stride;
          left_index += state.left_stride;
          right_index += state.right_stride;
          break;
        }
        state.count = 0;
        output_index -= state.output_span;
        left_index -= state.left_span;
        right_index -= state.right_span;
      }
    }
  }
};

/**
 * \class TensorBlockView
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Read-only view into a block of data.
 *
 * This class provides read-only access to a block of data in impl. It may need
 * to allocate space for holding the intermediate result.
 *
 */
template <class ArgType, class Device>
struct TensorBlockView {
  typedef TensorEvaluator<ArgType, Device> Impl;
  typedef typename Impl::Index StorageIndex;
  typedef typename remove_const<typename Impl::Scalar>::type Scalar;
  static const int NumDims = array_size<typename Impl::Dimensions>::value;
  typedef DSizes<StorageIndex, NumDims> Dimensions;

  // Constructs a TensorBlockView for `impl`. `block` is only used for
  // specifying the start offset, shape, and strides of the block.
  template <typename OtherTensorBlock>
  TensorBlockView(const Device& device,
                  const TensorEvaluator<ArgType, Device>& impl,
                  const OtherTensorBlock& block)
      : m_device(device),
        m_block_sizes(block.block_sizes()),
        m_data(NULL),
        m_allocated_data(NULL) {
    if (Impl::RawAccess && impl.data() != NULL) {
      m_data = impl.data() + block.first_coeff_index();
      m_block_strides = block.tensor_strides();
    } else {
      // Actually make a copy.

      // TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
      // Consider allowing ops to request additional temporary block memory in
      // TensorOpResourceRequirements.
      m_allocated_data = static_cast<Scalar*>(
          m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
      m_data = m_allocated_data;
      if (NumDims > 0) {
        if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
          m_block_strides[0] = 1;
          for (int i = 1; i < NumDims; ++i) {
            m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
          }
        } else {
          m_block_strides[NumDims - 1] = 1;
          for (int i = NumDims - 2; i >= 0; --i) {
            m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
          }
        }
      }
      TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
          block.first_coeff_index(), m_block_sizes, m_block_strides,
          block.tensor_strides(), m_allocated_data);
      impl.block(&input_block);
    }
  }

  ~TensorBlockView() {
    if (m_allocated_data != NULL) {
      m_device.deallocate(m_allocated_data);
    }
  }

  const Dimensions& block_sizes() const { return m_block_sizes; }
  const Dimensions& block_strides() const { return m_block_strides; }
  const Scalar* data() const { return m_data; }

 private:
  const Device EIGEN_DEVICE_REF m_device;
  Dimensions m_block_sizes, m_block_strides;
  const Scalar* m_data;      // Not owned.
  Scalar* m_allocated_data;  // Owned.
};

/**
 * \class TensorBlockMapper
 * \ingroup CXX11_Tensor_Module
@@ -1108,137 +332,6 @@ class TensorBlockMapper {
  StorageIndex m_total_block_count;
};

/**
 * \class TensorSliceBlockMapper
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor slice block mapper class.
 *
 * This class is responsible for iterating over the blocks of
 * a slice of a tensor. Supports shuffling of the block strides
 * for callers that want to reduce strides for dimensions to be
 * processed together.
 *
 */
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorSliceBlockMapper {
 public:
  typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
  typedef DSizes<StorageIndex, NumDims> Dimensions;

  TensorSliceBlockMapper(const Dimensions& tensor_dims,
                         const Dimensions& tensor_slice_offsets,
                         const Dimensions& tensor_slice_extents,
                         const Dimensions& block_dim_sizes,
                         const Dimensions& block_stride_order)
      : m_tensor_dimensions(tensor_dims),
        m_tensor_slice_offsets(tensor_slice_offsets),
        m_tensor_slice_extents(tensor_slice_extents),
        m_block_dim_sizes(block_dim_sizes),
        m_block_stride_order(block_stride_order),
        m_total_block_count(1) {
    // Calculate block counts by dimension and total block count.
    DSizes<StorageIndex, NumDims> block_count;
    for (Index i = 0; i < block_count.rank(); ++i) {
      block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
    }
    m_total_block_count = array_prod(block_count);

    // Calculate block strides (used for enumerating blocks).
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_block_strides[0] = 1;
      m_tensor_strides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
        m_tensor_strides[i] =
            m_tensor_strides[i - 1] * m_tensor_dimensions[i - 1];
      }
    } else {
      m_block_strides[NumDims - 1] = 1;
      m_tensor_strides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
        m_tensor_strides[i] =
            m_tensor_strides[i + 1] * m_tensor_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block
  GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
    StorageIndex first_coeff_index = 0;
    DSizes<StorageIndex, NumDims> coords;
    DSizes<StorageIndex, NumDims> sizes;
    DSizes<StorageIndex, NumDims> strides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = block_index / m_block_strides[i];
        coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
        sizes[i] = numext::mini(
            m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
            m_block_dim_sizes[i]);
        block_index -= idx * m_block_strides[i];
        first_coeff_index += coords[i] * m_tensor_strides[i];
      }
      coords[0] =
          m_tensor_slice_offsets[0] + block_index * m_block_dim_sizes[0];
      sizes[0] = numext::mini(
          m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0],
          m_block_dim_sizes[0]);
      first_coeff_index += coords[0] * m_tensor_strides[0];

      StorageIndex prev_dim = m_block_stride_order[0];
      strides[prev_dim] = 1;
      for (int i = 1; i < NumDims; ++i) {
        const StorageIndex curr_dim = m_block_stride_order[i];
        strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
        prev_dim = curr_dim;
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const StorageIndex idx = block_index / m_block_strides[i];
        coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
        sizes[i] = numext::mini(
            m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
            m_block_dim_sizes[i]);
        block_index -= idx * m_block_strides[i];
        first_coeff_index += coords[i] * m_tensor_strides[i];
      }
      coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] +
                            block_index * m_block_dim_sizes[NumDims - 1];
      sizes[NumDims - 1] = numext::mini(
          m_tensor_slice_offsets[NumDims - 1] +
              m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1],
          m_block_dim_sizes[NumDims - 1]);
      first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];

      StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
      strides[prev_dim] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        const StorageIndex curr_dim = m_block_stride_order[i];
        strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
        prev_dim = curr_dim;
      }
    }

    return Block(first_coeff_index, sizes, strides, m_tensor_strides, data);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
    return m_total_block_count;
  }

 private:
  Dimensions m_tensor_dimensions;
  Dimensions m_tensor_slice_offsets;
  Dimensions m_tensor_slice_extents;
  Dimensions m_tensor_strides;
  Dimensions m_block_dim_sizes;
  Dimensions m_block_stride_order;
  Dimensions m_block_strides;
  StorageIndex m_total_block_count;
};

}  // namespace internal

}  // namespace Eigen
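The slice mapper's constructor enumerates blocks with a divup-based count per dimension and the product over dimensions. A worked example with assumed numbers (not taken from the patch):

// A 70x50 slice tiled with 32x32 blocks: 3 blocks along dim 0, 2 along dim 1.
#include <cassert>

long divup(long x, long y) { return (x + y - 1) / y; }

void block_count_example() {
  const long extents[2] = {70, 50};     // tensor_slice_extents
  const long block_dims[2] = {32, 32};  // block_dim_sizes
  long total = 1;
  for (int i = 0; i < 2; ++i) total *= divup(extents[i], block_dims[i]);
  assert(total == 6);  // 3 * 2 blocks, matching m_total_block_count
}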
@@ -114,7 +114,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  enum {
    IsAligned = true,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
    PreferBlockAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -123,21 +122,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  // Block based access to the XprType (input) tensor.
  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
      TensorBlock;
  typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
      TensorBlockReader;

  // We do block based broadcasting using a trick with 2x tensor rank and 0
  // strides. See block method implementation for details.
  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;

  typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
      BroadcastTensorBlock;
  typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
      BroadcastTensorBlockReader;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -641,246 +629,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    m_impl.getResourceRequirements(resources);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
      TensorBlock* output_block) const {
    if (NumDims <= 0) {
      output_block->data()[0] = m_impl.coeff(0);
      return;
    }

    // Because we only support kSkewedInnerDims blocking, block size should be
    // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size
    // for the first outer dim, and 1 for other outer dims. This is guaranteed
    // by MergeResourceRequirements() in TensorBlock.h.
    const Dimensions& output_block_sizes = output_block->block_sizes();
    const Dimensions& output_block_strides = output_block->block_strides();

    // Find where outer dims start.
    int outer_dim_start = 0;
    Index outer_dim_size = 1, inner_dim_size = 1;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                          ? i
                          : NumDims - i - 1;
      if (i > outer_dim_start) {
        eigen_assert(output_block_sizes[dim] == 1);
      } else if (output_block_sizes[dim] != m_dimensions[dim]) {
        eigen_assert(output_block_sizes[dim] < m_dimensions[dim]);
        outer_dim_size = output_block_sizes[dim];
      } else {
        inner_dim_size *= output_block_sizes[dim];
        ++outer_dim_start;
      }
    }

    if (inner_dim_size == 0 || outer_dim_size == 0) {
      return;
    }

    const Dimensions& input_dims = Dimensions(m_impl.dimensions());

    // Pre-fill input_block_sizes, broadcast_block_sizes,
    // broadcast_block_strides, and broadcast_tensor_strides. Later on we will
    // only modify the outer_dim_start-th dimension on these arrays.

    // Calculate the input block size for looking into the input.
    Dimensions input_block_sizes;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < outer_dim_start; ++i) {
        input_block_sizes[i] = input_dims[i];
      }
      for (int i = outer_dim_start; i < NumDims; ++i) {
        input_block_sizes[i] = 1;
      }
    } else {
      for (int i = 0; i < outer_dim_start; ++i) {
        input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1];
      }
      for (int i = outer_dim_start; i < NumDims; ++i) {
        input_block_sizes[NumDims - i - 1] = 1;
      }
    }

    // Broadcast with the 0-stride trick: Create 1 extra dim for each
    // broadcast, set the input stride to 0.
    //
    // When ColMajor:
    // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...].
    //
    // - broadcast_block_strides is [output_block_strides[0],
    //                               output_block_strides[0] * d_0,
    //                               output_block_strides[1],
    //                               output_block_strides[1] * d_1,
    //                               ...].
    //
    // - broadcast_tensor_strides is [output_block_strides[0],
    //                                0,
    //                                output_block_strides[1],
    //                                0,
    //                                ...].
    BroadcastDimensions broadcast_block_sizes, broadcast_block_strides,
        broadcast_tensor_strides;

    for (int i = 0; i < outer_dim_start; ++i) {
      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                          ? i
                          : NumDims - i - 1;
      const int copy_dim =
          static_cast<int>(Layout) == static_cast<int>(ColMajor)
              ? 2 * i
              : 2 * NumDims - 2 * i - 1;
      const int broadcast_dim =
          static_cast<int>(Layout) == static_cast<int>(ColMajor) ? copy_dim + 1
                                                                 : copy_dim - 1;
      broadcast_block_sizes[copy_dim] = input_dims[dim];
      broadcast_block_sizes[broadcast_dim] = m_broadcast[dim];
      broadcast_block_strides[copy_dim] = output_block_strides[dim];
      broadcast_block_strides[broadcast_dim] =
          output_block_strides[dim] * input_dims[dim];
      broadcast_tensor_strides[copy_dim] = m_inputStrides[dim];
      broadcast_tensor_strides[broadcast_dim] = 0;
    }
    for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                          ? i
                          : 2 * NumDims - i - 1;
      broadcast_block_sizes[dim] = 1;
      broadcast_block_strides[dim] = 0;
      broadcast_tensor_strides[dim] = 0;
    }

    const int outer_dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? outer_dim_start
                              : NumDims - outer_dim_start - 1;

    if (outer_dim_size == 1) {
      // We just need one block read using the ready-set values above.
      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                     broadcast_block_strides, broadcast_tensor_strides, 0,
                     output_block);
    } else if (input_dims[outer_dim] == 1) {
      // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
      const int broadcast_outer_dim =
          static_cast<int>(Layout) == static_cast<int>(ColMajor)
              ? 2 * outer_dim_start + 1
              : 2 * NumDims - 2 * outer_dim_start - 2;
      broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
      broadcast_tensor_strides[broadcast_outer_dim] = 0;
      broadcast_block_strides[broadcast_outer_dim] =
          output_block_strides[outer_dim];
      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                     broadcast_block_strides, broadcast_tensor_strides, 0,
                     output_block);
    } else {
      // The general case. Let's denote the output block as x[...,
      // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
      // the outer_dim_start-th dimension (< NumDims). We need to split the
      // a:a+outer_dim_size into possibly 3 sub-blocks:
      //
      // (1) a:b, where b is the smallest multiple of
      //     input_dims[outer_dim_start] in [a, a+outer_dim_size].
      //
      // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
      //     in [a, a+outer_dim_size].
      //
      // (3) c:a+outer_dim_size .
      //
      // Or, when b and c do not exist, we just need to process the whole block
      // together.

      // Find a.
      const Index outer_dim_left_index =
          output_block->first_coeff_index() / m_outputStrides[outer_dim];

      // Find b and c.
      const Index input_outer_dim_size = input_dims[outer_dim];

      // First multiple after a. This is b when <= outer_dim_left_index +
      // outer_dim_size.
      const Index first_multiple =
          divup<Index>(outer_dim_left_index, input_outer_dim_size) *
          input_outer_dim_size;

      if (first_multiple <= outer_dim_left_index + outer_dim_size) {
        // b exists, so does c. Find it.
        const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
                                    input_outer_dim_size * input_outer_dim_size;
        const int copy_outer_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 2 * outer_dim_start
                : 2 * NumDims - 2 * outer_dim_start - 1;
        const int broadcast_outer_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 2 * outer_dim_start + 1
                : 2 * NumDims - 2 * outer_dim_start - 2;
        if (first_multiple > outer_dim_left_index) {
          const Index head_size = first_multiple - outer_dim_left_index;
          input_block_sizes[outer_dim] = head_size;
          broadcast_block_sizes[copy_outer_dim] = head_size;
          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
          broadcast_block_strides[copy_outer_dim] =
              output_block_strides[outer_dim];
          broadcast_block_sizes[broadcast_outer_dim] = 1;
          broadcast_tensor_strides[broadcast_outer_dim] = 0;
          broadcast_block_strides[broadcast_outer_dim] =
              output_block_strides[outer_dim] * input_dims[outer_dim];
          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                         broadcast_block_strides, broadcast_tensor_strides, 0,
                         output_block);
        }
        if (first_multiple < last_multiple) {
          input_block_sizes[outer_dim] = input_outer_dim_size;
          broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
          broadcast_block_strides[copy_outer_dim] =
              output_block_strides[outer_dim];
          broadcast_block_sizes[broadcast_outer_dim] =
              (last_multiple - first_multiple) / input_outer_dim_size;
          broadcast_tensor_strides[broadcast_outer_dim] = 0;
          broadcast_block_strides[broadcast_outer_dim] =
              output_block_strides[outer_dim] * input_dims[outer_dim];
          const Index offset = (first_multiple - outer_dim_left_index) *
                               m_outputStrides[outer_dim];
          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                         broadcast_block_strides, broadcast_tensor_strides,
                         offset, output_block);
        }
        if (last_multiple < outer_dim_left_index + outer_dim_size) {
          const Index tail_size =
              outer_dim_left_index + outer_dim_size - last_multiple;
          input_block_sizes[outer_dim] = tail_size;
          broadcast_block_sizes[copy_outer_dim] = tail_size;
          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
          broadcast_block_strides[copy_outer_dim] =
              output_block_strides[outer_dim];
          broadcast_block_sizes[broadcast_outer_dim] = 1;
          broadcast_tensor_strides[broadcast_outer_dim] = 0;
          broadcast_block_strides[broadcast_outer_dim] =
              output_block_strides[outer_dim] * input_dims[outer_dim];
          const Index offset = (last_multiple - outer_dim_left_index) *
                               m_outputStrides[outer_dim];
          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                         broadcast_block_strides, broadcast_tensor_strides,
                         offset, output_block);
        }
      } else {
        // b and c do not exist.
        const int copy_outer_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 2 * outer_dim_start
                : 2 * NumDims - 2 * outer_dim_start - 1;
        input_block_sizes[outer_dim] = outer_dim_size;
        broadcast_block_sizes[copy_outer_dim] = outer_dim_size;
        broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
        broadcast_block_strides[copy_outer_dim] =
            output_block_strides[outer_dim];
        BroadcastBlock(input_block_sizes, broadcast_block_sizes,
                       broadcast_block_strides, broadcast_tensor_strides, 0,
                       output_block);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
          bool /*root_of_expr_ast*/ = false) const {
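A worked example with assumed sizes (for illustration only) of the 2x-rank / 0-stride layout described in the comment inside the removed block() method: ColMajor, input dims d = {2, 3}, broadcast factors b = {4, 1}, so the output is 8x3 with output block strides s = {1, 8}.

// Concrete instance of the [d_0, b_0, d_1, b_1] layout from the comment above.
void zero_stride_layout_example() {
  const int broadcast_block_sizes[4]    = {2, 4, 3, 1};   // [d_0, b_0, d_1, b_1]
  const int broadcast_block_strides[4]  = {1, 2, 8, 24};  // [s_0, s_0*d_0, s_1, s_1*d_1]
  const int broadcast_tensor_strides[4] = {1, 0, 2, 0};   // input strides; 0 on broadcast dims
  // Reading through these tensor strides revisits the same input column for
  // each of the b_0 = 4 broadcast copies: that is the 0-stride trick.
  (void)broadcast_block_sizes;
  (void)broadcast_block_strides;
  (void)broadcast_tensor_strides;
}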
@@ -1096,28 +844,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    return params;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
      const Dimensions& input_block_sizes,
      const BroadcastDimensions& broadcast_block_sizes,
      const BroadcastDimensions& broadcast_block_strides,
      const BroadcastDimensions& broadcast_tensor_strides, Index offset,
      TensorBlock* output_block) const {
    TensorBlock input_view_block(
        static_cast<int>(Layout) == static_cast<int>(ColMajor)
            ? indexColMajor(output_block->first_coeff_index() + offset)
            : indexRowMajor(output_block->first_coeff_index() + offset),
        input_block_sizes, Dimensions(m_inputStrides),
        Dimensions(m_inputStrides), NULL);

    internal::TensorBlockView<ArgType, Device> input_block(m_device, m_impl,
                                                           input_view_block);
    BroadcastTensorBlock broadcast_block(
        0, broadcast_block_sizes, broadcast_block_strides,
        broadcast_tensor_strides, output_block->data() + offset);

    BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
    DSizes<Index, NumDims> dimensions;
    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;

@@ -148,7 +148,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    IsAligned = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
    // Chipping of outer-most dimension is a trivial operation, because we can
    // read and write directly from the underlying tensor using single offset.
@@ -167,11 +166,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
      InputTensorBlock;
  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
      OutputTensorBlock;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -218,20 +212,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    }
    m_inputStride *= input_dims[m_dim.actualDim()];
    m_inputOffset = m_stride * op.offset();

    if (BlockAccess) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_inputStrides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
        }
      } else {
        m_inputStrides[NumInputDims - 1] = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
        }
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -323,52 +303,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    m_impl.getResourceRequirements(resources);
  }

  // TODO(andydavis) Reduce the overhead of this function (experiment with
  // using a fixed block size).
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
      OutputTensorBlock* output_block) const {
    // Calculate input block sizes.
    const DSizes<Index, NumDims>& output_block_sizes =
        output_block->block_sizes();
    const DSizes<Index, NumDims>& output_block_strides =
        output_block->block_strides();
    const Index chip_dim = m_dim.actualDim();
    DSizes<Index, NumInputDims> input_block_sizes;
    DSizes<Index, NumInputDims> input_block_strides;
    for (Index i = 0; i < NumInputDims; ++i) {
      if (i < chip_dim) {
        input_block_sizes[i] = output_block_sizes[i];
        input_block_strides[i] = output_block_strides[i];
      } else if (i > chip_dim) {
        input_block_sizes[i] = output_block_sizes[i - 1];
        input_block_strides[i] = output_block_strides[i - 1];
      } else {
        input_block_sizes[i] = 1;
      }
    }
    // Fix up input_block_stride for chip dimension.
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      if (chip_dim == 0) {
        input_block_strides[chip_dim] = 1;
      } else {
        input_block_strides[chip_dim] =
            input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
      }
    } else {
      if (chip_dim == NumInputDims - 1) {
        input_block_strides[chip_dim] = 1;
      } else {
        input_block_strides[chip_dim] =
            input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
      }
    }
    // Instantiate and read input block from input tensor.
    InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
                                 input_block_sizes, input_block_strides,
                                 m_inputStrides, output_block->data());
    m_impl.block(&input_block);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
          bool root_of_expr_ast = false) const {
@ -482,7 +416,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
Index m_stride;
|
||||
Index m_inputOffset;
|
||||
Index m_inputStride;
|
||||
DSizes<Index, NumInputDims> m_inputStrides;
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const internal::DimensionId<DimId> m_dim;
|
||||
const Device EIGEN_DEVICE_REF m_device;
|
||||
@ -508,7 +441,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
RawAccess = false
|
||||
|
@ -125,7 +125,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
|
||||
TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
|
||||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
|
||||
@ -325,7 +324,6 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
|
||||
TensorEvaluator<RightArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
|
||||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
|
||||
|
@ -381,7 +381,6 @@ struct TensorContractionEvaluatorBase
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
|
@ -302,7 +302,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
|
||||
#endif
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -309,7 +309,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, Device>::Layout,
|
||||
@ -787,7 +786,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
|
||||
|
@ -242,7 +242,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
|
||||
|
@ -95,7 +95,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<XprType, Device>::Layout,
|
||||
@ -269,7 +268,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
|
||||
|
@ -110,7 +110,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = true,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -120,9 +119,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
|
||||
static const int NumDims = internal::traits<ArgType>::NumDimensions;
|
||||
|
||||
typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout> TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout> TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -173,13 +169,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
m_impl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
|
||||
TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
|
||||
block->tensor_strides(), block->tensor_strides(),
|
||||
m_buffer + block->first_coeff_index());
|
||||
m_impl.block(&eval_to_block);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
|
||||
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
|
||||
// Add `m_buffer` as destination buffer to the block descriptor.
|
||||
@ -216,11 +205,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_buffer != NULL);
|
||||
TensorBlockReader::Run(block, m_buffer);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
// We assume that evalPacket or evalScalar is called to perform the
|
||||
// assignment and account for the cost of the write here.
|
||||
|
@ -45,7 +45,6 @@ struct TensorEvaluator
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Derived::Layout,
|
||||
@ -55,13 +54,6 @@ struct TensorEvaluator
|
||||
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockReader;
|
||||
typedef typename internal::TensorBlockWriter<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockWriter;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -160,11 +152,6 @@ struct TensorEvaluator
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>*) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockReader::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -172,12 +159,6 @@ struct TensorEvaluator
|
||||
return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
|
||||
const TensorBlock& block) {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockWriter::Run(block, m_data);
|
||||
}
|
||||
|
||||
template<typename TensorBlockV2>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
|
||||
const TensorBlockDesc& desc, const TensorBlockV2& block) {
|
||||
@ -263,7 +244,6 @@ struct TensorEvaluator<const Derived, Device>
|
||||
enum {
|
||||
IsAligned = Derived::IsAligned,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<ScalarNoConst>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Derived::Layout,
|
||||
@ -271,11 +251,6 @@ struct TensorEvaluator<const Derived, Device>
|
||||
RawAccess = true
|
||||
};
|
||||
|
||||
typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -348,11 +323,6 @@ struct TensorEvaluator<const Derived, Device>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>*) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_data != NULL);
|
||||
TensorBlockReader::Run(block, m_data);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -404,7 +374,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
||||
&& (PacketType<CoeffReturnType, Device>::size >1)
|
||||
#endif
|
||||
,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -475,7 +444,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<UnaryOp>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -554,24 +522,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
||||
m_argImpl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
if (NumDims <= 0) {
|
||||
output_block->data()[0] = coeff(0);
|
||||
return;
|
||||
}
|
||||
internal::TensorBlockView<ArgType, Device> arg_block(m_device, m_argImpl,
|
||||
*output_block);
|
||||
internal::TensorBlockCwiseUnaryIO<UnaryOp, Index, ScalarNoConst, NumDims,
|
||||
Layout>::Run(m_functor,
|
||||
output_block->block_sizes(),
|
||||
output_block
|
||||
->block_strides(),
|
||||
output_block->data(),
|
||||
arg_block.block_strides(),
|
||||
arg_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -608,8 +558,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
|
||||
TensorEvaluator<RightArgType, Device>::PacketAccess &
|
||||
internal::functor_traits<BinaryOp>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
|
||||
TensorEvaluator<RightArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
|
||||
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
|
||||
@ -713,24 +661,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
||||
m_rightImpl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
if (NumDims <= 0) {
|
||||
output_block->data()[0] = coeff(Index(0));
|
||||
return;
|
||||
}
|
||||
internal::TensorBlockView<LeftArgType, Device> left_block(
|
||||
m_device, m_leftImpl, *output_block);
|
||||
internal::TensorBlockView<RightArgType, Device> right_block(
|
||||
m_device, m_rightImpl, *output_block);
|
||||
internal::TensorBlockCwiseBinaryIO<
|
||||
BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
|
||||
Layout>::Run(m_functor, output_block->block_sizes(),
|
||||
output_block->block_strides(), output_block->data(),
|
||||
left_block.block_strides(), left_block.data(),
|
||||
right_block.block_strides(), right_block.data());
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -768,7 +698,6 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
||||
TensorEvaluator<Arg2Type, Device>::PacketAccess &&
|
||||
TensorEvaluator<Arg3Type, Device>::PacketAccess &&
|
||||
internal::functor_traits<TernaryOp>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
|
||||
TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
|
||||
@ -887,7 +816,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
||||
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
|
||||
TensorEvaluator<ElseArgType, Device>::PacketAccess &
|
||||
PacketType<Scalar, Device>::HasBlend,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<IfArgType, Device>::BlockAccessV2 &&
|
||||
TensorEvaluator<ThenArgType, Device>::BlockAccessV2 &&
|
||||
TensorEvaluator<ElseArgType, Device>::BlockAccessV2,
|
||||
|
@ -153,70 +153,6 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Process all the data with a single cpu thread, using blocks of data. By
|
||||
* sizing a block to fit L1 cache we get better cache performance.
|
||||
*/
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::Legacy> {
|
||||
public:
|
||||
typedef typename traits<Expression>::Scalar Scalar;
|
||||
typedef typename remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
|
||||
typedef typename traits<Expression>::Index StorageIndex;
|
||||
|
||||
static const int NumDims = traits<Expression>::NumDimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE void run(const Expression& expr,
|
||||
const DefaultDevice& device = DefaultDevice()) {
|
||||
typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
|
||||
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
|
||||
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
|
||||
|
||||
Evaluator evaluator(expr, device);
|
||||
Index total_size = array_prod(evaluator.dimensions());
|
||||
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
|
||||
if (total_size < cache_size
|
||||
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
|
||||
// TODO(andydavis) Reduce block management overhead for small tensors.
|
||||
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::Off>::run(expr,device);
|
||||
evaluator.cleanup();
|
||||
return;
|
||||
}
|
||||
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
// Size tensor blocks to fit in cache (or requested target block size).
|
||||
Index block_total_size = numext::mini(cache_size, total_size);
|
||||
TensorBlockShapeType block_shape = kSkewedInnerDims;
|
||||
// Query expression tree for desired block size/shape.
|
||||
std::vector<TensorOpResourceRequirements> resources;
|
||||
evaluator.getResourceRequirements(&resources);
|
||||
MergeResourceRequirements(resources, &block_shape, &block_total_size);
|
||||
|
||||
TensorBlockMapper block_mapper(
|
||||
TensorBlockDimensions(evaluator.dimensions()), block_shape,
|
||||
block_total_size);
|
||||
block_total_size = block_mapper.block_dims_total_size();
|
||||
|
||||
ScalarNoConst* data = static_cast<ScalarNoConst*>(
|
||||
device.allocate(block_total_size * sizeof(Scalar)));
|
||||
|
||||
const StorageIndex total_block_count = block_mapper.total_block_count();
|
||||
for (StorageIndex i = 0; i < total_block_count; ++i) {
|
||||
TensorBlock block = block_mapper.GetBlockForIndex(i, data);
|
||||
evaluator.evalBlock(&block);
|
||||
}
|
||||
device.deallocate(data);
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Process all the data with a single cpu thread, using blocks of data. By
|
||||
* sizing a block to fit L1 cache we get better cache performance.
|
||||
@ -446,59 +382,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::Legacy> {
|
||||
public:
|
||||
typedef typename traits<Expression>::Index StorageIndex;
|
||||
typedef typename traits<Expression>::Scalar Scalar;
|
||||
typedef typename remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
static const int NumDims = traits<Expression>::NumDimensions;
|
||||
|
||||
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
|
||||
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> BlockMapper;
|
||||
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
|
||||
|
||||
static EIGEN_STRONG_INLINE void run(const Expression& expr,
|
||||
const ThreadPoolDevice& device) {
|
||||
Evaluator evaluator(expr, device);
|
||||
Index total_size = array_prod(evaluator.dimensions());
|
||||
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
|
||||
if (total_size < cache_size &&
|
||||
!ExpressionHasTensorBroadcastingOp<Expression>::value) {
|
||||
// TODO(andydavis) Reduce block management overhead for small tensors.
|
||||
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::Off>::run(expr,
|
||||
device);
|
||||
evaluator.cleanup();
|
||||
return;
|
||||
}
|
||||
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
|
||||
if (needs_assign) {
|
||||
const TilingContext tiling =
|
||||
internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
|
||||
Vectorizable>(device, evaluator);
|
||||
|
||||
device.parallelFor(
|
||||
tiling.block_mapper.total_block_count(), tiling.cost,
|
||||
[=, &device, &evaluator, &tiling](StorageIndex firstIdx,
|
||||
StorageIndex lastIdx) {
|
||||
ScalarNoConst* thread_buf =
|
||||
tiling.template GetCurrentThreadBuffer<ScalarNoConst>(device);
|
||||
for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
|
||||
auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf);
|
||||
evaluator.evalBlock(&block);
|
||||
}
|
||||
});
|
||||
device.deallocate(tiling.buffer);
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Expression, bool Vectorizable>
|
||||
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
|
||||
/*Tiling=*/TiledEvaluation::On> {
|
||||
@ -603,91 +486,6 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
|
||||
};
|
||||
};
|
||||
|
||||
template <typename Expression, typename DoneCallback, bool Vectorizable>
|
||||
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
|
||||
Vectorizable, /*Tileable*/ TiledEvaluation::Legacy> {
|
||||
public:
|
||||
typedef typename traits<Expression>::Index StorageIndex;
|
||||
typedef typename traits<Expression>::Scalar Scalar;
|
||||
typedef typename remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
static const int NumDims = traits<Expression>::NumDimensions;
|
||||
|
||||
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
|
||||
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims,
|
||||
Evaluator::Layout>
|
||||
BlockMapper;
|
||||
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
|
||||
|
||||
static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
|
||||
const ThreadPoolDevice& device,
|
||||
DoneCallback done) {
|
||||
TensorAsyncExecutorContext* const ctx =
|
||||
new TensorAsyncExecutorContext(expr, device, std::move(done));
|
||||
|
||||
Index total_size = array_prod(ctx->evaluator.dimensions());
|
||||
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
|
||||
|
||||
if (total_size < cache_size &&
|
||||
!ExpressionHasTensorBroadcastingOp<Expression>::value) {
|
||||
auto delete_ctx = [ctx]() { delete ctx; };
|
||||
internal::TensorAsyncExecutor<
|
||||
Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable,
|
||||
/*Tileable*/ TiledEvaluation::Off>::runAsync(expr, device, std::move(delete_ctx));
|
||||
return;
|
||||
}
|
||||
|
||||
const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
|
||||
if (!need_assign) {
|
||||
delete ctx;
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->tiling =
|
||||
GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(
|
||||
device, ctx->evaluator);
|
||||
|
||||
auto eval_block = [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
|
||||
ScalarNoConst* thread_buf =
|
||||
ctx->tiling.template GetCurrentThreadBuffer<ScalarNoConst>(
|
||||
ctx->device);
|
||||
for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
|
||||
auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf);
|
||||
ctx->evaluator.evalBlock(&block);
|
||||
}
|
||||
};
|
||||
device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(),
|
||||
ctx->tiling.cost, eval_block,
|
||||
[ctx]() { delete ctx; });
|
||||
};
|
||||
|
||||
ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
|
||||
}
|
||||
|
||||
private:
|
||||
struct TensorAsyncExecutorContext {
|
||||
TensorAsyncExecutorContext(const Expression& expr,
|
||||
const ThreadPoolDevice& thread_pool,
|
||||
DoneCallback done)
|
||||
: device(thread_pool),
|
||||
evaluator(expr, thread_pool),
|
||||
on_done(std::move(done)) {}
|
||||
|
||||
~TensorAsyncExecutorContext() {
|
||||
device.deallocate(tiling.buffer);
|
||||
evaluator.cleanup();
|
||||
on_done();
|
||||
}
|
||||
|
||||
const ThreadPoolDevice& device;
|
||||
Evaluator evaluator;
|
||||
TilingContext tiling;
|
||||
|
||||
private:
|
||||
DoneCallback on_done;
|
||||
};
|
||||
};
|
||||
|
||||
template <typename Expression, typename DoneCallback, bool Vectorizable>
|
||||
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
|
||||
Vectorizable, /*Tileable*/ TiledEvaluation::On> {
|
||||
|
@ -133,7 +133,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = true,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -41,7 +41,6 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
|
||||
enum {
|
||||
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
|
||||
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
|
||||
|
@ -96,7 +96,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
|
||||
BlockAccessV2 = internal::is_arithmetic<CoeffReturnType>::value,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -105,11 +104,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
|
||||
|
||||
static const int NumDims = internal::traits<ArgType>::NumDimensions;
|
||||
|
||||
typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
|
||||
TensorBlock;
|
||||
typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout>
|
||||
TensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -185,11 +179,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
|
||||
std::vector<internal::TensorOpResourceRequirements>*) const {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
|
||||
assert(m_buffer != NULL);
|
||||
TensorBlockReader::Run(block, m_buffer);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
|
@ -158,7 +158,6 @@ struct IsVectorizable<GpuDevice, Expression> {
|
||||
enum TiledEvaluation {
|
||||
Off = 0, // tiled evaluation is not supported
|
||||
On = 1, // still work in progress (see TensorBlockV2.h)
|
||||
Legacy = 2 // soon to be deprecated (see TensorBock.h)
|
||||
};
|
||||
|
||||
template <typename Device, typename Expression>
|
||||
@ -166,18 +165,12 @@ struct IsTileable {
|
||||
// Check that block evaluation is supported and it's a preferred option (at
|
||||
// least one sub-expression has much faster block evaluation, e.g.
|
||||
// broadcasting).
|
||||
static const bool BlockAccess =
|
||||
TensorEvaluator<Expression, Device>::BlockAccess &&
|
||||
TensorEvaluator<Expression, Device>::PreferBlockAccess;
|
||||
|
||||
static const bool BlockAccessV2 =
|
||||
TensorEvaluator<Expression, Device>::BlockAccessV2 &&
|
||||
TensorEvaluator<Expression, Device>::PreferBlockAccess;
|
||||
|
||||
static const TiledEvaluation value =
|
||||
BlockAccessV2
|
||||
? TiledEvaluation::On
|
||||
: (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
|
||||
BlockAccessV2 ? TiledEvaluation::On : TiledEvaluation::Off;
|
||||
};
|
||||
|
||||
template <typename Expression, typename Device,
|
||||
|
@ -93,7 +93,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = true,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -183,60 +182,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
||||
Index count;
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
if (NumDims <= 0) return;
|
||||
|
||||
static const bool is_col_major =
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor);
|
||||
|
||||
// Compute spatial coordinates for the first block element.
|
||||
array<Index, NumDims> coords;
|
||||
extract_coordinates(output_block->first_coeff_index(), coords);
|
||||
array<Index, NumDims> initial_coords = coords;
|
||||
|
||||
CoeffReturnType* data = output_block->data();
|
||||
Index offset = 0;
|
||||
|
||||
// Initialize output block iterator state. Dimension in this array are
|
||||
// always in inner_most -> outer_most order (col major layout).
|
||||
array<BlockIteratorState, NumDims> it;
|
||||
for (Index i = 0; i < NumDims; ++i) {
|
||||
const Index dim = is_col_major ? i : NumDims - 1 - i;
|
||||
it[i].size = output_block->block_sizes()[dim];
|
||||
it[i].stride = output_block->block_strides()[dim];
|
||||
it[i].span = it[i].stride * (it[i].size - 1);
|
||||
it[i].count = 0;
|
||||
}
|
||||
eigen_assert(it[0].stride == 1);
|
||||
|
||||
while (it[NumDims - 1].count < it[NumDims - 1].size) {
|
||||
// Generate data for the inner-most dimension.
|
||||
for (Index i = 0; i < it[0].size; ++i) {
|
||||
*(data + offset + i) = m_generator(coords);
|
||||
coords[is_col_major ? 0 : NumDims - 1]++;
|
||||
}
|
||||
coords[is_col_major ? 0 : NumDims - 1] =
|
||||
initial_coords[is_col_major ? 0 : NumDims - 1];
|
||||
|
||||
// For the 1d tensor we need to generate only one inner-most dimension.
|
||||
if (NumDims == 1) break;
|
||||
|
||||
// Update offset.
|
||||
for (Index i = 1; i < NumDims; ++i) {
|
||||
if (++it[i].count < it[i].size) {
|
||||
offset += it[i].stride;
|
||||
coords[is_col_major ? i : NumDims - 1 - i]++;
|
||||
break;
|
||||
}
|
||||
if (i != NumDims - 1) it[i].count = 0;
|
||||
coords[is_col_major ? i : NumDims - 1 - i] =
|
||||
initial_coords[is_col_major ? i : NumDims - 1 - i];
|
||||
offset -= it[i].span;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
|
@ -231,7 +231,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = true,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -541,139 +540,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
|
||||
internal::kSkewedInnerDims, block_total_size_max));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
OutputTensorBlock* output_block) const {
|
||||
typedef internal::ImagePatchCopyOp<Self, PacketAccess> ImagePatchCopyOp;
|
||||
typedef internal::ImagePatchPaddingOp<Self> ImagePatchPaddingOp;
|
||||
|
||||
// Calculate loop limits and various input/output dim sizes.
|
||||
const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes();
|
||||
const bool col_major =
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor);
|
||||
const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1];
|
||||
const Index output_depth_dim_size =
|
||||
m_dimensions[col_major ? 0 : NumDims - 1];
|
||||
const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2];
|
||||
const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2];
|
||||
const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3];
|
||||
const Index block_col_stride = row_dim_size * depth_dim_size;
|
||||
const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4];
|
||||
const Index outer_dim_size =
|
||||
block_sizes.TotalSize() /
|
||||
(depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size);
|
||||
|
||||
const Index patch_size = row_dim_size * col_dim_size * depth_dim_size;
|
||||
const Index batch_size = patch_size * patch_index_dim_size;
|
||||
|
||||
Index output_index = output_block->first_coeff_index();
|
||||
|
||||
// Loop through outer dimensions.
|
||||
for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size;
|
||||
++outer_dim_index) {
|
||||
const Index outer_output_base_index = outer_dim_index * batch_size;
|
||||
// Find the offset of the element wrt the location of the first element.
|
||||
const Index patchIndexStart = output_index / m_fastPatchStride;
|
||||
const Index patchOffset =
|
||||
(output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth;
|
||||
const Index colOffsetStart = patchOffset / m_fastColStride;
|
||||
// Other ways to index this element.
|
||||
const Index otherIndex =
|
||||
(NumDims == 4) ? 0 : output_index / m_fastOtherStride;
|
||||
const Index patch2DIndexStart =
|
||||
(NumDims == 4)
|
||||
? 0
|
||||
: (output_index - otherIndex * m_otherStride) / m_fastPatchStride;
|
||||
// Calculate starting depth index.
|
||||
const Index depth = output_index - (output_index / m_fastOutputDepth) *
|
||||
output_depth_dim_size;
|
||||
const Index patch_input_base_index =
|
||||
depth + otherIndex * m_patchInputStride;
|
||||
|
||||
// Loop through patches.
|
||||
for (Index patch_index_dim_index = 0;
|
||||
patch_index_dim_index < patch_index_dim_size;
|
||||
++patch_index_dim_index) {
|
||||
const Index patch_output_base_index =
|
||||
outer_output_base_index + patch_index_dim_index * patch_size;
|
||||
// Patch index corresponding to the passed in index.
|
||||
const Index patchIndex = patchIndexStart + patch_index_dim_index;
|
||||
const Index patch2DIndex =
|
||||
(NumDims == 4) ? patchIndex
|
||||
: patch2DIndexStart + patch_index_dim_index;
|
||||
const Index colIndex = patch2DIndex / m_fastOutputRows;
|
||||
const Index input_col_base = colIndex * m_col_strides;
|
||||
const Index row_offset_base =
|
||||
(patch2DIndex - colIndex * m_outputRows) * m_row_strides -
|
||||
m_rowPaddingTop;
|
||||
|
||||
// Loop through columns.
|
||||
for (Index col_dim_index = 0; col_dim_index < col_dim_size;
|
||||
++col_dim_index) {
|
||||
const Index col_output_base_index =
|
||||
patch_output_base_index + col_dim_index * block_col_stride;
|
||||
|
||||
// Calculate col index in the input original tensor.
|
||||
Index colOffset = colOffsetStart + col_dim_index;
|
||||
Index inputCol =
|
||||
input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft;
|
||||
Index origInputCol =
|
||||
(m_col_inflate_strides == 1)
|
||||
? inputCol
|
||||
: ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
|
||||
|
||||
bool pad_column = false;
|
||||
if (inputCol < 0 || inputCol >= m_input_cols_eff ||
|
||||
((m_col_inflate_strides != 1) &&
|
||||
(inputCol != origInputCol * m_col_inflate_strides))) {
|
||||
pad_column = true;
|
||||
}
|
||||
|
||||
const Index col_input_base_index =
|
||||
patch_input_base_index + origInputCol * m_colInputStride;
|
||||
const Index input_row_base =
|
||||
row_offset_base +
|
||||
((patchOffset + col_dim_index * output_row_dim_size) -
|
||||
colOffset * m_colStride) *
|
||||
m_in_row_strides;
|
||||
// Loop through rows.
|
||||
for (Index row_dim_index = 0; row_dim_index < row_dim_size;
|
||||
++row_dim_index) {
|
||||
const Index output_base_index =
|
||||
col_output_base_index + row_dim_index * depth_dim_size;
|
||||
bool pad_row = false;
|
||||
Index inputIndex;
|
||||
if (!pad_column) {
|
||||
Index inputRow =
|
||||
input_row_base + row_dim_index * m_in_row_strides;
|
||||
Index origInputRow =
|
||||
(m_row_inflate_strides == 1)
|
||||
? inputRow
|
||||
: ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride)
|
||||
: 0);
|
||||
if (inputRow < 0 || inputRow >= m_input_rows_eff ||
|
||||
((m_row_inflate_strides != 1) &&
|
||||
(inputRow != origInputRow * m_row_inflate_strides))) {
|
||||
pad_row = true;
|
||||
} else {
|
||||
inputIndex =
|
||||
col_input_base_index + origInputRow * m_rowInputStride;
|
||||
}
|
||||
}
|
||||
// Copy (or pad) along depth dimension.
|
||||
if (pad_column || pad_row) {
|
||||
ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue),
|
||||
output_base_index, output_block->data());
|
||||
} else {
|
||||
ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index,
|
||||
output_block->data(), inputIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
output_index += m_otherStride;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
|
||||
{
|
||||
|
@ -92,7 +92,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -119,7 +119,6 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
@ -199,7 +198,6 @@ template<typename ArgType, typename Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
|
||||
|
@ -135,11 +135,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
// TODO(andydavis, wuke) Enable BlockAccess for the general case when the
|
||||
// performance issue with block-based reshape is resolved.
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
|
||||
TensorEvaluator<ArgType, Device>::RawAccess &&
|
||||
NumInputDims > 0 && NumOutputDims > 0,
|
||||
// For trivial reshapes with raw access to underlying data we will provide
|
||||
// zero overhead block access.
|
||||
// TODO(ezhulenev): Consider adding block access without raw access?
|
||||
@ -153,14 +148,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
|
||||
InputTensorBlock;
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
|
||||
OutputTensorBlock;
|
||||
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumOutputDims,
|
||||
Layout>
|
||||
OutputTensorBlockReader;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
|
||||
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
|
||||
@ -177,30 +164,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
// The total size of the reshaped tensor must be equal to the total size
|
||||
// of the input tensor.
|
||||
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
|
||||
|
||||
if (BlockAccess) {
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
|
||||
m_impl.dimensions();
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
m_outputStrides[0] = 1;
|
||||
for (int i = 1; i < NumOutputDims; ++i) {
|
||||
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
|
||||
}
|
||||
m_inputStrides[0] = 1;
|
||||
for (int i = 1; i < NumInputDims; ++i) {
|
||||
m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
|
||||
}
|
||||
} else {
|
||||
m_outputStrides[NumOutputDims - 1] = 1;
|
||||
for (int i = NumOutputDims - 2; i >= 0; --i) {
|
||||
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
|
||||
}
|
||||
m_inputStrides[NumInputDims - 1] = 1;
|
||||
for (int i = NumInputDims - 2; i >= 0; --i) {
|
||||
m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
@ -249,128 +212,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
Index size;
|
||||
Index count;
|
||||
};
|
||||
// TODO(andydavis) Reduce the overhead of this function.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
OutputTensorBlock* output_block) const {
|
||||
if (m_impl.data() != NULL) {
|
||||
OutputTensorBlockReader::Run(output_block, m_impl.data());
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate output block unit-stride inner dimension length.
|
||||
const DSizes<Index, NumOutputDims>& output_block_sizes =
|
||||
output_block->block_sizes();
|
||||
Index output_inner_dim_size = 1;
|
||||
Index output_outer_dim_start = NumOutputDims;
|
||||
for (Index i = 0; i < NumOutputDims; ++i) {
|
||||
const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? i : NumOutputDims - i - 1;
|
||||
output_inner_dim_size *= output_block_sizes[dim];
|
||||
if (output_block_sizes[dim] < m_dimensions[dim]) {
|
||||
output_outer_dim_start = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize output block iterator state.
|
||||
array<BlockIteratorState, NumOutputDims> block_iter_state;
|
||||
|
||||
for (Index i = 0; i < NumOutputDims; ++i) {
|
||||
const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? i : NumOutputDims - i - 1;
|
||||
block_iter_state[i].size = output_block_sizes[dim];
|
||||
block_iter_state[i].stride = m_outputStrides[dim];
|
||||
block_iter_state[i].span =
|
||||
block_iter_state[i].stride * (block_iter_state[i].size - 1);
|
||||
block_iter_state[i].count = 0;
|
||||
}
|
||||
|
||||
const Index output_outer_dim_size = output_block_sizes.TotalSize() /
|
||||
output_inner_dim_size;
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
|
||||
m_impl.dimensions();
|
||||
|
||||
Index index = output_block->first_coeff_index();
|
||||
for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) {
|
||||
Index inner_idx = 0;
|
||||
while (inner_idx < output_inner_dim_size) {
|
||||
// Calculate input coords based on 'index'.
|
||||
array<Index, NumInputDims> input_coords;
|
||||
Index idx = index;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumInputDims - 1; i > 0; --i) {
|
||||
input_coords[i] = idx / m_inputStrides[i];
|
||||
idx -= input_coords[i] * m_inputStrides[i];
|
||||
}
|
||||
input_coords[0] = idx;
|
||||
} else {
|
||||
for (int i = 0; i < NumInputDims - 1; ++i) {
|
||||
input_coords[i] = idx / m_inputStrides[i];
|
||||
idx -= input_coords[i] * m_inputStrides[i];
|
||||
}
|
||||
input_coords[NumInputDims - 1] = idx;
|
||||
}
|
||||
|
||||
// Calculate target input block shape, using at most
|
||||
// 'output_inner_dim_size' coefficients along the input block's inner
|
||||
// dimensions.
|
||||
DSizes<Index, NumInputDims> input_block_sizes;
|
||||
Index num_to_allocate = output_inner_dim_size - inner_idx;
|
||||
for (Index i = 0; i < NumInputDims; ++i) {
|
||||
const Index dim =
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? i : NumInputDims - i - 1;
|
||||
input_block_sizes[dim] = numext::mini(
|
||||
num_to_allocate, (static_cast<Index>(input_dims[dim]) -
|
||||
input_coords[dim]));
|
||||
if (input_coords[dim] == 0) {
|
||||
num_to_allocate /= input_block_sizes[dim];
|
||||
} else {
|
||||
num_to_allocate = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate input block strides.
|
||||
DSizes<Index, NumInputDims> input_block_strides;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
input_block_strides[0] = 1;
|
||||
for (int i = 1; i < NumInputDims; ++i) {
|
||||
input_block_strides[i] = input_block_strides[i - 1] *
|
||||
input_block_sizes[i - 1];
|
||||
}
|
||||
} else {
|
||||
input_block_strides[NumInputDims - 1] = 1;
|
||||
for (int i = NumInputDims - 2; i >= 0; --i) {
|
||||
input_block_strides[i] = input_block_strides[i + 1] *
|
||||
input_block_sizes[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiate and read input block from input tensor.
|
||||
InputTensorBlock input_block(index, input_block_sizes,
|
||||
input_block_strides, m_inputStrides,
|
||||
output_block->data() + outer_idx *
|
||||
output_inner_dim_size + inner_idx);
|
||||
|
||||
m_impl.block(&input_block);
|
||||
|
||||
const Index input_block_total_size = input_block_sizes.TotalSize();
|
||||
index += input_block_total_size;
|
||||
inner_idx += input_block_total_size;
|
||||
}
|
||||
eigen_assert(inner_idx == output_inner_dim_size);
|
||||
index -= output_inner_dim_size;
|
||||
// Update index.
|
||||
for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) {
|
||||
if (++block_iter_state[i].count < block_iter_state[i].size) {
|
||||
index += block_iter_state[i].stride;
|
||||
break;
|
||||
}
|
||||
block_iter_state[i].count = 0;
|
||||
index -= block_iter_state[i].span;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
@ -408,8 +249,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
protected:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
NewDimensions m_dimensions;
|
||||
DSizes<Index, NumOutputDims> m_outputStrides;
|
||||
DSizes<Index, NumInputDims> m_inputStrides;
|
||||
};
|
||||
|
||||
|
||||
@ -426,7 +265,6 @@ template<typename NewDimensions, typename ArgType, typename Device>
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
|
||||
PreferBlockAccess = false,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -619,7 +457,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
// slice offsets and sizes.
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -714,7 +551,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
}
|
||||
}
|
||||
// Use memcpy if it's going to be faster than using the regular evaluation.
|
||||
const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
|
||||
const MemcpyTriggerForSlicing<Index, Device, BlockAccessV2> trigger(m_device);
|
||||
if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
|
||||
EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
|
||||
for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
|
||||
@ -808,16 +645,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
m_impl.getResourceRequirements(resources);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
|
||||
TensorBlock* output_block) const {
|
||||
TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
|
||||
output_block->block_sizes(),
|
||||
output_block->block_strides(),
|
||||
TensorBlockDimensions(m_inputStrides),
|
||||
output_block->data());
|
||||
m_impl.block(&input_block);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
|
||||
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
|
||||
bool /*root_of_expr_ast*/ = false) const {
|
||||
@ -922,7 +749,6 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -1124,7 +950,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
// slice offsets and sizes.
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -1306,7 +1131,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = false,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -98,7 +98,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
enum {
|
||||
IsAligned = true,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -96,7 +96,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
|
@ -584,7 +584,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
|
||||
BlockAccess = false,
|
||||
BlockAccessV2 = false,
|
||||
PreferBlockAccess = true,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
@ -594,11 +593,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
|
||||
|
||||
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
|
||||
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
|
||||
OutputTensorBlock;
|
||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
|
||||
InputTensorBlock;
|
||||
|
||||
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
|
||||
typedef internal::TensorBlockNotImplemented TensorBlockV2;
|
||||
//===--------------------------------------------------------------------===//
|
||||
@ -920,258 +914,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
m_impl.getResourceRequirements(resources);
}

EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
OutputTensorBlock* output_block) const {
// Special case full reductions to avoid input block copy below.
if (NumInputDims == NumReducedDims) {
eigen_assert(output_block->first_coeff_index() == 0);
eigen_assert(output_block->block_sizes().TotalSize() == 1);
Op reducer(m_reducer);
output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
*this, 0, m_numValuesToReduce, reducer);
return;
}

// Calculate input tensor 'slice' required to reduce output block coeffs.
DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
for (int i = 0; i < NumOutputDims; ++i) {
// Clip preserved input dimensions by output block size.
input_slice_sizes[m_output_to_input_dim_map[i]] =
output_block->block_sizes()[i];
}

// Shard input tensor slice into blocks (because it could be large if we
// need to reduce along several dimensions to calculate required output
// coefficients).
const Index max_coeff_count =
numext::mini<Index>(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
input_slice_sizes.TotalSize());

// Calculate max output shard size needed to keep working set of reducers
// in L1, while leaving enough space for reducer overhead and 'PacketSize'
// reductions.
DSizes<Index, NumInputDims> target_input_block_sizes;
CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
&target_input_block_sizes);
// Calculate indices for first preserved dimension.
const Index first_preserved_dim_output_index =
static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumOutputDims - 1;
const Index first_preserved_dim_input_index =
m_output_to_input_dim_map[first_preserved_dim_output_index];
const bool inner_most_dim_preserved =
PreservingInnerMostDims ||
(first_preserved_dim_input_index ==
(static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumInputDims - 1));

// Calculate output block inner/outer dimension sizes.
const Index output_block_inner_dim_size =
output_block->block_sizes()[first_preserved_dim_output_index];
const Index output_block_outer_dim_size =
output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
// Calculate shard size for first preserved dimension.
const Index output_shard_size =
target_input_block_sizes[first_preserved_dim_input_index];
const Index num_output_shards =
(output_block_inner_dim_size + output_shard_size - 1) /
output_shard_size;

// Initialize 'tensor_slice_offsets' from input coords of output index.
DSizes<Index, NumInputDims> tensor_slice_offsets;
GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
&tensor_slice_offsets);

// Store tensor slice offset in first preserved dimension to be used
// to update tensor slice extents in loop below.
const Index first_preserved_dim_offset_start =
tensor_slice_offsets[first_preserved_dim_input_index];

array<BlockIteratorState, NumOutputDims> block_iter_state;

// Initialize state used to iterate through output coefficients
// and update 'tensor_slice_offsets' in outer preserved dims.
for (int i = 0; i < NumOutputDims - 1; ++i) {
const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? i + 1
: NumOutputDims - i - 2;
block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
block_iter_state[i].output_size = output_block->block_sizes()[dim];
block_iter_state[i].output_count = 0;
}

// Allocate input block memory.
ScalarNoConst* input_block_data = static_cast<ScalarNoConst*>(
m_device.allocate(max_coeff_count * sizeof(Scalar)));
// Allocate reducer memory.
const bool packet_reductions_enabled =
(Self::InputPacketAccess & Self::ReducerTraits::PacketAccess);
const Index num_reducers =
(inner_most_dim_preserved && packet_reductions_enabled)
? (output_shard_size / PacketSize + output_shard_size % PacketSize +
PacketSize)
: output_shard_size;
typedef internal::BlockReducer<Self, Op> BlockReducer;
BlockReducer* reducers = static_cast<BlockReducer*>(
m_device.allocate(num_reducers * sizeof(BlockReducer)));

InputDimensions input_tensor_dims(m_impl.dimensions());
for (Index output_outer_index = 0;
output_outer_index < output_block_outer_dim_size;
++output_outer_index) {
for (Index output_shard_index = 0; output_shard_index < num_output_shards;
++output_shard_index) {
// Initialize 'tensor_slice_extents' for this output shard.
DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
for (int i = 0; i < NumInputDims; ++i) {
if (i == first_preserved_dim_input_index) {
// Clip first preserved dim size to output shard size.
tensor_slice_extents[i] = numext::mini(
output_shard_size,
input_slice_sizes[i] - (tensor_slice_offsets[i] -
first_preserved_dim_offset_start));

} else if (!m_reduced[i]) {
// Clip outer preserved dims to size 1, so that we reduce a
// contiguous set of output coefficients.
tensor_slice_extents[i] = 1;
}
}

// Initialize output coefficient reducers.
for (int i = 0; i < num_reducers; ++i) {
new (&reducers[i]) BlockReducer(m_reducer);
}

typedef internal::TensorSliceBlockMapper<ScalarNoConst, Index,
NumInputDims, Layout>
TensorSliceBlockMapper;

// TODO(andydavis) Consider removing 'input_block_stride_order' if we
// find that scattered reads are not worth supporting in
// TensorSliceBlockMapper.
TensorSliceBlockMapper block_mapper(
typename TensorSliceBlockMapper::Dimensions(input_tensor_dims),
tensor_slice_offsets, tensor_slice_extents,
target_input_block_sizes, DimensionList<Index, NumInputDims>());

const Index num_outputs_to_update =
tensor_slice_extents[first_preserved_dim_input_index];
const Index preserved_dim_vector_reducer_count =
(inner_most_dim_preserved && packet_reductions_enabled)
? num_outputs_to_update / PacketSize
: 0;
const Index preserved_dim_vector_coeff_count =
inner_most_dim_preserved
? preserved_dim_vector_reducer_count * PacketSize
: 0;
const Index preserved_dim_reducer_limit =
(inner_most_dim_preserved && packet_reductions_enabled)
? (preserved_dim_vector_reducer_count +
num_outputs_to_update % PacketSize)
: num_outputs_to_update;

const Index total_block_count = block_mapper.total_block_count();
for (Index b = 0; b < total_block_count; ++b) {
InputTensorBlock input_block =
block_mapper.GetBlockForIndex(b, input_block_data);
// Read.
m_impl.block(&input_block);

Index num_values_to_reduce = 1;
for (Index i = 0; i < NumInputDims; ++i) {
if (m_reduced[i]) {
num_values_to_reduce *= input_block.block_sizes()[i];
}
}
// Reduce.
if (inner_most_dim_preserved) {
const Index input_outer_dim_size =
input_block.block_sizes().TotalSize() / num_outputs_to_update;
for (Index input_outer_dim_index = 0;
input_outer_dim_index < input_outer_dim_size;
++input_outer_dim_index) {
const Index input_outer_dim_base =
input_outer_dim_index * num_outputs_to_update;
for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
PacketSize, input_block.data());
}
const Index scalar_reducer_base =
input_outer_dim_base + preserved_dim_vector_coeff_count;
for (Index i = preserved_dim_vector_reducer_count;
i < preserved_dim_reducer_limit; ++i) {
reducers[i].Reduce(scalar_reducer_base + i -
preserved_dim_vector_reducer_count,
1, input_block.data());
}
}
} else {
for (Index i = 0; i < num_outputs_to_update; ++i) {
reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce,
input_block.data());
}
}
}

// Finalize all reducers for this output shard.
const Index output_base_index =
output_outer_index * output_block_inner_dim_size +
output_shard_index * output_shard_size;
if (inner_most_dim_preserved) {
EIGEN_ALIGN_MAX
typename internal::remove_const<CoeffReturnType>::type
values[PacketSize];
for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
const Index reducer_base = output_base_index + i * PacketSize;
internal::pstore<CoeffReturnType, PacketReturnType>(
values, reducers[i].FinalizePacket());
for (Index j = 0; j < PacketSize; ++j) {
output_block->data()[reducer_base + j] = values[j];
}
}
const Index scalar_reducer_base =
output_base_index + preserved_dim_vector_coeff_count;

for (Index i = preserved_dim_vector_reducer_count;
i < preserved_dim_reducer_limit; ++i) {
output_block->data()[scalar_reducer_base + i -
preserved_dim_vector_reducer_count] =
reducers[i].Finalize();
}
} else {
for (int i = 0; i < num_outputs_to_update; ++i) {
output_block->data()[output_base_index + i] =
reducers[i].Finalize();
}
}

// Update 'tensor_slice_offsets' by num outputs for this output shard.
tensor_slice_offsets[first_preserved_dim_input_index] +=
num_outputs_to_update;
}
// Update slice offset for inner preserved dim.
tensor_slice_offsets[first_preserved_dim_input_index] -=
output_block_inner_dim_size;
// Update slice offsets for remaining output dims.
for (int i = 0; i < NumOutputDims - 1; ++i) {
BlockIteratorState& b = block_iter_state[i];
if (++b.output_count < b.output_size) {
++tensor_slice_offsets[b.input_dim];
break;
}
b.output_count = 0;
tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
}
}

// Free memory.
m_device.deallocate(input_block_data);
m_device.deallocate(reducers);
}

EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
@ -141,7 +141,6 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
@ -378,7 +377,6 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
@ -432,7 +430,6 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
RawAccess = false

@ -115,7 +115,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = NumDims > 0,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -248,112 +247,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
internal::kSkewedInnerDims, block_total_size_max));
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
OutputTensorBlock* output_block) const {
if (NumDims <= 0) return;

// TODO(ezhulenev): If underlying tensor expression supports and prefers
// block evaluation we must use it. Currently we use coeff and packet
// access into the underlying tensor expression.
// static const bool useBlockAccessForArgType =
// TensorEvaluator<ArgType, Device>::BlockAccess &&
// TensorEvaluator<ArgType, Device>::PreferBlockAccess;

static const bool isColMajor =
static_cast<int>(Layout) == static_cast<int>(ColMajor);

static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
const bool inner_dim_reversed = m_reverse[inner_dim_idx];

CoeffReturnType* data = output_block->data();
Index block_offset = 0;

Index input_offset = reverseIndex(output_block->first_coeff_index());

// Initialize output block iterator state. Dimension in this array are
// always in inner_most -> outer_most order (col major layout).
array<BlockIteratorState, NumDims> it;
for (Index i = 0; i < NumDims; ++i) {
const Index dim = isColMajor ? i : NumDims - 1 - i;
it[i].size = output_block->block_sizes()[dim];
it[i].count = 0;
it[i].reverse = m_reverse[dim];

it[i].block_stride = output_block->block_strides()[dim];
it[i].block_span = it[i].block_stride * (it[i].size - 1);

it[i].input_stride = m_strides[dim];
it[i].input_span = it[i].input_stride * (it[i].size - 1);

if (it[i].reverse) {
it[i].input_stride = -1 * it[i].input_stride;
it[i].input_span = -1 * it[i].input_span;
}
}

// If multiple inner dimensions have the same reverse flag, check if we can
// merge them into a single virtual inner dimension.
int effective_inner_dim = 0;
for (int i = 1; i < NumDims; ++i) {
if (it[i].reverse != it[effective_inner_dim].reverse) break;
if (it[i].block_stride != it[effective_inner_dim].size) break;
if (it[i].block_stride != numext::abs(it[i].input_stride)) break;

it[i].size = it[effective_inner_dim].size * it[i].size;

it[i].block_stride = 1;
it[i].input_stride = (inner_dim_reversed ? -1 : 1);

it[i].block_span = it[i].block_stride * (it[i].size - 1);
it[i].input_span = it[i].input_stride * (it[i].size - 1);

effective_inner_dim = i;
}

eigen_assert(it[effective_inner_dim].block_stride == 1);
eigen_assert(it[effective_inner_dim].input_stride ==
(inner_dim_reversed ? -1 : 1));

const Index inner_dim_size = it[effective_inner_dim].size;

while (it[NumDims - 1].count < it[NumDims - 1].size) {
// Copy inner-most dimension data from reversed location in input.
Index dst = block_offset;
Index src = input_offset;

// NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
// worse results in benchmarks than a simple coefficient loop.
if (inner_dim_reversed) {
for (Index i = 0; i < inner_dim_size; ++i) {
data[dst] = m_impl.coeff(src);
++dst;
--src;
}
} else {
for (Index i = 0; i < inner_dim_size; ++i) {
data[dst] = m_impl.coeff(src);
++dst;
++src;
}
}

// For the 1d tensor we need to generate only one inner-most dimension.
if ((NumDims - effective_inner_dim) == 1) break;

// Update offset.
for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
if (++it[i].count < it[i].size) {
block_offset += it[i].block_stride;
input_offset += it[i].input_stride;
break;
}
if (i != NumDims - 1) it[i].count = 0;
block_offset -= it[i].block_span;
input_offset -= it[i].input_span;
}
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@ -535,7 +428,6 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,

@ -99,7 +99,6 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,

@ -115,7 +115,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -125,11 +124,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
TensorBlockReader;

//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@ -249,98 +243,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
internal::kUniformAllDims, block_total_size_max));
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
TensorBlock* output_block) const {
if (m_impl.data() != NULL) {
// Fast path: we have direct access to the data, so shuffle as we read.
TensorBlockReader::Run(output_block,
srcCoeff(output_block->first_coeff_index()),
m_inverseShuffle,
m_unshuffledInputStrides,
m_impl.data());
return;
}

// Slow path: read unshuffled block from the input and shuffle in-place.
// Initialize input block sizes using input-to-output shuffle map.
DSizes<Index, NumDims> input_block_sizes;
for (Index i = 0; i < NumDims; ++i) {
input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]];
}

// Calculate input block strides.
DSizes<Index, NumDims> input_block_strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
input_block_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
input_block_strides[i] =
input_block_strides[i - 1] * input_block_sizes[i - 1];
}
} else {
input_block_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
input_block_strides[i] =
input_block_strides[i + 1] * input_block_sizes[i + 1];
}
}
DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
for (int i = 0; i < NumDims; ++i) {
fast_input_block_strides[i] =
internal::TensorIntDivisor<Index>(input_block_strides[i]);
}

// Read input block.
TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
input_block_sizes,
input_block_strides,
Dimensions(m_unshuffledInputStrides),
output_block->data());

m_impl.block(&input_block);

// Naive In-place shuffle: random IO but block size is O(L1 cache size).
// TODO(andydavis) Improve the performance of this in-place shuffle.
const Index total_size = input_block_sizes.TotalSize();
std::vector<bool> bitmap(total_size, false);
ScalarNoConst* data = const_cast<ScalarNoConst*>(output_block->data());
const DSizes<Index, NumDims>& output_block_strides =
output_block->block_strides();
for (Index input_index = 0; input_index < total_size; ++input_index) {
if (bitmap[input_index]) {
// Coefficient at this index has already been shuffled.
continue;
}

Index output_index =
GetBlockOutputIndex(input_index, input_block_strides,
output_block_strides, fast_input_block_strides);
if (output_index == input_index) {
// Coefficient already in place.
bitmap[output_index] = true;
continue;
}

// The following loop starts at 'input_index', and shuffles
// coefficients into their shuffled location at 'output_index'.
// It skips through the array shuffling coefficients by following
// the shuffle cycle starting and ending a 'start_index'.
ScalarNoConst evicted_value;
ScalarNoConst shuffled_value = data[input_index];
do {
evicted_value = data[output_index];
data[output_index] = shuffled_value;
shuffled_value = evicted_value;
bitmap[output_index] = true;
output_index =
GetBlockOutputIndex(output_index, input_block_strides,
output_block_strides, fast_input_block_strides);
} while (output_index != input_index);

data[output_index] = shuffled_value;
bitmap[output_index] = true;
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool root_of_expr_ast = false) const {
@ -462,7 +364,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -471,11 +372,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>

typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
TensorBlockWriter;

//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
@ -502,15 +398,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlock& block) {
eigen_assert(this->m_impl.data() != NULL);
TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()),
this->m_inverseShuffle,
this->m_unshuffledInputStrides, this->m_impl.data());
}

template <typename TensorBlockV2>
template <typename TensorBlockV2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
const TensorBlockDesc& desc, const TensorBlockV2& block) {
eigen_assert(this->m_impl.data() != NULL);
@ -114,7 +114,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -288,7 +287,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented

@ -97,7 +97,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,

@ -183,7 +183,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@ -46,22 +46,6 @@ static DSizes<Index, NumDims> RandomDims() {
return DSizes<Index, NumDims>(dims);
}

/** Dummy data type to test TensorBlock copy ops. */
struct Data {
Data() : value(0) {}
explicit Data(int v) : value(v) { }
int value;
};

bool operator==(const Data& lhs, const Data& rhs) {
return lhs.value == rhs.value;
}

std::ostream& operator<<(std::ostream& os, const Data& d) {
os << "Data: value=" << d.value;
return os;
}

template <typename T>
static T* GenerateRandomData(const Index& size) {
T* data = new T[size];
@ -71,15 +55,6 @@ static T* GenerateRandomData(const Index& size) {
return data;
}

template <>
Data* GenerateRandomData(const Index& size) {
Data* data = new Data[size];
for (int i = 0; i < size; ++i) {
data[i] = Data(internal::random<int>(1, 100));
}
return data;
}

template <int NumDims>
static void Debug(DSizes<Index, NumDims> dims) {
for (int i = 0; i < NumDims; ++i) {
@ -183,84 +158,6 @@ static void test_block_mapper_maps_every_element() {
VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
}

template <typename T, int NumDims, int Layout>
static void test_slice_block_mapper_maps_every_element() {
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
typedef internal::TensorSliceBlockMapper<T, Index, NumDims, Layout> TensorSliceBlockMapper;

DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();

// Make sure that tensor offsets + extents do not overflow.
for (int i = 0; i < NumDims; ++i) {
tensor_slice_offsets[i] =
numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
tensor_slice_extents[i] = numext::mini(
tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
}

// Keep track of elements indices available via block access.
std::set<Index> coeff_set;

int total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());

// Pick a random dimension sizes for the tensor blocks.
DSizes<Index, NumDims> block_sizes;
for (int i = 0; i < NumDims; ++i) {
block_sizes[i] = internal::random<Index>(1, tensor_slice_extents[i]);
}

TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
tensor_slice_extents, block_sizes,
DimensionList<Index, NumDims>());

for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
choose(Layout, NumDims - 1, 0),
&coeff_set);
}

VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
}

template <typename T, int NumDims, int Layout>
static void test_block_io_copy_data_from_source_to_target() {
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
TensorBlockMapper;

typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;

DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
const Index input_tensor_size = input_tensor_dims.TotalSize();

T* input_data = GenerateRandomData<T>(input_tensor_size);
T* output_data = new T[input_tensor_size];

TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));
T* block_data = new T[block_mapper.block_dims_total_size()];

for (int i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
TensorBlockReader::Run(&block, input_data);
TensorBlockWriter::Run(block, output_data);
}

for (int i = 0; i < input_tensor_size; ++i) {
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
}

delete[] input_data;
delete[] output_data;
delete[] block_data;
}

template <int Layout, int NumDims>
static Index GetInputIndex(Index output_index,
const array<Index, NumDims>& output_to_input_dim_map,
@ -304,179 +201,6 @@ static array<Index, NumDims> ComputeStrides(
return strides;
}

template <typename T, int NumDims, int Layout>
static void test_block_io_copy_using_reordered_dimensions() {
typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
TensorBlockMapper;

typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
TensorBlockWriter;

DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
const Index input_tensor_size = input_tensor_dims.TotalSize();

// Create a random input tensor.
T* input_data = GenerateRandomData<T>(input_tensor_size);

// Create a random dimension re-ordering/shuffle.
std::vector<Index> shuffle;
for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
std::random_shuffle(shuffle.begin(), shuffle.end());

DSizes<Index, NumDims> output_tensor_dims;
array<Index, NumDims> input_to_output_dim_map;
array<Index, NumDims> output_to_input_dim_map;
for (Index i = 0; i < NumDims; ++i) {
output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
input_to_output_dim_map[i] = shuffle[i];
output_to_input_dim_map[shuffle[i]] = i;
}

// Random block shape and size.
TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
RandomTargetSize(input_tensor_dims));

T* block_data = new T[block_mapper.block_dims_total_size()];
T* output_data = new T[input_tensor_size];

array<Index, NumDims> input_tensor_strides =
ComputeStrides<Layout, NumDims>(input_tensor_dims);
array<Index, NumDims> output_tensor_strides =
ComputeStrides<Layout, NumDims>(output_tensor_dims);

for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
block.first_coeff_index(), output_to_input_dim_map,
input_tensor_strides, output_tensor_strides);
TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
input_tensor_strides, input_data);
TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map,
input_tensor_strides, output_data);
}

for (int i = 0; i < input_tensor_size; ++i) {
VERIFY_IS_EQUAL(input_data[i], output_data[i]);
}

delete[] input_data;
delete[] block_data;
delete[] output_data;
}

// This is the special case for reading data with reordering, when dimensions
// before/after reordering are the same. Squeezing reads along inner dimensions
// in this case is illegal, because we reorder innermost dimension.
template <int Layout>
static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze()
{
typedef internal::TensorBlock<float, Index, 3, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 3, Layout>
TensorBlockReader;

DSizes<Index, 3> tensor_dims;
tensor_dims[0] = 7;
tensor_dims[1] = 9;
tensor_dims[2] = 7;

DSizes<Index, 3> block_dims = tensor_dims;

DSizes<Index, 3> tensor_to_block_dim_map;
tensor_to_block_dim_map[0] = 2;
tensor_to_block_dim_map[1] = 1;
tensor_to_block_dim_map[2] = 0;

DSizes<Index, 3> tensor_strides(ComputeStrides<Layout, 3>(tensor_dims));
DSizes<Index, 3> block_strides(ComputeStrides<Layout, 3>(block_dims));

const Index tensor_size = tensor_dims.TotalSize();
float* tensor_data = GenerateRandomData<float>(tensor_size);
float* block_data = new float[tensor_size];

TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data);
TensorBlockReader::Run(&block,
0,
tensor_to_block_dim_map,
tensor_strides,
tensor_data);

TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);

for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
float block_value = block_tensor(d2, d1, d0);
float tensor_value = tensor_tensor(d0, d1, d2);
VERIFY_IS_EQUAL(block_value, tensor_value);
}
}
}

delete[] block_data;
delete[] tensor_data;
}

// This is the special case for reading data with reordering, when dimensions
// before/after reordering are the same. Squeezing reads in this case is allowed
// because we reorder outer dimensions.
template <int Layout>
static void test_block_io_copy_using_reordered_dimensions_squeeze()
{
typedef internal::TensorBlock<float, Index, 4, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 4, Layout>
TensorBlockReader;

DSizes<Index, 4> tensor_dims;
tensor_dims[0] = 7;
tensor_dims[1] = 5;
tensor_dims[2] = 9;
tensor_dims[3] = 9;

DSizes<Index, 4> block_dims = tensor_dims;

DSizes<Index, 4> tensor_to_block_dim_map;
tensor_to_block_dim_map[0] = 0;
tensor_to_block_dim_map[1] = 1;
tensor_to_block_dim_map[2] = 3;
tensor_to_block_dim_map[3] = 2;

DSizes<Index, 4> tensor_strides(ComputeStrides<Layout, 4>(tensor_dims));
DSizes<Index, 4> block_strides(ComputeStrides<Layout, 4>(block_dims));

const Index tensor_size = tensor_dims.TotalSize();
float* tensor_data = GenerateRandomData<float>(tensor_size);
float* block_data = new float[tensor_size];

TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data);
TensorBlockReader::Run(&block,
0,
tensor_to_block_dim_map,
tensor_strides,
tensor_data);

TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);

for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
float block_value = block_tensor(d0, d1, d3, d2);
float tensor_value = tensor_tensor(d0, d1, d2, d3);
VERIFY_IS_EQUAL(block_value, tensor_value);
}
}
}
}

delete[] block_data;
delete[] tensor_data;
}

template<typename Scalar, typename StorageIndex, int Dim>
class EqualityChecker
{
@ -510,365 +234,6 @@ public:
}
};

template <int Layout>
static void test_block_io_zero_stride()
{
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;

DSizes<Index, 5> rnd_dims = RandomDims<5>();

DSizes<Index, 5> input_tensor_dims = rnd_dims;
input_tensor_dims[0] = 1;
input_tensor_dims[2] = 1;
input_tensor_dims[4] = 1;
const Index input_tensor_size = input_tensor_dims.TotalSize();
float* input_data = GenerateRandomData<float>(input_tensor_size);

DSizes<Index, 5> output_tensor_dims = rnd_dims;

DSizes<Index, 5> input_tensor_strides(
ComputeStrides<Layout, 5>(input_tensor_dims));
DSizes<Index, 5> output_tensor_strides(
ComputeStrides<Layout, 5>(output_tensor_dims));

DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides);
input_tensor_strides_with_zeros[0] = 0;
input_tensor_strides_with_zeros[2] = 0;
input_tensor_strides_with_zeros[4] = 0;

// Verify that data was correctly read/written from/into the block.
const EqualityChecker<float, Index, 5> verify_is_equal(input_data, input_tensor_dims, input_tensor_strides, output_tensor_dims, output_tensor_strides);

{
float* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
input_tensor_strides_with_zeros, output_data);
TensorBlockReader::Run(&read_block, input_data);
verify_is_equal(output_data);
delete[] output_data;
}

{
float* output_data = new float[output_tensor_dims.TotalSize()];
TensorBlock write_block(0, output_tensor_dims,
input_tensor_strides_with_zeros,
output_tensor_strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
verify_is_equal(output_data);
delete[] output_data;
}

delete[] input_data;
}

template <int Layout>
static void test_block_io_squeeze_ones() {
typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
typedef internal::TensorBlockReader<float, Index, 5, Layout>
TensorBlockReader;
typedef internal::TensorBlockWriter<float, Index, 5, Layout>
TensorBlockWriter;

// Total size > 1.
{
DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
const Index total_size = block_sizes.TotalSize();

// Create a random input tensor.
float* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
}

// Total size == 1.
{
DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
const Index total_size = block_sizes.TotalSize();

// Create a random input tensor.
float* input_data = GenerateRandomData<float>(total_size);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock read_block(0, block_sizes, strides, strides, output_data);
TensorBlockReader::Run(&read_block, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}

{
float* output_data = new float[block_sizes.TotalSize()];
TensorBlock write_block(0, block_sizes, strides, strides, input_data);
TensorBlockWriter::Run(write_block, output_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], input_data[i]);
}
delete[] output_data;
}
}
}

template <typename T, int NumDims, int Layout>
static void test_block_cwise_unary_io_basic() {
typedef internal::scalar_square_op<T> UnaryFunctor;
typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, T, NumDims,
Layout>
TensorBlockCwiseUnaryIO;

DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
T* input_data = GenerateRandomData<T>(total_size);

T* output_data = new T[total_size];
UnaryFunctor functor;
TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
strides, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
}

delete[] input_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_unary_io_squeeze_ones() {
typedef internal::scalar_square_op<float> UnaryFunctor;
typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseUnaryIO;

DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
float* input_data = GenerateRandomData<float>(total_size);

float* output_data = new float[total_size];
UnaryFunctor functor;
TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
strides, input_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
}

delete[] input_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_unary_io_zero_strides() {
typedef internal::scalar_square_op<float> UnaryFunctor;
typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseUnaryIO;

DSizes<Index, 5> rnd_dims = RandomDims<5>();

DSizes<Index, 5> input_sizes = rnd_dims;
input_sizes[0] = 1;
input_sizes[2] = 1;
input_sizes[4] = 1;

DSizes<Index, 5> input_strides(ComputeStrides<Layout, 5>(input_sizes));
input_strides[0] = 0;
input_strides[2] = 0;
input_strides[4] = 0;

// Generate random data.
float* input_data = GenerateRandomData<float>(input_sizes.TotalSize());

DSizes<Index, 5> output_sizes = rnd_dims;
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));

const Index output_total_size = output_sizes.TotalSize();
float* output_data = new float[output_total_size];

UnaryFunctor functor;
TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides,
output_data, input_strides, input_data);
for (int i = 0; i < rnd_dims[0]; ++i) {
for (int j = 0; j < rnd_dims[1]; ++j) {
for (int k = 0; k < rnd_dims[2]; ++k) {
for (int l = 0; l < rnd_dims[3]; ++l) {
for (int m = 0; m < rnd_dims[4]; ++m) {
Index output_index = i * output_strides[0] + j * output_strides[1] +
k * output_strides[2] + l * output_strides[3] +
m * output_strides[4];
Index input_index = i * input_strides[0] + j * input_strides[1] +
k * input_strides[2] + l * input_strides[3] +
m * input_strides[4];
VERIFY_IS_EQUAL(output_data[output_index],
functor(input_data[input_index]));
}
}
}
}
}

delete[] input_data;
delete[] output_data;
}

template <typename T, int NumDims, int Layout>
static void test_block_cwise_binary_io_basic() {
typedef internal::scalar_sum_op<T> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
Layout>
TensorBlockCwiseBinaryIO;

DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
T* left_data = GenerateRandomData<T>(total_size);
T* right_data = GenerateRandomData<T>(total_size);

T* output_data = new T[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
}

delete[] left_data;
delete[] right_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_binary_io_squeeze_ones() {
typedef internal::scalar_sum_op<float> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseBinaryIO;

DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));

const Index total_size = block_sizes.TotalSize();

// Create a random input tensors.
float* left_data = GenerateRandomData<float>(total_size);
float* right_data = GenerateRandomData<float>(total_size);

float* output_data = new float[total_size];
BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
strides, left_data, strides, right_data);
for (int i = 0; i < total_size; ++i) {
VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
}

delete[] left_data;
delete[] right_data;
delete[] output_data;
}

template <int Layout>
static void test_block_cwise_binary_io_zero_strides() {
typedef internal::scalar_sum_op<float> BinaryFunctor;
typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
Layout>
TensorBlockCwiseBinaryIO;

DSizes<Index, 5> rnd_dims = RandomDims<5>();

DSizes<Index, 5> left_sizes = rnd_dims;
left_sizes[0] = 1;
left_sizes[2] = 1;
left_sizes[4] = 1;

DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
left_strides[0] = 0;
left_strides[2] = 0;
left_strides[4] = 0;

DSizes<Index, 5> right_sizes = rnd_dims;
right_sizes[1] = 1;
right_sizes[3] = 1;

DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
right_strides[1] = 0;
right_strides[3] = 0;

// Generate random data.
float* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
float* right_data = GenerateRandomData<float>(right_sizes.TotalSize());

DSizes<Index, 5> output_sizes = rnd_dims;
DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));

const Index output_total_size = output_sizes.TotalSize();
float* output_data = new float[output_total_size];

BinaryFunctor functor;
TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
output_data, left_strides, left_data,
right_strides, right_data);
for (int i = 0; i < rnd_dims[0]; ++i) {
for (int j = 0; j < rnd_dims[1]; ++j) {
for (int k = 0; k < rnd_dims[2]; ++k) {
for (int l = 0; l < rnd_dims[3]; ++l) {
for (int m = 0; m < rnd_dims[4]; ++m) {
Index output_index = i * output_strides[0] + j * output_strides[1] +
k * output_strides[2] + l * output_strides[3] +
m * output_strides[4];
Index left_index = i * left_strides[0] + j * left_strides[1] +
k * left_strides[2] + l * left_strides[3] +
m * left_strides[4];
Index right_index = i * right_strides[0] + j * right_strides[1] +
k * right_strides[2] + l * right_strides[3] +
m * right_strides[4];
VERIFY_IS_EQUAL(
output_data[output_index],
functor(left_data[left_index], right_data[right_index]));
}
}
}
}
}

delete[] left_data;
delete[] right_data;
delete[] output_data;
}

template <int Layout>
static void test_uniform_block_shape()
{
@ -1196,21 +561,6 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
TEST_LAYOUTS(test_block_mapper_sanity);
TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_do_not_squeeze);
TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_squeeze);
TEST_LAYOUTS(test_block_io_zero_stride);
TEST_LAYOUTS(test_block_io_squeeze_ones);
TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic);
TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones);
TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides);
TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
TEST_LAYOUTS(test_uniform_block_shape);
TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, internal::kUniformAllDims);
@ -310,48 +310,6 @@ static void test_execute_shuffle_lvalue(Device d)
} while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
TiledEvaluation Tiling, int Layout>
static void test_execute_reduction(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");

static constexpr int ReducedDims = NumDims - 2;
static constexpr int Options = 0 | Layout;

auto dims = RandomDims<NumDims>(5, 10);
Tensor<T, NumDims, Options, Index> src(dims);
src.setRandom();

// Pick two random and unique reduction dimensions.
int reduction0 = internal::random<int>(0, NumDims - 1);
int reduction1 = internal::random<int>(0, NumDims - 1);
while (reduction0 == reduction1) {
reduction1 = internal::random<int>(0, NumDims - 1);
}

DSizes<Index, 2> reduction_axis;
reduction_axis[0] = reduction0;
reduction_axis[1] = reduction1;

Tensor<T, ReducedDims, Options, Index> golden = src.sum(reduction_axis);

// Now do the reduction using configured tensor executor.
Tensor<T, ReducedDims, Options, Index> dst(golden.dimensions());

auto expr = src.sum(reduction_axis);

using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;

Executor::run(Assign(dst, expr), d);

for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
}
}

template <typename T, int NumDims, typename Device, bool Vectorizable,
TiledEvaluation Tiling, int Layout>
static void test_execute_reshape(Device d)
@ -663,57 +621,34 @@ static void test_async_execute_binary_expr(Device d)
|
||||
#define CALL_SUBTEST_PART(PART) \
|
||||
CALL_SUBTEST_##PART
|
||||
|
||||
#define CALL_SUBTEST_COMBINATIONS_V1(PART, NAME, T, NUM_DIMS) \
|
||||
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
|
||||
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
|
||||
|
||||
// NOTE: Tiling V2 currently implemented for a limited types of expression, and only with default device.
|
||||
#define CALL_SUBTEST_COMBINATIONS_V2(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
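As noted above, only the thread-pool device supports asynchronous expression evaluation. Below is a minimal sketch of what such an asynchronous assignment looks like through the public Tensor API, assuming the device(dev, done_callback) overload that returns a TensorAsyncDevice; the tensor shapes, the expression, and the std::promise-based completion signalling are illustrative assumptions, not code from this diff.

#define EIGEN_USE_THREADS
#include <future>
#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: run a tensor assignment asynchronously on a thread pool and
// block until the completion callback fires.
void async_assign_sketch() {
  Eigen::ThreadPool tp(4);
  Eigen::ThreadPoolDevice tp_device(&tp, 4);

  Eigen::Tensor<float, 3> src(16, 16, 16);
  Eigen::Tensor<float, 3> dst(16, 16, 16);
  src.setRandom();

  std::promise<void> done_promise;
  std::future<void> done = done_promise.get_future();

  // device(dev, done) submits the evaluation to the thread pool and invokes
  // the callback once the assignment has completed.
  dst.device(tp_device, [&done_promise] { done_promise.set_value(); }) =
      src.abs() + src.constant(1.0f);

  done.wait();  // wait for the asynchronous evaluation to finish
}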
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::DefaultDevice default_device;
@ -724,69 +659,64 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
  Eigen::ThreadPool tp(num_threads);
  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);

  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 5);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 5);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4);
  CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 5);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 5);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 5);

  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 5);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);

  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);