Tensor block evaluation V2 support for unary/binary/broadcasting

This commit is contained in:
Eugene Zhulenev 2019-09-24 12:52:45 -07:00
parent efd9867ff0
commit ef9dfee7bd
40 changed files with 1162 additions and 169 deletions

View File

@ -78,9 +78,6 @@ struct conditional { typedef Then type; };
template<typename Then, typename Else>
struct conditional <false, Then, Else> { typedef Else type; };
template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };
template<typename T> struct remove_reference { typedef T type; };
template<typename T> struct remove_reference<T&> { typedef T type; };
@ -115,6 +112,12 @@ template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
template<typename T, typename U> struct is_same { enum { value = 0 }; };
template<typename T> struct is_same<T,T> { enum { value = 1 }; };
template< class T >
struct is_void : is_same<void, typename remove_const<T>::type> {};
#if EIGEN_HAS_CXX11
template<> struct is_arithmetic<signed long long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };

View File

@ -114,6 +114,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorBlockV2.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"

View File

@ -89,12 +89,17 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device) { }
@ -226,12 +231,17 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_orig_impl(op.expression(), device),
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),

View File

@ -109,12 +109,17 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
: m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
m_stride_div(op.stride_div()){}

View File

@ -110,6 +110,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@ -120,6 +122,18 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
RightTensorBlock;
typedef internal::TensorBlockAssignment<
Scalar, NumDims, typename RightTensorBlock::XprType, Index>
TensorBlockAssignment;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
@ -214,6 +228,29 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_leftImpl.writeBlock(*block);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != NULL) {
// If destination has raw data access, we pass it as a potential
// destination for a block descriptor evaluation.
desc.AddDestinationBuffer(
/*dst_base=*/m_leftImpl.data() + desc.offset(),
/*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()),
/*total_dst_bytes=*/
(internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
}
RightTensorBlock block = m_rightImpl.blockV2(desc, scratch);
// If the block was evaluated into a destination buffer, there is no need
// to do the assignment.
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
m_leftImpl.writeBlockV2(desc, block);
}
block.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {

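For context, a minimal sketch of how this new `evalBlockV2` entry point is driven. It mirrors the tiled `DefaultDevice` executor added later in this commit, and assumes an already constructed `evaluator` (for the assignment expression), a `block_mapper` for the output dimensions, and a `device`:

TensorBlockScratch scratch(device);
const StorageIndex block_count = block_mapper.total_block_count();
for (StorageIndex i = 0; i < block_count; ++i) {
  // Describe the i-th block by its first coefficient index and block sizes.
  TensorBlock block = block_mapper.GetBlockForIndex(i, /*data=*/NULL);
  TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
  // May write straight into the lhs buffer through the destination descriptor.
  evaluator.evalBlockV2(desc, scratch);
  scratch.reset();  // reuse scratch allocations for the next block
}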
View File

@ -1025,6 +1025,11 @@ class TensorBlockMapper {
return m_block_dim_sizes.TotalSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&
block_dim_sizes() const {
return m_block_dim_sizes;
}
private:
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
const TensorBlockShapeType block_shape,

View File

@ -115,6 +115,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@ -131,11 +132,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// We do block based broadcasting using a trick with 2x tensor rank and 0
// strides. See block method implementation for details.
typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
BroadcastTensorBlock;
typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
BroadcastTensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
ArgTensorBlock;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
Layout, Index>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: isCopy(false), nByOne(false), oneByN(false),
@ -867,6 +881,292 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
static const bool
is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
// Return a block with a single scalar.
if (NumDims <= 0) return scalarBlock(scratch);
// Because we only support kSkewedInnerDims blocking, the block size should be
// equal to m_dimensions for the inner dims, smaller than m_dimensions[i] for
// the first outer dim, and 1 for the remaining outer dims. This is guaranteed
// by MergeResourceRequirements() in TensorBlock.h.
const Dimensions& output_dims = desc.dimensions();
const Dimensions output_strides = internal::strides<Layout>(output_dims);
// Find where outer dims start.
int outer_dim_start = 0;
Index outer_dim_size = 1;
Index inner_dim_size = 1;
for (int i = 0; i < NumDims; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
if (i > outer_dim_start) {
eigen_assert(output_dims[dim] == 1);
} else if (output_dims[dim] != m_dimensions[dim]) {
eigen_assert(output_dims[dim] < m_dimensions[dim]);
outer_dim_size = output_dims[dim];
} else {
inner_dim_size *= output_dims[dim];
++outer_dim_start;
}
}
if (inner_dim_size == 0 || outer_dim_size == 0) {
return emptyBlock();
}
const Dimensions& input_dims = Dimensions(m_impl.dimensions());
// Pre-fill input_block_sizes, bcast_block_sizes, bcast_block_strides, and
// bcast_input_strides. Later on we will only modify the outer_dim_start-th
// dimension in these arrays.
// Calculate the block sizes for reading from the input tensor.
Dimensions input_block_sizes;
for (int i = 0; i < outer_dim_start; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
input_block_sizes[dim] = input_dims[dim];
}
for (int i = outer_dim_start; i < NumDims; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
input_block_sizes[dim] = 1;
}
Dimensions input_block_strides =
internal::strides<Layout>(input_block_sizes);
// Broadcast with the 0-stride trick: Create 1 extra dim for each
// broadcast, set the input stride to 0.
//
// When ColMajor:
//
// - bcast_block_sizes:
// [d_0, b_0, d_1, b_1, ...]
//
// - bcast_block_strides:
// [output_block_strides[0], output_block_strides[0] * d_0,
// output_block_strides[1], output_block_strides[1] * d_1,
// ...]
//
// - bcast_input_strides:
// [input_block_strides[0], 0,
// input_block_strides[1], 0,
// ...].
//
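// As an illustrative worked example (shapes made up, assuming the block covers
// the whole output): a ColMajor input of dims [2, 3] broadcast by [2, 2] gives
// output dims [4, 6], output strides [1, 4] and input block strides [1, 2], so
// the arrays built below become
//
//   bcast_block_sizes   = [2, 2, 3, 2]   // [d_0, b_0, d_1, b_1]
//   bcast_block_strides = [1, 2, 4, 12]
//   bcast_input_strides = [1, 0, 2, 0]
//
// i.e. each broadcast dimension re-reads the input with a zero stride.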
BroadcastDimensions bcast_block_sizes;
BroadcastDimensions bcast_block_strides;
BroadcastDimensions bcast_input_strides;
for (int i = 0; i < outer_dim_start; ++i) {
const int dim = is_col_major ? i : NumDims - i - 1;
const int copy_dim = is_col_major ? 2 * i : 2 * NumDims - 2 * i - 1;
const int broadcast_dim = is_col_major ? copy_dim + 1 : copy_dim - 1;
bcast_block_sizes[copy_dim] = input_dims[dim];
bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
bcast_block_strides[copy_dim] = output_strides[dim];
bcast_block_strides[broadcast_dim] =
output_strides[dim] * input_dims[dim];
bcast_input_strides[copy_dim] = input_block_strides[dim];
bcast_input_strides[broadcast_dim] = 0;
}
for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
const int dim = is_col_major ? i : 2 * NumDims - i - 1;
bcast_block_sizes[dim] = 1;
bcast_block_strides[dim] = 0;
bcast_input_strides[dim] = 0;
}
const int outer_dim =
is_col_major ? outer_dim_start : NumDims - outer_dim_start - 1;
// Check if we can reuse `desc` destination, or allocate new scratch buffer.
ScalarNoConst* materialized_output =
desc.template destination<ScalarNoConst, Layout>();
bool materialized_in_output;
if (materialized_output != NULL) {
desc.DropDestinationBuffer();
materialized_in_output = true;
} else {
materialized_in_output = false;
const size_t materialized_output_size = desc.size() * sizeof(Scalar);
void* output_scratch_mem = scratch.allocate(materialized_output_size);
materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
}
size_t materialized_input_size = 0;
ScalarNoConst* materialized_input = NULL;
if (outer_dim_size == 1) {
// We just need a single block read, using the values already set up above.
BroadcastBlockV2(
input_block_sizes, input_block_strides, bcast_block_sizes,
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input, &materialized_input_size);
} else if (input_dims[outer_dim] == 1) {
// Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
const int broadcast_outer_dim =
is_col_major ? 2 * outer_dim_start + 1
: 2 * NumDims - 2 * outer_dim_start - 2;
bcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] = output_strides[outer_dim];
BroadcastBlockV2(
input_block_sizes, input_block_strides, bcast_block_sizes,
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input, &materialized_input_size);
} else {
// The general case. Let's denote the output block as x[...,
// a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
// the outer_dim_start-th dimension (< NumDims). We need to split the
// a:a+outer_dim_size into possibly 3 sub-blocks:
//
// (1) a:b, where b is the smallest multiple of
// input_dims[outer_dim_start] in [a, a+outer_dim_size].
//
// (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
// in [a, a+outer_dim_size].
//
// (3) c:a+outer_dim_size .
//
// Or, when b and c do not exist, we just need to process the whole block
// together.
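// Illustrative numbers (made up): with input_dims[outer_dim] = 4, a = 6 and
// outer_dim_size = 9 (the slice 6:15), b = divup(6, 4) * 4 = 8 and
// c = 15 / 4 * 4 = 12, giving three sub-blocks: 6:8 (partial head copy),
// 8:12 (one full input period, broadcast (12 - 8) / 4 = 1 time) and
// 12:15 (partial tail copy).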
// Find a.
const Index outer_dim_left_index =
desc.offset() / m_outputStrides[outer_dim];
// Find b and c.
const Index input_outer_dim_size = input_dims[outer_dim];
// The first multiple of input_outer_dim_size at or after a. This is b when
// it is <= outer_dim_left_index + outer_dim_size.
const Index first_multiple =
divup<Index>(outer_dim_left_index, input_outer_dim_size) *
input_outer_dim_size;
if (first_multiple <= outer_dim_left_index + outer_dim_size) {
// b exists, so does c. Find it.
const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
input_outer_dim_size * input_outer_dim_size;
const int copy_outer_dim = is_col_major
? 2 * outer_dim_start
: 2 * NumDims - 2 * outer_dim_start - 1;
const int broadcast_outer_dim =
is_col_major ? 2 * outer_dim_start + 1
: 2 * NumDims - 2 * outer_dim_start - 2;
if (first_multiple > outer_dim_left_index) {
const Index head_size = first_multiple - outer_dim_left_index;
input_block_sizes[outer_dim] = head_size;
bcast_block_sizes[copy_outer_dim] = head_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
bcast_block_sizes[broadcast_outer_dim] = 1;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] =
output_strides[outer_dim] * input_dims[outer_dim];
BroadcastBlockV2(input_block_sizes, input_block_strides,
bcast_block_sizes, bcast_block_strides,
bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input,
&materialized_input_size);
}
if (first_multiple < last_multiple) {
input_block_sizes[outer_dim] = input_outer_dim_size;
bcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
bcast_block_sizes[broadcast_outer_dim] =
(last_multiple - first_multiple) / input_outer_dim_size;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] =
output_strides[outer_dim] * input_dims[outer_dim];
const Index offset = (first_multiple - outer_dim_left_index) *
m_outputStrides[outer_dim];
BroadcastBlockV2(input_block_sizes, input_block_strides,
bcast_block_sizes, bcast_block_strides,
bcast_input_strides, offset, desc, scratch,
materialized_output, &materialized_input,
&materialized_input_size);
}
if (last_multiple < outer_dim_left_index + outer_dim_size) {
const Index tail_size =
outer_dim_left_index + outer_dim_size - last_multiple;
input_block_sizes[outer_dim] = tail_size;
bcast_block_sizes[copy_outer_dim] = tail_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
bcast_block_sizes[broadcast_outer_dim] = 1;
bcast_input_strides[broadcast_outer_dim] = 0;
bcast_block_strides[broadcast_outer_dim] =
output_strides[outer_dim] * input_dims[outer_dim];
const Index offset = (last_multiple - outer_dim_left_index) *
m_outputStrides[outer_dim];
BroadcastBlockV2(input_block_sizes, input_block_strides,
bcast_block_sizes, bcast_block_strides,
bcast_input_strides, offset, desc, scratch,
materialized_output, &materialized_input,
&materialized_input_size);
}
} else {
// b and c do not exist.
const int copy_outer_dim = is_col_major
? 2 * outer_dim_start
: 2 * NumDims - 2 * outer_dim_start - 1;
input_block_sizes[outer_dim] = outer_dim_size;
bcast_block_sizes[copy_outer_dim] = outer_dim_size;
bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
BroadcastBlockV2(
input_block_sizes, input_block_strides, bcast_block_sizes,
bcast_block_strides, bcast_input_strides, 0, desc, scratch,
materialized_output, &materialized_input, &materialized_input_size);
}
}
return TensorBlockV2(materialized_in_output
? internal::TensorBlockKind::kMaterializedInOutput
: internal::TensorBlockKind::kMaterializedInScratch,
materialized_output,
desc.dimensions());
}
// This is a special case for `NumDims == 0`. In practice it should not
// happen often, so it's fine to do a memory allocation just for a scalar.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
scalarBlock(TensorBlockScratch& scratch) const {
void* mem = scratch.allocate(sizeof(Scalar));
ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
*buf = m_impl.coeff(0);
DSizes<Index, NumDims> dimensions;
for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, buf,
dimensions);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
DSizes<Index, NumDims> dimensions;
for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
return TensorBlockV2(internal::TensorBlockKind::kView, NULL, dimensions);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@ -901,6 +1201,73 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlockV2(
const Dimensions& input_block_sizes,
const Dimensions& input_block_strides,
const BroadcastDimensions& bcast_block_sizes,
const BroadcastDimensions& bcast_block_strides,
const BroadcastDimensions& bcast_input_strides, Index offset,
const TensorBlockDesc& output_desc, TensorBlockScratch& scratch,
ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
size_t* materialized_input_size) const {
// ---------------------------------------------------------------------- //
// Tensor block descriptor for reading block from the input.
const Index input_offset = output_desc.offset() + offset;
static const bool is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
TensorBlockDesc input_desc(is_col_major
? indexColMajor(input_offset)
: indexRowMajor(input_offset),
input_block_sizes);
ArgTensorBlock input_block = m_impl.blockV2(input_desc, scratch);
// ---------------------------------------------------------------------- //
// Materialize input block into a temporary memory buffer only if it's not
// already available in the arg block.
const ScalarNoConst* input_buffer = NULL;
if (input_block.data() != NULL) {
// The input block already has raw data, so there is no need to materialize it.
input_buffer = input_block.data();
} else {
// Otherwise we have to do block assignment into a temporary buffer.
// Maybe reuse previously allocated buffer, or allocate a new one with a
// scratch allocator.
const size_t input_total_size = input_block_sizes.TotalSize();
if (*materialized_input == NULL ||
*materialized_input_size < input_total_size) {
*materialized_input_size = input_total_size;
void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
*materialized_input = static_cast<ScalarNoConst*>(mem);
}
typedef internal::TensorBlockAssignment<
ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
TensorBlockAssignment;
typename TensorBlockAssignment::Dst assignment_dst(
input_block_sizes, input_block_strides, *materialized_input);
TensorBlockAssignment::Run(assignment_dst, input_block.expr());
input_buffer = *materialized_input;
}
// ---------------------------------------------------------------------- //
// Copy data from materialized input block to the materialized output, using
// given broadcast strides (strides with zeroes).
typedef internal::TensorBlockIOV2<ScalarNoConst, Index, 2 * NumDims, Layout>
TensorBlockIOV2;
typename TensorBlockIOV2::Src src(bcast_input_strides, input_buffer);
typename TensorBlockIOV2::Dst dst(bcast_block_sizes, bcast_block_strides,
materialized_output + offset);
TensorBlockIOV2::Copy(dst, src);
}
protected:
const Device EIGEN_DEVICE_REF m_device;
const typename internal::remove_reference<Broadcast>::type m_broadcast;

View File

@ -149,6 +149,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
Layout = TensorEvaluator<ArgType, Device>::Layout,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
// Chipping of outer-most dimension is a trivial operation, because we can
// read and write directly from the underlying tensor using single offset.
IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
@ -169,6 +170,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{

View File

@ -125,11 +125,16 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
{
@ -318,11 +323,16 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
: Base(op, device)
{

View File

@ -376,12 +376,17 @@ struct TensorContractionEvaluatorBase
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
// Most of the code is assuming that both input tensors are ColMajor. If the
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
// If we want to compute A * B = C, where A is LHS and B is RHS, the code

View File

@ -303,11 +303,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
#endif
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{

View File

@ -310,12 +310,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
@ -783,12 +788,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{

View File

@ -243,12 +243,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{

View File

@ -96,12 +96,17 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{
@ -265,12 +270,17 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{

View File

@ -383,8 +383,17 @@ struct DSizes : array<DenseIndex, NumDims> {
}
};
template <typename IndexType, int NumDims>
std::ostream& operator<<(std::ostream& os,
const DSizes<IndexType, NumDims>& dims) {
os << "[";
for (int i = 0; i < NumDims; ++i) {
if (i > 0) os << ", ";
os << dims[i];
}
os << "]";
return os;
}
// Boilerplate
namespace internal {

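A quick usage illustration of the `DSizes` stream operator added above (the dimension values are made up):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
  std::cout << dims << std::endl;  // prints "[2, 3, 4]"
  return 0;
}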
View File

@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -124,6 +125,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}

View File

@ -43,13 +43,14 @@ struct TensorEvaluator
internal::traits<Derived>::NumDimensions : 0;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
BlockAccessV2 = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
@ -62,6 +63,10 @@ struct TensorEvaluator
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlockWriter;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
m_dims(m.dimensions()),
@ -162,6 +167,22 @@ struct TensorEvaluator
TensorBlockWriter::Run(block, m_data);
}
template<typename TensorBlockV2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
const TensorBlockDesc& desc, const TensorBlockV2& block) {
assert(m_data != NULL);
typedef typename TensorBlockV2::XprType TensorBlockExpr;
typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
Index>
TensorBlockAssign;
typename TensorBlockAssign::Dst dst(desc.dimensions(),
internal::strides<Layout>(m_dims),
m_data, desc.offset());
TensorBlockAssign::Run(dst, block.expr());
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
@ -220,28 +241,43 @@ struct TensorEvaluator<const Derived, Device>
typedef StorageMemory<const Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
internal::traits<Derived>::NumDimensions : 0;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
BlockAccessV2 = internal::is_arithmetic<ScalarNoConst>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::TensorBlock<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
TensorBlock;
typedef typename internal::TensorBlockReader<
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumCoords, Layout>
TensorBlockIO;
typedef typename TensorBlockIO::Dst TensorBlockIODst;
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
Layout, Index>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
@ -310,6 +346,67 @@ struct TensorEvaluator<const Derived, Device>
TensorBlockReader::Run(block, m_data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
assert(m_data != NULL);
// TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval.
// If a tensor block descriptor covers a contiguous block of the underlying
// memory, we can skip the block buffer allocation and construct a block
// directly from the existing `m_data` memory buffer.
//
// Example: (RowMajor layout)
// m_dims: [11, 12, 13, 14]
// desc.dimensions(): [1, 1, 3, 14]
//
// In this case we can construct a TensorBlock starting at
// `m_data + desc.offset()`, with `desc.dimensions()` as the block sizes.
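// For contrast (illustrative only): with the same m_dims, a descriptor of
// dimensions [1, 2, 3, 14] cannot be mapped onto a contiguous range (the outer
// dimension of size 2 is not 1), so the else-branch below copies the block
// into a scratch buffer through TensorBlockIO.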
static const bool
is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
// Find out how many inner dimensions have a matching size.
int num_matching_inner_dims = 0;
for (int i = 0; i < NumCoords; ++i) {
int dim = is_col_major ? i : NumCoords - i - 1;
if (m_dims[dim] != desc.dimensions()[dim]) break;
++num_matching_inner_dims;
}
// All the outer dimensions must be of size `1`, except a single dimension
// before the matching inner dimension (`3` in the example above).
bool can_use_direct_access = true;
for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) {
int dim = is_col_major ? i : NumCoords - i - 1;
if (desc.dimension(dim) != 1) {
can_use_direct_access = false;
break;
}
}
if (can_use_direct_access) {
EvaluatorPointerType block_start = m_data + desc.offset();
return TensorBlockV2(internal::TensorBlockKind::kView, block_start,
desc.dimensions());
} else {
void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
ScalarNoConst* block_buffer = static_cast<ScalarNoConst*>(mem);
TensorBlockIOSrc src(internal::strides<Layout>(m_dims), m_data,
desc.offset());
TensorBlockIODst dst(desc.dimensions(),
internal::strides<Layout>(desc.dimensions()),
block_buffer);
TensorBlockIO::Copy(dst, src);
return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch,
block_buffer, desc.dimensions());
}
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
@ -355,12 +452,17 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
#endif
,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
@ -421,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
internal::functor_traits<UnaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -446,6 +549,17 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
ArgTensorBlock;
typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
@ -505,6 +619,11 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
arg_block.data());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
@ -537,6 +656,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
internal::functor_traits<BinaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@ -571,6 +692,20 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorEvaluator<LeftArgType, Device>::Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlockV2
LeftTensorBlock;
typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
RightTensorBlock;
typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
RightTensorBlock>
TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
@ -642,6 +777,13 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
right_block.block_strides(), right_block.data());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
desc.DropDestinationBuffer();
return TensorBlockV2(m_leftImpl.blockV2(desc, scratch),
m_rightImpl.blockV2(desc, scratch), m_functor);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
@ -670,6 +812,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
@ -709,6 +852,10 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use arg2 or arg3 dimensions if they are known at compile time.
@ -780,6 +927,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
PacketType<Scalar, Device>::HasBlend,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -805,6 +953,10 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use then or else impl instead if they happen to be known at compile time.

View File

@ -23,7 +23,7 @@ namespace Eigen {
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
* @tparam Tileable can use block based tensor evaluation
* @tparam Tiling can use block based tensor evaluation
* (see TensorBlock.h)
*/
namespace internal {
@ -76,8 +76,13 @@ struct ExpressionHasTensorBroadcastingOp<
* Default strategy: the expression is evaluated sequentially with a single cpu
* thread, without vectorization and block evaluation.
*/
#if EIGEN_HAS_CXX11
template <typename Expression, typename Device, bool Vectorizable,
bool Tileable>
TiledEvaluation Tiling>
#else
template <typename Expression, typename Device, bool Vectorizable,
TiledEvaluation::TiledEvaluation Tiling>
#endif
class TensorExecutor {
public:
typedef typename Expression::Index StorageIndex;
@ -109,8 +114,8 @@ class TensorAsyncExecutor {};
* Process all the data with a single cpu thread, using vectorized instructions.
*/
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
/*Tileable*/ false> {
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
/*Tiling=*/TiledEvaluation::Off> {
public:
typedef typename Expression::Index StorageIndex;
@ -152,7 +157,7 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tileable*/ true> {
/*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
@ -176,8 +181,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
if (total_size < cache_size
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tileable*/ false>::run(expr, device);
internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
evaluator.cleanup();
return;
}
@ -211,6 +215,70 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
}
};
/**
* Process all the data with a single cpu thread, using blocks of data. By
* sizing a block to fit L1 cache we get better cache performance.
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
typedef typename traits<Expression>::Index StorageIndex;
static const int NumDims = traits<Expression>::NumDimensions;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const DefaultDevice& device = DefaultDevice()) {
typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
typedef internal::TensorBlockDescriptor<NumDims> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<DefaultDevice>
TensorBlockScratch;
Evaluator evaluator(expr, device);
Index total_size = array_prod(evaluator.dimensions());
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
// TODO(ezhulenev): Do not use tiling for small tensors?
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
// Size tensor blocks to fit in cache (or requested target block size).
Index block_total_size = numext::mini(cache_size, total_size);
TensorBlockShapeType block_shape = kSkewedInnerDims;
// Query expression tree for desired block size/shape.
std::vector<TensorOpResourceRequirements> resources;
evaluator.getResourceRequirements(&resources);
MergeResourceRequirements(resources, &block_shape, &block_total_size);
TensorBlockMapper block_mapper(
TensorBlockDimensions(evaluator.dimensions()), block_shape,
block_total_size);
block_total_size = block_mapper.block_dims_total_size();
// Share scratch memory allocator between all blocks.
TensorBlockScratch scratch(device);
const StorageIndex total_block_count = block_mapper.total_block_count();
for (StorageIndex i = 0; i < total_block_count; ++i) {
TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
evaluator.evalBlockV2(desc, scratch);
scratch.reset();
}
}
evaluator.cleanup();
}
};
/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
@ -256,10 +324,11 @@ struct TensorExecutorTilingContext {
};
// Computes block evaluation parameters and allocates a temporary memory buffer
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below.
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const ThreadPoolDevice& device, const Evaluator& evaluator) {
const ThreadPoolDevice& device, const Evaluator& evaluator,
bool allocate_buffer = true) {
// Prefer blocks skewed toward inner dimension.
TensorBlockShapeType block_shape = kSkewedInnerDims;
Index block_total_size = 0;
@ -284,7 +353,13 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const size_t aligned_blocksize =
align *
divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
// TODO(ezhulenev): In the new block evaluation framework there is no need to
// allocate temporary buffers; remove this after migration.
void* buf = NULL;
if (allocate_buffer) {
buf = device.allocate((num_threads + 1) * aligned_blocksize);
}
return {block_mapper, cost * block_size, buf, aligned_blocksize};
}
@ -344,8 +419,8 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
}
};
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
@ -369,7 +444,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
};
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Index StorageIndex;
typedef typename traits<Expression>::Scalar Scalar;
@ -387,11 +463,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
Index total_size = array_prod(evaluator.dimensions());
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
if (total_size < cache_size
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
if (total_size < cache_size &&
!ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tileable*/ false>::run(expr, device);
/*Tiling=*/TiledEvaluation::Off>::run(expr,
device);
evaluator.cleanup();
return;
}
@ -419,6 +496,57 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
}
};
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
static const int NumDims = traits<Expression>::NumDimensions;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef TensorBlockMapper<ScalarNoConst, IndexType, NumDims,
Evaluator::Layout>
BlockMapper;
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
typedef internal::TensorBlockDescriptor<NumDims, IndexType>
TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
TensorBlockScratch;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const TilingContext tiling =
internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
Vectorizable>(
device, evaluator, /*allocate_buffer=*/false);
auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
IndexType lastBlockIdx) {
TensorBlockScratch scratch(device);
for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
auto block = tiling.block_mapper.GetBlockForIndex(block_idx, nullptr);
TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
evaluator.evalBlockV2(desc, scratch);
scratch.reset();
}
};
device.parallelFor(tiling.block_mapper.total_block_count(), tiling.cost,
eval_block);
}
evaluator.cleanup();
}
};
template <typename Expression, typename DoneCallback, bool Vectorizable,
bool Tileable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
@ -562,8 +690,8 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
@ -612,8 +740,8 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) {
}
/*static*/
template <typename Expression, bool Vectorizable, bool Tileable>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
@ -711,8 +839,8 @@ struct ExecExprFunctorKernel<Expr, false, Evaluator>
range_, vectorizable_threads_, evaluator) {}
};
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index Index;
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {

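To tie the executor changes back to the public API, here is a small self-contained example (tensor names and shapes are made up) of an expression with unary, binary and broadcasting nodes. With the new `TiledEvaluation::On` specializations such an assignment can be evaluated block by block through `evalBlockV2` instead of coefficient by coefficient:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(128, 128), c(128, 128);
  a.setRandom();
  b.setRandom();
  Eigen::array<Eigen::Index, 2> bcast = {2, 2};
  // Broadcasting sets PreferBlockAccess, so block evaluation is preferred for
  // the whole expression tree when all evaluators support it.
  c = (a.broadcast(bcast) + b).exp();
  return 0;
}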
View File

@ -138,12 +138,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {

View File

@ -42,12 +42,17 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
typedef Dimensions_ Dimensions;
static const std::size_t NumIndices = Dimensions::count;

View File

@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
@ -109,6 +110,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_op(op.expression()),
m_device(device), m_buffer(NULL)
@ -132,13 +137,13 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
#endif
typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
EvalTo evalToTmp(m_device.get(m_buffer), m_op);
const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
internal::TensorExecutor<const EvalTo,
typename internal::remove_const<Device>::type,
Vectorize, Tile>::run(evalToTmp, m_device);
internal::TensorExecutor<
const EvalTo, typename internal::remove_const<Device>::type,
/*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
/*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
run(evalToTmp, m_device);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {

View File

@ -154,23 +154,61 @@ struct IsVectorizable<GpuDevice, Expression> {
TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
// Tiled evaluation strategy.
#if !EIGEN_HAS_CXX11
// To be able to use `TiledEvaluation::Off` in C++03 we need a namespace.
// (Use of an enumeration in a nested name specifier is a C++11 extension).
namespace TiledEvaluation {
#endif
enum TiledEvaluation {
Off = 0, // tiled evaluation is not supported
On = 1, // still a work in progress (see TensorBlockV2.h)
Legacy = 2 // soon to be deprecated (see TensorBlock.h)
};
#if !EIGEN_HAS_CXX11
} // namespace TiledEvaluation
#endif
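For context (an editorial illustration, not part of the commit): the namespace wrapper above exists so that the qualified spelling `TiledEvaluation::Off` compiles in both language modes. Under C++11 an unscoped enum name may appear as a nested-name-specifier directly; under C++03 the same tokens resolve through the wrapping namespace, so only the name of the enum type itself differs, which is what the typedef inside IsTileable below papers over.

#if EIGEN_HAS_CXX11
typedef Eigen::internal::TiledEvaluation TilingMode;                   // plain unscoped enum
#else
typedef Eigen::internal::TiledEvaluation::TiledEvaluation TilingMode;  // enum nested in the namespace
#endif
// Same enumerator spelling in both language modes (hypothetical usage):
static const TilingMode kMode = Eigen::internal::TiledEvaluation::Legacy;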
template <typename Device, typename Expression>
struct IsTileable {
#if !EIGEN_HAS_CXX11
typedef TiledEvaluation::TiledEvaluation TiledEvaluation;
#endif
// Check that block evaluation is supported and it's a preferred option (at
// least one sub-expression has much faster block evaluation, e.g.
// broadcasting).
static const bool value = TensorEvaluator<Expression, Device>::BlockAccess &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const bool BlockAccess =
TensorEvaluator<Expression, Device>::BlockAccess &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const bool BlockAccessV2 =
TensorEvaluator<Expression, Device>::BlockAccessV2 &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const TiledEvaluation value =
BlockAccessV2
? TiledEvaluation::On
: (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
};
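A brief sketch of what the trait reports (a hypothetical query, not part of the commit): an evaluator that prefers block access and implements the V2 API yields TiledEvaluation::On, one that prefers it but only implements the V1 API yields Legacy, and everything else yields Off. A plain Tensor supports block access but does not prefer it, so the trait is expected to report Off here:

// C++11 spelling; the expression type is an assumption chosen for illustration.
static const Eigen::internal::TiledEvaluation kPlainTensorTiling =
    Eigen::internal::IsTileable<Eigen::DefaultDevice, Eigen::Tensor<float, 3> >::value;
// kPlainTensorTiling == Eigen::internal::TiledEvaluation::Off (expected)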
#if EIGEN_HAS_CXX11
template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
#else
template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
bool Tileable = IsTileable<Device, Expression>::value>
TiledEvaluation::TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
#endif
// TODO(ezhulenev): Add TiledEvaluation support to async executor.
template <typename Expression, typename Device, typename DoneCallback,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
bool Tileable = IsTileable<Device, Expression>::value>
bool Tileable = IsTileable<Device, Expression>::BlockAccess>
class TensorAsyncExecutor;
} // end namespace internal
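To round out the picture (a sketch mirroring the async tests later in this diff, not part of the commit): TensorAsyncExecutor is still parameterized on a plain bool Tileable, as the TODO above notes, so callers pass true/false rather than a TiledEvaluation mode. Thread count, shapes, and the unary op are illustrative assumptions.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

using namespace Eigen;

static void async_assign_sketch() {
  ThreadPool pool(4);                  // illustrative thread count
  ThreadPoolDevice device(&pool, 4);

  Tensor<float, 2> src(64, 64), dst(64, 64);
  src.setRandom();

  const auto expr = src.abs();

  Eigen::Barrier done(1);
  auto on_done = [&done]() { done.Notify(); };

  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
  using DoneCallback = decltype(on_done);
  using Executor = internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice,
                                                 DoneCallback,
                                                 /*Vectorizable=*/true,
                                                 /*Tileable=*/true>;

  Executor::runAsync(Assign(dst, expr), device, on_done);
  done.Wait();  // runAsync returns immediately; wait until evaluation completes
}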

View File

@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -105,6 +106,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
typedef internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
TensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_device(device), m_generator(op.generator())
{

View File

@ -232,6 +232,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@ -241,6 +242,10 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
typedef internal::TensorBlock<Scalar, Index, NumDims, Layout>
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
: m_device(device), m_impl(op.expression(), device)
{

View File

@ -93,12 +93,17 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_strides(op.strides())
{

View File

@ -120,12 +120,17 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
@ -195,11 +200,16 @@ template<typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }

View File

@ -121,6 +121,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
TensorEvaluator<ArgType, Device>::RawAccess &&
NumInputDims > 0 && NumOutputDims > 0,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -137,6 +138,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
Layout>
OutputTensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
{
@ -363,6 +368,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -378,6 +384,10 @@ template<typename NewDimensions, typename ArgType, typename Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(index);
@ -532,6 +542,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@ -543,6 +554,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
{
@ -813,6 +828,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@ -824,6 +840,10 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
@ -1002,11 +1022,16 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_device(device),
@ -1179,12 +1204,17 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }

View File

@ -99,12 +99,17 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
{

View File

@ -97,12 +97,17 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{

View File

@ -585,6 +585,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
IsAligned = false,
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -598,6 +599,10 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
InputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool RunningFullReduction = (NumOutputDims==0);

View File

@ -142,12 +142,17 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
}
@ -374,12 +379,17 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
: m_ref(m)
{ }
@ -423,10 +433,15 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
{ }

View File

@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -128,6 +129,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_impl(op.expression(), device),
@ -400,6 +405,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -414,6 +420,10 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }

View File

@ -100,12 +100,17 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_impl(op.expression(), device),

View File

@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@ -129,6 +130,10 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_device(device),
@ -426,6 +431,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@ -438,6 +444,10 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
TensorBlockWriter;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }

View File

@ -115,12 +115,17 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{

View File

@ -98,12 +98,17 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_traceDim(1), m_device(device)
{

View File

@ -184,12 +184,17 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
m_impl(op.expression(), device)
{

View File

@ -119,6 +119,7 @@ ei_add_test(cxx11_tensor_dimension)
ei_add_test(cxx11_tensor_map)
ei_add_test(cxx11_tensor_assign)
ei_add_test(cxx11_tensor_block_access)
ei_add_test(cxx11_tensor_broadcasting)
ei_add_test(cxx11_tensor_comparisons)
ei_add_test(cxx11_tensor_forced_eval)
ei_add_test(cxx11_tensor_math)
@ -188,8 +189,9 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_of_complex)
ei_add_test(cxx11_tensor_of_strings)
ei_add_test(cxx11_tensor_lvalue)
ei_add_test(cxx11_tensor_broadcasting)
ei_add_test(cxx11_tensor_chipping)
ei_add_test(cxx11_tensor_block_eval)
ei_add_test(cxx11_tensor_block_io)
ei_add_test(cxx11_tensor_concatenation)
ei_add_test(cxx11_tensor_inflation)
ei_add_test(cxx11_tensor_morphing)

View File

@ -91,7 +91,16 @@ static void test_vectorized_broadcasting()
}
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
tensor.resize(11,3,5);
#else
array<Index, 3> new_dims;
new_dims[0] = 11;
new_dims[1] = 3;
new_dims[2] = 5;
tensor.resize(new_dims);
#endif
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);
@ -139,7 +148,16 @@ static void test_static_broadcasting()
}
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
tensor.resize(11,3,5);
#else
array<Index, 3> new_dims;
new_dims[0] = 11;
new_dims[1] = 3;
new_dims[2] = 5;
tensor.resize(new_dims);
#endif
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);

View File

@ -16,6 +16,7 @@
using Eigen::Tensor;
using Eigen::RowMajor;
using Eigen::ColMajor;
using Eigen::internal::TiledEvaluation;
// A set of tests to verify that different TensorExecutor strategies yield the
// same results for all the ops supporting tiled evaluation.
@ -30,7 +31,7 @@ static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_unary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -47,7 +48,7 @@ static void test_execute_unary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -58,7 +59,7 @@ static void test_execute_unary_expr(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_binary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -78,7 +79,7 @@ static void test_execute_binary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -89,7 +90,7 @@ static void test_execute_binary_expr(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting(Device d)
{
static constexpr int Options = 0 | Layout;
@ -111,7 +112,7 @@ static void test_execute_broadcasting(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -121,7 +122,7 @@ static void test_execute_broadcasting(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_rvalue(Device d)
{
auto dims = RandomDims<NumDims>(1, 10);
@ -140,7 +141,7 @@ static void test_execute_chipping_rvalue(Device d)
\
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
using Executor = internal::TensorExecutor<const Assign, Device, \
Vectorizable, Tileable>; \
Vectorizable, Tiling>; \
\
Executor::run(Assign(dst, expr), d); \
\
@ -160,7 +161,7 @@ static void test_execute_chipping_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_chipping_lvalue(Device d)
{
auto dims = RandomDims<NumDims>(1, 10);
@ -193,7 +194,7 @@ static void test_execute_chipping_lvalue(Device d)
\
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
using Executor = internal::TensorExecutor<const Assign, Device, \
Vectorizable, Tileable>; \
Vectorizable, Tiling>; \
\
Executor::run(Assign(expr, src), d); \
\
@ -213,7 +214,7 @@ static void test_execute_chipping_lvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_rvalue(Device d)
{
static constexpr int Options = 0 | Layout;
@ -239,7 +240,7 @@ static void test_execute_shuffle_rvalue(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -249,7 +250,7 @@ static void test_execute_shuffle_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_shuffle_lvalue(Device d)
{
static constexpr int Options = 0 | Layout;
@ -278,7 +279,7 @@ static void test_execute_shuffle_lvalue(Device d)
using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(expr, src), d);
@ -288,7 +289,7 @@ static void test_execute_shuffle_lvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_reduction(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -320,7 +321,7 @@ static void test_execute_reduction(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -330,7 +331,7 @@ static void test_execute_reduction(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_reshape(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -360,7 +361,7 @@ static void test_execute_reshape(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -370,7 +371,7 @@ static void test_execute_reshape(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_slice_rvalue(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -400,7 +401,7 @@ static void test_execute_slice_rvalue(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -410,7 +411,7 @@ static void test_execute_slice_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_slice_lvalue(Device d)
{
static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
@ -443,7 +444,7 @@ static void test_execute_slice_lvalue(Device d)
using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(expr, slice), d);
@ -453,7 +454,7 @@ static void test_execute_slice_lvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_broadcasting_of_forced_eval(Device d)
{
static constexpr int Options = 0 | Layout;
@ -475,7 +476,7 @@ static void test_execute_broadcasting_of_forced_eval(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -497,7 +498,7 @@ struct DummyGenerator {
};
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_generator_op(Device d)
{
static constexpr int Options = 0 | Layout;
@ -518,7 +519,7 @@ static void test_execute_generator_op(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -528,7 +529,7 @@ static void test_execute_generator_op(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_execute_reverse_rvalue(Device d)
{
static constexpr int Options = 0 | Layout;
@ -553,7 +554,7 @@ static void test_execute_reverse_rvalue(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using Executor =
internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
Executor::run(Assign(dst, expr), d);
@ -563,7 +564,7 @@ static void test_execute_reverse_rvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_async_execute_unary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -584,7 +585,7 @@ static void test_async_execute_unary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using DoneCallback = decltype(on_done);
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
Vectorizable, Tileable>;
Vectorizable, Tiling>;
Executor::runAsync(Assign(dst, expr), d, on_done);
done.Wait();
@ -596,7 +597,7 @@ static void test_async_execute_unary_expr(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
bool Tileable, int Layout>
TiledEvaluation Tiling, int Layout>
static void test_async_execute_binary_expr(Device d)
{
static constexpr int Options = 0 | Layout;
@ -620,7 +621,7 @@ static void test_async_execute_binary_expr(Device d)
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using DoneCallback = decltype(on_done);
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
Vectorizable, Tileable>;
Vectorizable, Tiling>;
Executor::runAsync(Assign(dst, expr), d, on_done);
done.Wait();
@ -640,34 +641,57 @@ static void test_async_execute_binary_expr(Device d)
#define CALL_SUBTEST_PART(PART) \
CALL_SUBTEST_##PART
#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
#define CALL_SUBTEST_COMBINATIONS_V1(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
// NOTE: Tiling V2 is currently implemented only for a limited set of expression types, and only with the default device.
#define CALL_SUBTEST_COMBINATIONS_V2(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::DefaultDevice default_device;
@ -678,69 +702,69 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::ThreadPool tp(num_threads);
Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 3);
CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 4);
CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 5);
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 3);
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(7, test_execute_shuffle_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 2);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 3);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 4);
CALL_SUBTEST_COMBINATIONS(8, test_execute_reduction, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 5);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 5);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 5);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 1);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 5);
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
@ -754,4 +778,3 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
// EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
}
#undef CALL_SUBTEST_COMBINATIONS