// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2;
// -------------------------------------------------------------------------- //
// Helper function to compute strides for densely stored buffer of given
// dimensions.

// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  // TODO(ezhulenev): Use templates to unroll this loop (similar to
  // h_array_reduce in CXX11meta.h)? Benchmark it.
  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}

#if EIGEN_HAS_CXX11
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
#endif
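// Illustrative sketch (not part of the original header): for a 2x3x4 buffer
// the helpers above produce the following dense strides.
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   // ColMajor (first dimension is innermost):  strides<ColMajor>(dims) == [1, 2, 6]
//   // RowMajor (last dimension is innermost):   strides<RowMajor>(dims) == [12, 4, 1]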
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.

template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment, and the expression on the left already
  // has a memory buffer, then we can do a performance optimization: evaluate
  // the root expression directly into that memory, or use it as temporary
  // storage for some of the subexpressions, to avoid dynamic memory allocation.
  //
  // This storage is type erased, because passing the Scalar type through all
  // the expression evaluation layers requires way too many templates. It also
  // makes it possible to use this destination as a temporary buffer for
  // materializing expressions whose type does not match the final output.
  class DestinationBuffer {
   public:
    template <typename Scalar>
    Scalar* data() const {
      return static_cast<Scalar*>(m_data);
    }

    template <typename Scalar>
    Dimensions dimensions() const {
      Dimensions dimensions;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
      }
      return dimensions;
    }

    template <typename Scalar>
    Dimensions strides() const {
      Dimensions strides;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
        strides[i] = m_strides[i] / sizeof(Scalar);
      }
      return strides;
    }

    // Returns true if the tensor block corresponding to `desc` fits into the
    // contiguous block of memory defined by `*this`.
    template <typename Scalar, int Layout>
    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
      if (m_data == NULL) return false;

      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& dst_dims = dimensions<Scalar>();

      if (!dimensions_match(desc_dims, dst_dims)) return false;

      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      const Dimensions& dst_strides = strides<Scalar>();

      // Compare strides ignoring dimensions of size `1`.
      for (int i = 0; i < NumDims; ++i) {
        if (desc_dims[i] == 1) continue;
        if (desc_strides[i] != dst_strides[i]) return false;
      }

      return true;
    }

   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
                      const Dimensions& strides, size_t total_dst_bytes)
        : m_data(static_cast<void*>(data)),
          m_dimensions(dimensions),
          m_strides(strides),
          m_total_dst_bytes(total_dst_bytes) {
      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
      for (int i = 0; i < NumDims; ++i) {
        m_dimensions[i] *= sizeof(Scalar);
        m_strides[i] *= sizeof(Scalar);
      }
    }

    void* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;

    // Total size of the memory buffer at the destination (typically the total
    // size of the left hand side of an assignment expression). This can be the
    // same as `array_prod(m_dimensions)` if the assignment target has just a
    // single block, but typically it's a larger number.
    size_t m_total_dst_bytes;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  template <typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
                            size_t total_dst_bytes) {
    m_destination =
        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
  }

  template <typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
      size_t total_dst_bytes) {
    // DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    return *this;
  }

  bool HasDestinationBuffer() const { return m_destination.m_data != NULL; }

  const DestinationBuffer& GetDestinationBuffer() const {
    return m_destination;
  }

  // Returns a non-null pointer to the destination buffer memory if this
  // block has a contiguous destination buffer.
  template <typename Scalar, int Layout>
  Scalar* destination() const {
    if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
      return m_destination.template data<Scalar>();
    }
    return NULL;
  }

  // Returns a copy of `*this` with updated offset.
  TensorBlockDescriptor WithOffset(IndexType offset) const {
    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
  }

 private:
  // Offset and dimensions are immutable after construction. Block descriptor
  // can only be mutated by adding or dropping the destination.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
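// Illustrative sketch (not part of the original header): a descriptor for a
// 4x8 block at linear offset 32; `dst_data` and `total_dst_bytes` are
// hypothetical and stand for the destination buffer of the assignment.
//
//   typedef TensorBlockDescriptor<2> BlockDesc;
//   BlockDesc::Dimensions block_dims(4, 8);
//   BlockDesc desc(/*offset=*/32, block_dims);
//
//   // Optionally let the evaluator materialize directly into the destination.
//   desc.AddDestinationBuffer(dst_data, internal::strides<ColMajor>(block_dims),
//                             total_dst_bytes);
//   float* direct = desc.destination<float, ColMajor>();  // NULL if it doesn't fit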
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// Eigen expression traversal order is deterministic, all temporary allocations
// happen in the same order, and usually have exactly the same size.
// The scratch allocator keeps track of all dynamic allocations, and after the
// first block evaluation is completed, we should be able to reuse all the
// temporary buffers for the next block evaluation.

template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If we have an existing allocation, and its size is larger than or equal
    // to the requested size, we do nothing.

    // If the current allocation can't fit the requested size, we deallocate it,
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};
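// Illustrative sketch (not part of the original header): the intended pattern
// is to `allocate` temporaries while evaluating one block, then `reset` so the
// next block reuses the same allocations. `block_bytes` is a hypothetical size.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   // First block: the buffer is allocated on the device.
//   void* tmp = scratch.allocate(block_bytes);
//   // ... materialize the block into `tmp` ...
//   scratch.reset();
//
//   // Next block: the same buffer is returned again, with no new allocation
//   // as long as the requested size does not grow.
//   void* tmp2 = scratch.allocate(block_bytes);  // tmp2 == tmp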
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
#if !EIGEN_HAS_CXX11
// To be able to use `TensorBlockKind::kExpr` in C++03 we need a namespace.
// (Use of an enumeration in a nested name specifier is a C++11 extension).
namespace TensorBlockKind {
#endif
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination using TensorBlockAssign.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. it can be a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer. For example if the left side of an assignment is a Tensor, we can
  // directly materialize the block in the destination memory.
  //
  // If strides in the output buffer do not match tensor block strides, the
  // Tensor expression will be invalid, and should not be used by
  // TensorBlockAssign or for constructing another block expression.
  kMaterializedInOutput
};
#if !EIGEN_HAS_CXX11
}  // namespace TensorBlockKind
#endif
// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.

class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from Eigen expressions (if the expression
// type is not void). It's required to be able to define a lazy block
// expression for argument types that do not support block evaluation.

template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};
// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer. It's either a memory
// buffer that backs the original expression (e.g. the block is just a view
// into a Tensor), or a memory buffer allocated with the scratch allocator; in
// the latter case the scratch allocator will deallocate it at the end of
// block based expression execution.
//
// If the block was evaluated directly into the output buffer, and strides in
// the output buffer do not match block strides, the TensorMap expression will
// be invalid, and should never be used in block assignment or any other tensor
// expression.

template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions, bool valid_expr = true)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions),
        m_valid_expr(valid_expr) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }
  // NOTE(ezhulenev): Returning XprType by value like in other block types
  // causes asan failures. The theory is that XprType::Nested doesn't work
  // properly for TensorMap.
  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // Creates a materialized block for the given descriptor from a memory buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the tensor block dimensions cover a contiguous block of the underlying
    // memory, we can skip the block buffer memory allocation, and construct a
    // block from the existing `data` memory buffer.
    //
    // Example: (RowMajor layout)
    //   data_dims:          [11, 12, 13, 14]
    //   desc.dimensions():  [ 1,  1,  3, 14]
    //
    // In this case we can construct a TensorBlock starting at
    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions must be of size `1`, except a single dimension
    // before the matching inner dimension (`3` in the example above).
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      // Try to reuse destination as an output block buffer.
      Scalar* block_buffer = desc.template destination<Scalar, Layout>();
      bool materialized_in_output;

      if (block_buffer != NULL) {
        desc.DropDestinationBuffer();
        materialized_in_output = true;

      } else {
        materialized_in_output = false;
        void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
        block_buffer = static_cast<Scalar*>(mem);
      }

      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(desc.dimensions(),
                           internal::strides<Layout>(desc.dimensions()),
                           block_buffer);

      TensorBlockIO::Copy(dst, src);

      return TensorMaterializedBlock(
          materialized_in_output
              ? internal::TensorBlockKind::kMaterializedInOutput
              : internal::TensorBlockKind::kMaterializedInScratch,
          block_buffer, desc.dimensions());
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
  bool m_valid_expr;
};
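// Illustrative sketch (not part of the original header): materializing a block
// of a plain float tensor. `tensor_data` and `tensor_dims` are hypothetical.
// If the requested block is contiguous in `tensor_data`, the result is just a
// `kView` into it; otherwise the block is copied into the destination buffer
// (if the descriptor has one) or into scratch memory.
//
//   typedef TensorMaterializedBlock<float, 3, ColMajor> Block;
//   typedef Block::TensorBlockDesc BlockDesc;
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   BlockDesc desc(/*offset=*/0, BlockDesc::Dimensions(16, 4, 1));
//   Block block = Block::materialize(tensor_data, tensor_dims, desc, scratch);
//   const float* block_data = block.data();
//   block.cleanup();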
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies the
// UnaryOp functor to the blocks produced by the underlying Tensor expression.

template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
      type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};
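// Illustrative sketch (not part of the original header): `ArgBlock` stands for
// the block type produced by an argument evaluator (hypothetical). The wrapper
// only rebuilds the unary expression on top of the argument's block expression;
// nothing is materialized.
//
//   typedef TensorCwiseUnaryBlock<scalar_abs_op<float>, ArgBlock> AbsBlock;
//   AbsBlock abs_block(arg_block, scalar_abs_op<float>());
//   // abs_block.kind() == TensorBlockKind::kExpr, abs_block.data() == NULL,
//   // and abs_block.expr() is a TensorCwiseUnaryOp over arg_block.expr().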
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies the
// BinaryOp functor to the blocks produced by the underlying Tensor expressions.

template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is a
// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).

template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename ArgTensorBlock::XprType ArgXprType;
  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.

template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                       internal::is_void<Arg2XprType>::value ||
                                       internal::is_void<Arg3XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.

template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  // Specifying the linear copy kind statically gives ~30% speedup for small sizes.
  enum Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == Linear) {
      // ******************************************************************** //
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == Scatter) {
      // Scatter from `src` to `dst`.
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }
      // ******************************************************************** //
    } else if (kind == FillLinear) {
      // Fill `dst` with the value at `*src`.
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }
      // ******************************************************************** //
    } else if (kind == FillScatter) {
      // Scatter `*src` into `dst`.
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }
      // ******************************************************************** //
    } else if (kind == Gather) {
      // Gather from `src` into `dst`.
      eigen_assert(dst_stride == 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }
      // ******************************************************************** //
    } else if (kind == Random) {
      // Random.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};
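// Illustrative sketch (not part of the original header): copying 256 floats
// from a contiguous source into a destination with stride 3 uses the `Scatter`
// specialization (src_stride == 1, dst_stride != 1). `src_ptr` and `dst_ptr`
// are hypothetical buffers of sufficient size.
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copier;
//   Copier::Run<Copier::Scatter>(
//       Copier::Dst(/*offset=*/0, /*stride=*/3, dst_ptr),
//       Copier::Src(/*offset=*/0, /*stride=*/1, src_ptr),
//       /*count=*/256);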
// -------------------------------------------------------------------------- //
// TensorBlockIOV2 copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. The dimensions of `dst` specify how many elements have to be
// copied; for `src` we only need to know the strides to navigate through the
// source memory buffer.

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2 {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using the provided dimensions mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  // Returns the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case with stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
    // block, and we write data linearly into that dimension, reading it from
    // the src. If dimensions are reordered, we might end up reading data from
    // the src with `stride != 1`.
    //
    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dim.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
    const int dst_stride1_dim = IsColMajor
                                    ? num_size_one_inner_dims
                                    : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize the block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from src to dst.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                           \
  IndexType num_copied = 0;                                            \
  for (num_copied = 0; num_copied < block_total_size;                  \
       num_copied += dst_inner_dim_size) {                             \
    LinCopy::template Run<KIND>(                                       \
        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
        typename LinCopy::Src(input_offset, input_stride, src.data),   \
        dst_inner_dim_size);                                           \
                                                                       \
    for (int j = 0; j < idx; ++j) {                                    \
      if (++it[j].count < it[j].size) {                                \
        input_offset += it[j].input_stride;                            \
        output_offset += it[j].output_stride;                          \
        break;                                                         \
      }                                                                \
      it[j].count = 0;                                                 \
      input_offset -= it[j].input_span;                                \
      output_offset -= it[j].output_span;                              \
    }                                                                  \
  }                                                                    \
  return num_copied;

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
  // the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
                                                              const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    return Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only if
  // they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
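// Illustrative sketch (not part of the original header): copying a 2-d block
// between two buffers while swapping the dimension order (a transpose-like
// src->dst dimension map). `src_ptr` and `dst_ptr` are hypothetical buffers:
// the source is a 4x8 column-major buffer, the destination an 8x4 one.
//
//   typedef TensorBlockIOV2<float, Eigen::Index, 2, ColMajor> BlockIO;
//
//   BlockIO::Dimensions dst_dims(8, 4);
//   BlockIO::Dimensions src_dims(4, 8);
//   BlockIO::Dst dst(dst_dims, internal::strides<ColMajor>(dst_dims), dst_ptr);
//   BlockIO::Src src(internal::strides<ColMajor>(src_dims), src_ptr);
//
//   BlockIO::DimensionsMap dst_to_src_map;
//   dst_to_src_map[0] = 1;  // dst dimension 0 reads from src dimension 1
//   dst_to_src_map[1] = 0;  // dst dimension 1 reads from src dimension 0
//   BlockIO::Copy(dst, src, dst_to_src_map);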
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see the definition above).
//
// Also, currently the innermost dimension of `target` must have a stride of '1'
// (be contiguous in memory). This restriction could be lifted with a `pscatter`,
// but in practice it's never needed, and there is a similar TensorBlockIO
// workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    // Prepare an evaluator for the block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match the destination dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize the output inner dimension size based on the layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize the output block iterator state. Dimensions in this array are
    // always in innermost -> outermost order (column-major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read the block expression from the beginning, and start writing data
    // to `target` at the given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at the current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move the input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};
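// Illustrative sketch (not part of the original header): assigning a lazy block
// expression into a destination buffer. `BlockExpr` stands for the XprType of
// some tensor block (e.g. the type returned by a block's expr()), `block_expr`
// is an instance of it, and `dst_ptr` is a hypothetical destination buffer.
//
//   typedef TensorBlockAssignment<float, 2, BlockExpr> Assignment;
//
//   DSizes<Eigen::Index, 2> dims(4, 8);
//   Assignment::Run(
//       Assignment::target(dims, internal::strides<ColMajor>(dims), dst_ptr),
//       block_expr);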
// -------------------------------------------------------------------------- //

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H