Add block evaluation to TensorEvalTo and fix a few small bugs

This commit is contained in:
Eugene Zhulenev 2019-10-07 15:34:26 -07:00
parent 3afb640b56
commit f74ab8cb8d
4 changed files with 95 additions and 33 deletions

View File

@ -111,22 +111,28 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
BlockAccessV2 = false,
BlockAccessV2 = true,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
};
typedef typename internal::TensorBlock<
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlock;
typedef typename internal::TensorBlockReader<
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlockReader;
static const int NumDims = internal::traits<ArgType>::NumDimensions;
typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout> TensorBlock;
typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout> TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
ArgTensorBlock;
typedef internal::TensorBlockAssignment<
Scalar, NumDims, typename ArgTensorBlock::XprType, Index>
TensorBlockAssignment;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@ -164,6 +170,30 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
m_impl.block(&eval_to_block);
}
// Evaluates a single block of the wrapped expression directly into
// `m_buffer`. The buffer is first registered as a candidate destination on
// the block descriptor, so the argument evaluator may materialize its
// result in-place; an explicit block assignment is performed only when the
// argument block was produced elsewhere.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
// Add `m_buffer` as destination buffer to the block descriptor.
desc.AddDestinationBuffer(
/*dst_base=*/m_buffer + desc.offset(),
/*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()),
/*total_dst_bytes=*/
(internal::array_prod(m_impl.dimensions())
* sizeof(Scalar)));
// Ask the argument evaluator for the block; it may have written straight
// into the destination buffer registered above.
ArgTensorBlock block = m_impl.blockV2(desc, scratch);
// If block was evaluated into a destination buffer, there is no need to do
// an assignment.
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
TensorBlockAssignment::Run(
TensorBlockAssignment::target(
desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
m_buffer, desc.offset()),
block.expr());
}
// Release any per-block resources owned by the argument block.
block.cleanup();
}
// Releases resources held by the wrapped argument evaluator. (`m_buffer`
// itself is not touched here -- presumably it is owned by the caller that
// provided it; verify against the constructor outside this view.)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}

View File

@ -238,7 +238,8 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
typedef internal::TensorBlockDescriptor<NumDims> TensorBlockDesc;
typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<DefaultDevice>
TensorBlockScratch;

View File

@ -231,7 +231,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
eigen_assert(m_impl.data() != NULL);
// If one of the dimensions is zero, return empty block view.
if (desc.size() == 0) {
return TensorBlockV2(internal::TensorBlockKind::kView, NULL,
desc.dimensions());
}
// Check if we can reuse `desc` destination, or allocate new scratch buffer.
ScalarNoConst* materialized_output =
@ -385,6 +389,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index out = output_offset + output_inner_pad_before_size;
const Index in = input_offset + output_inner_pad_before_size;
eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
LinCopy::template Run<LinCopy::Kind::Linear>(
typename LinCopy::Dst(out, 1, materialized_output),
typename LinCopy::Src(in, 1, m_impl.data()),

View File

@ -131,6 +131,7 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
// TensorEvaluator is needed to produce tensor blocks of the expression.
auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d);
eval.evalSubExprsIfNeeded(nullptr);
// Choose random offsets and sizes, and build a TensorBlockDescriptor.
TensorBlockParams<NumDims> block_params = gen_block();
@ -266,29 +267,6 @@ static void test_eval_tensor_reshape() {
[&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
}
template <typename T, int Layout>
static void test_eval_tensor_reshape_with_bcast() {
Index dim = internal::random<Index>(1, 100);
Tensor<T, 2, Layout> lhs(1, dim);
Tensor<T, 2, Layout> rhs(dim, 1);
lhs.setRandom();
rhs.setRandom();
auto reshapeLhs = NByOne(dim);
auto reshapeRhs = OneByM(dim);
auto bcastLhs = OneByM(dim);
auto bcastRhs = NByOne(dim);
DSizes<Index, 2> dims(dim, dim);
VerifyBlockEvaluator<T, 2, Layout>(
lhs.reshape(reshapeLhs).broadcast(bcastLhs) +
rhs.reshape(reshapeRhs).broadcast(bcastRhs),
[dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
}
template <typename T, int NumDims, int Layout>
static void test_eval_tensor_cast() {
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
@ -355,6 +333,52 @@ static void test_eval_tensor_padding() {
[&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
}
template <typename T, int Layout>
static void test_eval_tensor_reshape_with_bcast() {
Index dim = internal::random<Index>(1, 100);
Tensor<T, 2, Layout> lhs(1, dim);
Tensor<T, 2, Layout> rhs(dim, 1);
lhs.setRandom();
rhs.setRandom();
auto reshapeLhs = NByOne(dim);
auto reshapeRhs = OneByM(dim);
auto bcastLhs = OneByM(dim);
auto bcastRhs = NByOne(dim);
DSizes<Index, 2> dims(dim, dim);
VerifyBlockEvaluator<T, 2, Layout>(
lhs.reshape(reshapeLhs).broadcast(bcastLhs) +
rhs.reshape(reshapeRhs).broadcast(bcastRhs),
[dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
}
template <typename T, int Layout>
static void test_eval_tensor_forced_eval() {
Index dim = internal::random<Index>(1, 100);
Tensor<T, 2, Layout> lhs(dim, 1);
Tensor<T, 2, Layout> rhs(1, dim);
lhs.setRandom();
rhs.setRandom();
auto bcastLhs = OneByM(dim);
auto bcastRhs = NByOne(dim);
DSizes<Index, 2> dims(dim, dim);
VerifyBlockEvaluator<T, 2, Layout>(
(lhs.broadcast(bcastLhs) + rhs.broadcast(bcastRhs)).eval().reshape(dims),
[dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
VerifyBlockEvaluator<T, 2, Layout>(
(lhs.broadcast(bcastLhs) + rhs.broadcast(bcastRhs)).eval().reshape(dims),
[dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); });
}
// -------------------------------------------------------------------------- //
// Verify that assigning a block to a Tensor expression produces the same result
// as an assignment to TensorSliceOp (writing a block is identical to
@ -482,6 +506,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding);
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast);
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval);
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor);
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape);