From 0ab1f8ec03a920b01d19061ca97eaf05b81f9fb1 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Thu, 21 Oct 2021 19:57:00 -0700
Subject: [PATCH] Fix broadcasting oob error.

For vectorized 1-dimensional inputs that do not take the special
blocking path (e.g. `std::complex<...>`), there was an
index-out-of-bounds error causing the broadcast size to be
computed incorrectly.  Here we fix this, and make other minor
cleanup changes.

Fixes #2351.


(cherry picked from commit a500da1dc089b08e2f2b3b05a2eb23194425460e)
---
 .../CXX11/src/Tensor/TensorBroadcasting.h     | 59 +++++++++----------
 .../test/cxx11_tensor_broadcasting.cpp        | 18 ++++++
 2 files changed, 46 insertions(+), 31 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index a354132f6..8d8ad2658 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -127,7 +127,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
   typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
 
   typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
@@ -144,7 +144,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   {
 
     // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
-    // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
+    // and store the result in a scalar. Instead one should reshape the scalar into a N-D
     // tensor with N >= 1 of 1 element first and then broadcast.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const InputDimensions& input_dims = m_impl.dimensions();
@@ -410,25 +410,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
   {
+    // Consider the flattened tensor [v0, ..., vN],
+    // Concatenates m_broadcast[dim] copies,
+    //    [v0, ..., vN, v0, ..., vN, ... ]
+    // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-    Index dim, inputIndex;
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      dim = NumDims - 1;
-    } else {
-      dim = 0;
-    }
-
-    inputIndex = index % m_inputStrides[dim];
-    if (inputIndex + PacketSize <= m_inputStrides[dim]) {
+    // Size of flattened tensor.
+    const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
+                      m_inputStrides[NumDims - 1] : m_inputStrides[0];
+    Index inputIndex = index % M;
+    if (inputIndex + PacketSize <= M) {
       return m_impl.template packet<Unaligned>(inputIndex);
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
-        if (inputIndex > m_inputStrides[dim]-1) {
+        if (inputIndex > M - 1) {
           inputIndex = 0;
         }
         values[i] = m_impl.coeff(inputIndex++);
@@ -440,32 +439,30 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
   {
+    // Consider the flattened tensor [v0, ..., vN],
+    // Interleaves m_broadcast[dim] copies,
+    //    [v0, v0, ..., v1, v1, ..., vN, vN, ... ]
+    // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+    eigen_assert(index + PacketSize-1 < dimensions().TotalSize());
 
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    Index dim, inputIndex, outputOffset;
+    const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
+                      m_broadcast[0] : m_broadcast[NumDims - 1];
 
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      dim = 1;
-    } else {
-      dim = NumDims - 2;
-    }
-
-    inputIndex   = index / m_outputStrides[dim];
-    outputOffset = index % m_outputStrides[dim];
-    if (outputOffset + PacketSize <= m_outputStrides[dim]) {
-      values[0] = m_impl.coeff(inputIndex);
-      return internal::pload1<PacketReturnType>(values);
+    Index inputIndex   = index / M;
+    Index outputOffset = index % M;
+    if (outputOffset + PacketSize <= M) {
+      return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
     } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       EIGEN_UNROLL_LOOP
-      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
-        if (outputOffset + cur < m_outputStrides[dim]) {
+      for (int i = 0; i < PacketSize; ++i) {
+        if (outputOffset < M) {
           values[i] = m_impl.coeff(inputIndex);
+          ++outputOffset;
         } else {
-          values[i] = m_impl.coeff(++inputIndex);
           outputOffset = 0;
-          cur = 0;
+          values[i] = m_impl.coeff(++inputIndex);
         }
       }
       return internal::pload<PacketReturnType>(values);
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index d3dab891f..cbd92c328 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -256,6 +256,22 @@ static void test_simple_broadcasting_n_by_one()
   }
 }
 
+template <int DataLayout>
+static void test_size_one_broadcasting()
+{
+  Tensor<float, 1, DataLayout> tensor(1);
+  tensor.setRandom();
+  array<ptrdiff_t, 1> broadcasts = {64};
+  Tensor<float, 1, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), broadcasts[0]);
+
+  for (int i = 0; i < broadcasts[0]; ++i) {
+    VERIFY_IS_EQUAL(tensor(0), broadcast(i));
+  }
+}
+
 template <int DataLayout>
 static void test_simple_broadcasting_one_by_n_by_one_1d()
 {
@@ -328,4 +344,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting)
   CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
   CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
   CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
+  CALL_SUBTEST(test_size_one_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_size_one_broadcasting<RowMajor>());
 }