Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-06-20 22:52:51 +08:00)

commit b24fe22b1a (parent 2959045f2f)

Improved the performance of the tensor convolution code by a factor of about 4.
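The change rewrites the evaluator for TensorConvolutionOp. Three things stand out in the diff below: the kernel is flattened once into a contiguous buffer (preloadKernel()), so the inner loop reads m_kernel[kernel] instead of re-evaluating the kernel expression for every tap of every output coefficient; PacketAccess is turned on and a real packet() path is added, computing PacketSize outputs at a time with internal::pmadd whenever a packet of output indices maps to contiguous input offsets; and the recursion in convolve() now counts DimIndex down to 0, so that dimension 0 — the contiguous one in column-major storage — forms the innermost loop.

For context, a minimal sketch of the user-facing API this evaluator backs (assuming a C++11 compiler and the unsupported Tensor module; the dimension/array spellings have varied across Eigen versions):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> input(20, 30);
  Eigen::Tensor<float, 2> kernel(3, 3);
  input.setRandom();
  kernel.setRandom();

  // Convolve along both axes; each convolved dimension shrinks to
  // input_dim - kernel_dim + 1, so the result is 18 x 28.
  Eigen::array<ptrdiff_t, 2> dims{{0, 1}};
  Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
  std::cout << output(0, 0) << '\n';
}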
@@ -226,22 +226,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   enum {
     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
-    PacketAccess = /*TensorEvaluator<InputArgType>::PacketAccess & TensorEvaluator<KernelArgType>::PacketAccess */
-    false,
+    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
   };

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernel(NULL), m_kernelArg(op.kernelExpression()), m_local_kernel(false), m_device(device)
   {
     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
-      } else {
-        m_inputStride[0] = 1;
-      }
+    m_inputStride[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
     }

     m_dimensions = m_inputImpl.dimensions();
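Both stride loops in the constructor now hoist the i == 0 base case out of the loop instead of branching on it every iteration. A standalone illustration of the column-major stride recurrence they compute (illustrative names; the real code writes into m_inputStride / m_outputStride):

#include <cstddef>
#include <iostream>

int main() {
  const int NumDims = 3;
  const std::ptrdiff_t dims[NumDims] = {4, 5, 6};  // column-major tensor
  std::ptrdiff_t stride[NumDims];
  stride[0] = 1;                                   // hoisted base case
  for (int i = 1; i < NumDims; ++i)
    stride[i] = stride[i - 1] * dims[i - 1];
  // stride = {1, 4, 20}: coefficient (i, j, k) lives at i + 4*j + 20*k
  for (int i = 0; i < NumDims; ++i) std::cout << stride[i] << ' ';
  std::cout << '\n';
}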
@@ -251,7 +247,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
       const Index kernel_dim = kernel_dims[i];
       const Index result_dim = input_dim - kernel_dim + 1;
       m_dimensions[index] = result_dim;
-
       if (i > 0) {
         m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
       } else {
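The output shape follows the valid-convolution rule visible above, result_dim = input_dim - kernel_dim + 1: for example, a dimension of size 5 convolved with a kernel of size 3 yields 5 - 3 + 1 = 3 output coefficients (no padding).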
@@ -260,16 +255,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
       m_indexStride[i] = m_inputStride[index];
     }

-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
-      } else {
-        m_outputStride[0] = 1;
-      }
+    m_outputStride[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
     }
   }

-
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
@@ -278,57 +269,126 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_inputImpl.evalSubExprsIfNeeded(NULL);
-    m_kernelImpl.evalSubExprsIfNeeded(NULL);
+    preloadKernel();
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_inputImpl.cleanup();
-    m_kernelImpl.cleanup();
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
   }

-  void evalTo(typename XprType::Scalar* buffer) const {
+  void evalTo(typename XprType::Scalar* buffer) {
+    evalSubExprsIfNeeded(NULL);
     for (int i = 0; i < dimensions().TotalSize(); ++i) {
       buffer[i] += coeff(i);
     }
+    cleanup();
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    Index startInput = 0;
-    for (int i = NumDims - 1; i >= 0; --i) {
-      const Index idx = index / m_outputStride[i];
-      startInput += idx * m_inputStride[i];
-      index -= idx * m_outputStride[i];
-    }
-
     CoeffReturnType result = CoeffReturnType(0);
-    convolve(startInput, 0, 0, result);
+    convolve(firstInput(index), 0, NumKernelDims-1, result);
     return result;
   }

-  /* TODO: vectorization
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
   {
-    assert(false);
-  }*/
-
-  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
-    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
-      const Index input = firstIndex + j * m_indexStride[DimIndex];
-      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
-      if (DimIndex < NumKernelDims-1) {
-        convolve(input, kernel, DimIndex+1, accum);
-      } else {
-        accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel);
-      }
+    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    Index indices[2] = {index, index+PacketSize-1};
+    Index startInputs[2] = {0, 0};
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx0 = indices[0] / m_outputStride[i];
+      const Index idx1 = indices[1] / m_outputStride[i];
+      startInputs[0] += idx0 * m_inputStride[i];
+      startInputs[1] += idx1 * m_inputStride[i];
+      indices[0] -= idx0 * m_outputStride[i];
+      indices[1] -= idx1 * m_outputStride[i];
+    }
+    startInputs[0] += indices[0];
+    startInputs[1] += indices[1];
+
+    if (startInputs[1]-startInputs[0] == PacketSize-1) {
+      PacketReturnType result = internal::pset1<PacketReturnType>(0);
+      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+      return result;
+    } else {
+      EIGEN_ALIGN_DEFAULT Scalar data[PacketSize];
+      data[0] = Scalar(0);
+      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+      for (int i = 1; i < PacketSize-1; ++i) {
+        data[i] = Scalar(0);
+        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+      }
+      data[PacketSize-1] = Scalar(0);
+      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+      return internal::pload<PacketReturnType>(data);
     }
   }

   Scalar* data() const { return NULL; }

  private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStride[i];
+      startInput += idx * m_inputStride[i];
+      index -= idx * m_outputStride[i];
+    }
+    startInput += index;
+    return startInput;
+  }
+
+  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolve(input, kernel, DimIndex-1, accum);
+      } else {
+        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+      }
+    }
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolvePacket(input, kernel, DimIndex-1, accum);
+      } else {
+        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }

   // No copy, no assignment
   TensorEvaluator(const TensorEvaluator&);
   TensorEvaluator& operator = (const TensorEvaluator&);
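The scalar path now runs through two small private helpers: firstInput() converts a linear output index into the starting input offset by peeling off one output stride per dimension, and convolve() recurses from the last kernel dimension down to dimension 0, touching memory only in the base case, where it reads the preloaded m_kernel buffer rather than evaluating the kernel expression. A standalone sketch of that recursion (illustrative names and raw buffers, not the real TensorEvaluator plumbing):

#include <cstddef>
#include <iostream>
#include <vector>

struct ConvSketch {
  const float* input;                       // flat input buffer
  const float* kernel;                      // flat, contiguous kernel (cf. preloadKernel)
  std::vector<std::ptrdiff_t> indexStride;  // input stride per kernel dimension
  std::vector<std::ptrdiff_t> kernelStride;
  std::vector<std::ptrdiff_t> kernelDims;

  void convolve(std::ptrdiff_t firstIndex, std::ptrdiff_t firstKernel,
                int dim, float& accum) const {
    for (std::ptrdiff_t j = 0; j < kernelDims[dim]; ++j) {
      const std::ptrdiff_t in  = firstIndex  + j * indexStride[dim];
      const std::ptrdiff_t ker = firstKernel + j * kernelStride[dim];
      if (dim > 0) convolve(in, ker, dim - 1, accum);  // recurse toward dim 0
      else         accum += input[in] * kernel[ker];   // innermost: multiply-add
    }
  }
};

int main() {
  const float in[5] = {1, 2, 3, 4, 5};
  const float k[3]  = {0.25f, 0.5f, 0.25f};
  ConvSketch c{in, k, {1}, {1}, {3}};
  for (int i = 0; i < 3; ++i) {             // result_dim = 5 - 3 + 1 = 3
    float acc = 0.f;
    c.convolve(i, 0, 0, acc);
    std::cout << acc << ' ';                // prints: 2 3 4
  }
  std::cout << '\n';
}

Counting DimIndex down means dimension 0 — the contiguous one in column-major storage — is the innermost loop, which is what makes the packet path above profitable.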
@@ -341,6 +401,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   TensorEvaluator<InputArgType, Device> m_inputImpl;
   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
   Dimensions m_dimensions;
+
+  KernelArgType m_kernelArg;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+  const Device& m_device;
 };
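The new members support the preloaded kernel: m_kernelArg holds the kernel expression for deferred evaluation, m_kernel points at its contiguous coefficients, m_local_kernel records whether that buffer came from m_device.allocate() and must be freed in cleanup(), and m_device is kept so the allocation and the TensorExecutor launch have a device to run against.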
@@ -108,8 +108,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>

   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() {
-    m_impl.evalSubExprsIfNeeded();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
@@ -134,6 +135,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
     return internal::ploadt<Packet, LoadMode>(m_buffer + index);
   }

+  Scalar* data() const { return NULL; }
+
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const Device& m_device;
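The TensorEvalTo evaluator changes track the evaluator interface used above: evalSubExprsIfNeeded() now takes a destination pointer and reports through its bool return whether further evaluation is needed, and a data() accessor is added — which is what lets preloadKernel() drive a TensorEvalToOp through internal::TensorExecutor. To make the vectorized inner loop concrete, here is a minimal sketch of the broadcast-multiply-accumulate pattern convolvePacket() relies on, written against Eigen's internal packet primitives (pset1, ploadu, pmadd, pstoreu — internal APIs whose exact names and availability can vary between Eigen versions):

#include <Eigen/Core>
#include <iostream>

int main() {
  using Packet = Eigen::internal::packet_traits<float>::type;
  const int PacketSize = Eigen::internal::unpacket_traits<Packet>::size;

  float input[32];
  for (int i = 0; i < 32; ++i) input[i] = float(i);
  const float kernel[3] = {0.25f, 0.5f, 0.25f};

  // One packet of outputs starting at 0: out[i] = sum_j input[i+j] * kernel[j]
  Packet accum = Eigen::internal::pset1<Packet>(0.0f);
  for (int j = 0; j < 3; ++j) {
    Packet in = Eigen::internal::ploadu<Packet>(input + j);  // unaligned load
    accum = Eigen::internal::pmadd(in, Eigen::internal::pset1<Packet>(kernel[j]), accum);
  }

  float out[PacketSize];
  Eigen::internal::pstoreu(out, accum);
  for (int i = 0; i < PacketSize; ++i) std::cout << out[i] << ' ';
  std::cout << '\n';
}

Each pmadd folds one kernel tap into PacketSize accumulating outputs at once, which — together with reading the kernel from a preloaded contiguous buffer — is plausibly where the claimed ~4x improvement comes from on packet-friendly layouts.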