diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 7d0a21c3b..4a5fd9c79 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -226,22 +226,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
 
   enum {
     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
-    PacketAccess = /*TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess */
-                   false,
+    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernel(NULL), m_kernelArg(op.kernelExpression()), m_local_kernel(false), m_device(device)
   {
     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
 
-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
-      } else {
-        m_inputStride[0] = 1;
-      }
+    m_inputStride[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
     }
 
     m_dimensions = m_inputImpl.dimensions();
@@ -251,7 +247,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
       const Index kernel_dim = kernel_dims[i];
       const Index result_dim = input_dim - kernel_dim + 1;
       m_dimensions[index] = result_dim;
-
       if (i > 0) {
         m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
       } else {
@@ -260,16 +255,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
       m_indexStride[i] = m_inputStride[index];
     }
 
-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
-      } else {
-        m_outputStride[0] = 1;
-      }
+    m_outputStride[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
     }
   }
 
-  typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
@@ -278,57 +269,126 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() {
-    m_inputImpl.evalSubExprsIfNeeded();
-    m_kernelImpl.evalSubExprsIfNeeded();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    preloadKernel();
+    return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_inputImpl.cleanup();
-    m_kernelImpl.cleanup();
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
   }
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
-    Index startInput = 0;
-    for (int i = NumDims - 1; i >= 0; --i) {
-      const Index idx = index / m_outputStride[i];
-      startInput += idx * m_inputStride[i];
-      index -= idx * m_outputStride[i];
-    }
     CoeffReturnType result = CoeffReturnType(0);
-    convolve(startInput, 0, 0, result);
+    convolve(firstInput(index), 0, NumKernelDims-1, result);
     return result;
   }
 
-  /* TODO: vectorization
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
   {
-    assert(false);
-  }*/
+    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    Index indices[2] = {index, index+PacketSize-1};
+    Index startInputs[2] = {0, 0};
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx0 = indices[0] / m_outputStride[i];
+      const Index idx1 = indices[1] / m_outputStride[i];
+      startInputs[0] += idx0 * m_inputStride[i];
+      startInputs[1] += idx1 * m_inputStride[i];
+      indices[0] -= idx0 * m_outputStride[i];
+      indices[1] -= idx1 * m_outputStride[i];
+    }
+    startInputs[0] += indices[0];
+    startInputs[1] += indices[1];
 
-  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
-    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
-      const Index input = firstIndex + j * m_indexStride[DimIndex];
-      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
-      if (DimIndex < NumKernelDims-1) {
-        convolve(input, kernel, DimIndex+1, accum);
-      } else {
-
-        accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel);
+    if (startInputs[1]-startInputs[0] == PacketSize-1) {
+      PacketReturnType result = internal::pset1<PacketReturnType>(0);
+      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+      return result;
+    } else {
+      EIGEN_ALIGN_DEFAULT Scalar data[PacketSize];
+      data[0] = Scalar(0);
+      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+      for (int i = 1; i < PacketSize-1; ++i) {
+        data[i] = Scalar(0);
+        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
       }
+      data[PacketSize-1] = Scalar(0);
+      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+      return internal::pload<PacketReturnType>(data);
     }
   }
 
   Scalar* data() const { return NULL; }
 
  private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStride[i];
+      startInput += idx * m_inputStride[i];
+      index -= idx * m_outputStride[i];
+    }
+    startInput += index;
+    return startInput;
+  }
+
+  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolve(input, kernel, DimIndex-1, accum);
+      } else {
+        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+      }
+    }
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolvePacket(input, kernel, DimIndex-1, accum);
+      } else {
+        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
   // No copy, no assignment
   TensorEvaluator(const TensorEvaluator&);
   TensorEvaluator& operator = (const TensorEvaluator&);
@@ -341,6 +401,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   array<Index, NumKernelDims> m_indexStride;
   array<Index, NumKernelDims> m_kernelStride;
   TensorEvaluator<InputArgType, Device> m_inputImpl;
   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
   Dimensions m_dimensions;
+
+  KernelArgType m_kernelArg;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+  const Device& m_device;
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index db716a80e..587cbd5ca 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -108,8 +108,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() {
-    m_impl.evalSubExprsIfNeeded();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
@@ -134,6 +135,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
+  Scalar* data() const { return NULL; }
+
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const Device& m_device;
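
For context, here is a minimal sketch of an expression that exercises this evaluator; the tensor sizes and the choice of convolution dimensions below are illustrative, not part of the patch. With PacketAccess now enabled, an assignment like this one should go through the new packet() path whenever a full packet maps onto contiguous input coefficients:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Convolve a 3x3 kernel over dimensions 0 and 1 of a 3D input.
  // Each convolved dimension shrinks to input_dim - kernel_dim + 1,
  // matching the result_dim computation in the evaluator's constructor.
  Eigen::Tensor<float, 3> input(20, 30, 40);
  Eigen::Tensor<float, 2> kernel(3, 3);
  Eigen::Tensor<float, 3> output(18, 28, 40);
  input.setRandom();
  kernel.setRandom();

  Eigen::array<ptrdiff_t, 2> dims({0, 1});  // dimensions to convolve over
  output = input.convolve(kernel, dims);
  return 0;
}
```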
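The trick in the new packet() is that it maps only the first and last output index of the packet back to input offsets via firstInput(). Each +1 step in the output index advances the starting input offset by at least one, so the two ends are exactly PacketSize-1 apart if and only if every lane in between is contiguous as well; otherwise the code falls back to PacketSize scalar convolutions into an aligned buffer. A standalone sketch of that test (the function names and raw stride arrays are hypothetical stand-ins for the evaluator's m_outputStride/m_inputStride members):

```cpp
#include <cstddef>

// Mirror of the evaluator's firstInput(): map a flat output index to the
// flat input offset where its convolution window starts. Dimension 0 has
// stride 1 on both sides, so the remainder is added directly.
template <int NumDims>
std::ptrdiff_t first_input(std::ptrdiff_t index,
                           const std::ptrdiff_t (&outputStride)[NumDims],
                           const std::ptrdiff_t (&inputStride)[NumDims]) {
  std::ptrdiff_t start = 0;
  for (int i = NumDims - 1; i > 0; --i) {
    const std::ptrdiff_t idx = index / outputStride[i];
    start += idx * inputStride[i];
    index -= idx * outputStride[i];
  }
  return start + index;
}

// Ends-only contiguity test: the total input distance between the first
// and last lane equals PacketSize-1 exactly when every intermediate step
// is 1, i.e. when the whole packet reads contiguous input positions.
template <int NumDims>
bool packet_is_contiguous(std::ptrdiff_t index, int packet_size,
                          const std::ptrdiff_t (&outputStride)[NumDims],
                          const std::ptrdiff_t (&inputStride)[NumDims]) {
  const std::ptrdiff_t first = first_input(index, outputStride, inputStride);
  const std::ptrdiff_t last =
      first_input(index + packet_size - 1, outputStride, inputStride);
  return last - first == packet_size - 1;
}
```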
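The TensorEvalTo.h hunks track the evaluator API change this patch relies on: evalSubExprsIfNeeded() now receives a destination pointer and returns a bool telling the caller whether it still has to run the assignment loop itself. The convolution evaluator uses the hook to call preloadKernel() before any coefficient is read. A hypothetical caller-side sketch of that contract (the real driver is internal::TensorExecutor, which is more involved):

```cpp
// Hypothetical sketch of the evalSubExprsIfNeeded contract.
// - dest is a buffer the evaluator may choose to evaluate into directly;
// - a return value of true means the caller still has to pull results
//   out via coeff()/packet(), false means dest was already filled.
template <typename Evaluator, typename Scalar, typename Index>
void run_sketch(Evaluator& eval, Scalar* dest, Index total_size) {
  if (eval.evalSubExprsIfNeeded(dest)) {
    for (Index i = 0; i < total_size; ++i) {
      dest[i] = eval.coeff(i);
    }
  }
  eval.cleanup();
}
```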