Merged in rmlarsen/eigen (pull request PR-177)
Eigen Tensor cost model part 1.
Commit 6fbedf5a4e
@@ -26,6 +26,7 @@
 * \endcode
 */

#include <cmath>
#include <cstddef>
#include <cstring>

@@ -80,6 +81,7 @@ typedef unsigned __int64 uint64_t;

#include "src/Tensor/TensorBase.h"

#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
@@ -112,6 +112,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
    return CoeffReturnType(index, m_impl.coeff(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
@@ -89,6 +89,12 @@ template<typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
{
  typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
  typedef typename XprType::Index Index;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
@@ -104,12 +110,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
  }

  typedef typename XprType::Index Index;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
  {
    // The dimensions of the lhs and the rhs tensors should be equal to prevent
@@ -150,6 +150,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
    return m_leftImpl.template packet<LoadMode>(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // We assume that evalPacket or evalScalar is called to perform the
    // assignment and account for the cost of the write here, but reduce left
    // cost by one load because we are using m_leftImpl.coeffRef.
    TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
    return m_rightImpl.costPerCoeff(vectorized) +
           TensorOpCost(
               numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
               left.bytes_stored(), left.compute_cycles()) +
           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }

 private:
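Note (illustration, not part of the patch): to make the assign formula concrete, here is a minimal sketch for float coefficients; the operand costs are invented placeholders, but every call is defined by the TensorOpCost class added later in this commit.

// Sketch only: composing the TensorAssignOp cost for float coefficients.
TensorOpCost rhs(/*bytes_loaded=*/4, /*bytes_stored=*/0, /*compute_cycles=*/1);
TensorOpCost left(/*bytes_loaded=*/4, /*bytes_stored=*/0, /*compute_cycles=*/0);
// One lhs load is dropped (coeffRef is only written to), and a 4-byte store
// is charged for the assignment itself:
TensorOpCost assign =
    rhs +
    TensorOpCost(numext::maxi(0.0, left.bytes_loaded() - sizeof(float)),
                 left.bytes_stored(), left.compute_cycles()) +
    TensorOpCost(0, sizeof(float), 0);
// assign == [bytes_loaded = 4, bytes_stored = 4, compute_cycles = 1]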
@@ -101,6 +101,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@@ -140,9 +143,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -247,9 +247,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index originalIndex = index;

@@ -284,12 +283,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>

    // Todo: this could be extended to the second dimension if we're not
    // broadcasting alongside the first dimension, and so on.
    if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
    if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
      values[0] = m_impl.coeff(inputIndex);
      for (int i = 1; i < packetSize; ++i) {
      for (int i = 1; i < PacketSize; ++i) {
        values[i] = coeffColMajor(originalIndex+i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -300,9 +299,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index originalIndex = index;

@@ -337,12 +335,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>

    // Todo: this could be extended to the second dimension if we're not
    // broadcasting alongside the first dimension, and so on.
    if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
    if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
      values[0] = m_impl.coeff(inputIndex);
      for (int i = 1; i < packetSize; ++i) {
      for (int i = 1; i < PacketSize; ++i) {
        values[i] = coeffRowMajor(originalIndex+i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -350,6 +348,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    double compute_cost = TensorOpCost::AddCost<Index>();
    if (NumDims > 0) {
      for (int i = NumDims - 1; i > 0; --i) {
        compute_cost += TensorOpCost::DivCost<Index>();
        if (internal::index_statically_eq<Broadcast>()(i, 1)) {
          compute_cost +=
              TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
        } else {
          if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
            compute_cost += TensorOpCost::MulCost<Index>() +
                            TensorOpCost::ModCost<Index>() +
                            TensorOpCost::AddCost<Index>();
          }
        }
        compute_cost +=
            TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
      }
    }
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
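Note (my rough reading, not from the patch): each non-innermost dimension in the loop above is charged the index arithmetic needed to undo the broadcast.

// Rough tally (illustration): for NumDims == 2 with no statically known
// unit dimensions, the single i == 1 iteration charges
//   DivCost<Index>()                                         (stride divide)
// + MulCost<Index>() + ModCost<Index>() + AddCost<Index>()   (generic branch)
// + MulCost<Index>() + AddCost<Index>()                      (advance index)
// on top of the initial AddCost<Index>(); the total is divided by PacketSize
// when vectorized.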
@@ -134,6 +134,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;


  enum {
    // Alignment can't be guaranteed at compile time since it depends on the
@@ -180,9 +184,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    m_inputOffset = m_stride * op.offset();
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -202,17 +203,16 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
        (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
      // m_stride is equal to 1, so let's avoid the integer division.
      eigen_assert(m_stride == 1);
      Index inputIndex = index * m_inputStride + m_inputOffset;
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
      for (int i = 0; i < packetSize; ++i) {
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = m_impl.coeff(inputIndex);
        inputIndex += m_inputStride;
      }
@@ -226,13 +226,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    } else {
      const Index idx = index / m_stride;
      const Index rem = index - idx * m_stride;
      if (rem + packetSize <= m_stride) {
      if (rem + PacketSize <= m_stride) {
        Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
        return m_impl.template packet<LoadMode>(inputIndex);
      } else {
        // Cross the stride boundary. Fallback to slow path.
        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
        for (int i = 0; i < packetSize; ++i) {
        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index);
          ++index;
        }
@@ -242,6 +242,28 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    double cost = 0;
    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
         m_dim.actualDim() == 0) ||
        (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
         m_dim.actualDim() == NumInputDims - 1)) {
      cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
                m_dim.actualDim() == NumInputDims - 1) ||
               (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
                m_dim.actualDim() == 0)) {
      cost += TensorOpCost::AddCost<Index>();
    } else {
      cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
              3 * TensorOpCost::AddCost<Index>();
    }

    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, cost, vectorized, PacketSize);
  }
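Note (my paraphrase of the three branches above, not from the patch):

// Branch costs, summarized:
//   chip the unit-stride dimension  -> MulCost + AddCost  (index * m_inputStride + offset)
//   chip the outermost dimension    -> AddCost            (index + constant offset)
//   chip any middle dimension       -> 3 * MulCost + DivCost + 3 * AddCost
//                                      (full divide-based index reconstruction)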

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
    CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
    if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
@@ -298,6 +320,9 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@@ -309,9 +334,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
    : Base(op, device)
    { }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
@@ -320,17 +342,16 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)

    if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
        (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
      // m_stride is equal to 1, so let's avoid the integer division.
      eigen_assert(this->m_stride == 1);
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
      internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
      Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
      for (int i = 0; i < packetSize; ++i) {
      for (int i = 0; i < PacketSize; ++i) {
        this->m_impl.coeffRef(inputIndex) = values[i];
        inputIndex += this->m_inputStride;
      }
@@ -342,14 +363,14 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
    } else {
      const Index idx = index / this->m_stride;
      const Index rem = index - idx * this->m_stride;
      if (rem + packetSize <= this->m_stride) {
      if (rem + PacketSize <= this->m_stride) {
        const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
        this->m_impl.template writePacket<StoreMode>(inputIndex, x);
      } else {
        // Cross stride boundary. Fallback to slow path.
        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
        internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
        for (int i = 0; i < packetSize; ++i) {
        for (int i = 0; i < PacketSize; ++i) {
          this->coeffRef(index) = values[i];
          ++index;
        }

@@ -260,6 +260,21 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>() +
                                           TensorOpCost::ModCost<Index>());
    const double lhs_size = m_leftImpl.dimensions().TotalSize();
    const double rhs_size = m_rightImpl.dimensions().TotalSize();
    return (lhs_size / (lhs_size + rhs_size)) *
               m_leftImpl.costPerCoeff(vectorized) +
           (rhs_size / (lhs_size + rhs_size)) *
               m_rightImpl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
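Note (illustration, not part of the patch): the size-weighted blend above makes each operand contribute in proportion to its share of the output. With made-up operand costs:

// Sketch: concatenating a 300-coefficient lhs with a 100-coefficient rhs.
TensorOpCost lhs_cost(4, 0, 1);   // invented placeholder
TensorOpCost rhs_cost(8, 0, 2);   // invented placeholder
TensorOpCost blended = (300.0 / 400.0) * lhs_cost +
                       (100.0 / 400.0) * rhs_cost;
// blended == [bytes_loaded = 5, bytes_stored = 0, compute_cycles = 1.25]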

@@ -440,6 +440,10 @@ struct TensorContractionEvaluatorBase
    return m_result[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);

@@ -177,7 +177,6 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar>
};


// Eval as rvalue
template<typename TargetType, typename ArgType, typename Device>
struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
@@ -190,6 +189,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
  typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef typename PacketType<SrcType, Device>::type PacketSourceType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@@ -231,6 +231,21 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
    return converter.template packet<LoadMode>(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
    if (vectorized) {
      const double SrcCoeffRatio =
          internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
      const double TgtCoeffRatio =
          internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
      return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
             TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
    } else {
      return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
    }
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
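Note (my reading; the trait values below are assumptions, not taken from the patch):

// Illustration: float -> double, assuming type_casting_traits reports
// SrcCoeffRatio == 1 and TgtCoeffRatio == 2 (one source packet feeds two
// target packets). With PacketSize == 2 (doubles per packet), the vectorized
// branch charges
//   input cost * (1 / 2)  +  2 * cast_cost / 2  cycles
// per output coefficient, i.e. the cast work is amortized per target packet.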

@@ -297,6 +297,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
@@ -367,10 +372,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
    }
  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
@@ -405,7 +406,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -448,6 +448,23 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
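Note (my numbers, not the patch's):

// Illustration: a 3x3 kernel gives kernel_size == 9, so each output
// coefficient is charged nine input reads, nine kernel reads, and nine
// add/multiply cycle pairs, plus the one-off firstIndex_compute_cost for
// mapping the output index back to input coordinates.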

@@ -1044,6 +1061,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);
unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h (new file, 214 lines)
@@ -0,0 +1,214 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

#if !defined(EIGEN_USE_GPU)
#define EIGEN_USE_COST_MODEL
#endif

namespace Eigen {

/** \class TensorEvaluator
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for evaluating
  * tensor expressions.
  *
  */

// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
class TensorOpCost {
 public:
  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
  // model based on minimal reciprocal throughput numbers from Intel or
  // Agner Fog's tables would be better than what is there now.
  template <typename ArgType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
    return internal::functor_traits<
        internal::scalar_product_op<ArgType, ArgType>>::Cost;
  }
  template <typename ArgType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
    return internal::functor_traits<internal::scalar_sum_op<ArgType>>::Cost;
  }
  template <typename ArgType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
    return internal::functor_traits<
        internal::scalar_quotient_op<ArgType, ArgType>>::Cost;
  }
  template <typename ArgType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
    return internal::functor_traits<internal::scalar_mod_op<ArgType>>::Cost;
  }
  template <typename SrcType, typename TargetType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
    return internal::functor_traits<
        internal::scalar_cast_op<SrcType, TargetType>>::Cost;
  }

  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(vectorized ? compute_cycles / packet_size
                                   : compute_cycles) {
    using std::isfinite;
    eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
    eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
    eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
    return bytes_loaded_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
    return bytes_stored_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
    return compute_cycles_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
      double load_cost, double store_cost, double compute_cost) const {
    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
           compute_cost * compute_cycles_;
  }

  // Drop memory access component. Intended for cases when memory accesses are
  // sequential or are completely masked by computations.
  EIGEN_DEVICE_FUNC void dropMemoryCost() {
    bytes_loaded_ = 0;
    bytes_stored_ = 0;
  }

  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
      const TensorOpCost& rhs) {
    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
    return *this;
  }

  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
      const TensorOpCost& rhs) {
    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
      const TensorOpCost& rhs) {
    bytes_loaded_ += rhs.bytes_loaded();
    bytes_stored_ += rhs.bytes_stored();
    compute_cycles_ += rhs.compute_cycles();
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
    bytes_loaded_ *= rhs;
    bytes_stored_ *= rhs;
    compute_cycles_ *= rhs;
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
      TensorOpCost lhs, const TensorOpCost& rhs) {
    lhs += rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      TensorOpCost lhs, double rhs) {
    lhs *= rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      double lhs, TensorOpCost rhs) {
    rhs *= lhs;
    return rhs;
  }

  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
    return os << "[bytes_loaded = " << tc.bytes_loaded()
              << ", bytes_stored = " << tc.bytes_stored()
              << ", compute_cycles = " << tc.compute_cycles() << "]";
  }

 private:
  double bytes_loaded_;
  double bytes_stored_;
  double compute_cycles_;
};
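Note (illustration, not part of the patch): a quick usage sketch; the values are invented, but every call below is defined by the class above.

// Sketch: pricing "load two floats, multiply them" per coefficient.
TensorOpCost read(sizeof(float), 0, 0);                  // one 4-byte load
TensorOpCost mul(0, 0, TensorOpCost::MulCost<float>());  // one multiply
TensorOpCost per_coeff = 2.0 * read + mul;
// Collapse to a single cycle estimate with the L2-based weights used below.
double cycles = per_coeff.total_cost(/*load_cost=*/11.0 / 64,
                                     /*store_cost=*/11.0 / 64,
                                     /*compute_cost=*/1.0);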

// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
// in [1:max_threads] instead of just switching multi-threading off for small
// work units.
template <typename Device>
class TensorCostModel {
 public:
  // Scaling from Eigen compute cost to device cycles.
  static const int kDeviceCyclesPerComputeCycle = 1;

  // Costs in device cycles.
  static const int kStartupCycles = 100000;
  static const int kPerThreadCycles = 100000;
  static const int kTaskSize = 40000;

  // Returns the number of threads in [1:max_threads] to use for
  // evaluating an expression with the given output size and cost per
  // coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
    double cost = totalCost(output_size, cost_per_coeff);
    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
    return numext::mini(max_threads, numext::maxi(1, threads));
  }

  // taskSize assesses parallel task size.
  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
  // granularity needs to be increased to mitigate parallelization overheads.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
      double output_size, const TensorOpCost& cost_per_coeff) {
    return totalCost(output_size, cost_per_coeff) / kTaskSize;
  }

 private:
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
      double output_size, const TensorOpCost& cost_per_coeff) {
    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
    // 11 is L2 cache latency on Haswell.
    // We don't know whether data is in L1, L2 or L3. But we are most interested
    // in single-threaded computational time around 100us-10ms (smaller time
    // is too small for parallelization, larger time is not interesting
    // either because we are probably using all available threads already).
    // And for the target time range, L2 seems to be what matters. Data set
    // fitting into L1 is too small to take noticeable time. Data set fitting
    // only into L3 presumably will take more than 10ms to load and process.
    const double kLoadCycles = 1.0 / 64 * 11;
    const double kStoreCycles = 1.0 / 64 * 11;
    // Scaling from Eigen compute cost to device cycles.
    return output_size *
           cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                     kDeviceCyclesPerComputeCycle);
  }
};

} // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
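Note (my arithmetic, not from the patch): a worked example of the numThreads heuristic.

// 1e6 output coefficients at 0.2 device cycles each:
//   totalCost = 1e6 * 0.2                        = 200000 cycles
//   threads   = (200000 - 100000) / 100000 + 0.9 = 1.9  -> truncates to 1
// so the expression stays single-threaded. With 2e7 coefficients:
//   threads   = (4e6 - 1e5) / 1e5 + 0.9          = 39.9 -> 39 threads,
// clamped to max_threads. The +0.9 biases the double-to-int truncation upward.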

@@ -83,8 +83,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
  typedef typename internal::traits<ArgType>::Index Index;
  static const int NumDims = internal::traits<ArgType>::NumDimensions;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef
      typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
  typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@@ -101,9 +103,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
    m_dimensions = op.func().dimensions(op.expression());
  }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -134,6 +133,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

 protected:
@@ -236,6 +240,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
  static const int NumDims = internal::traits<XprType>::NumDimensions;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@@ -252,9 +259,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
    m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
  }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -284,6 +288,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

 protected:

@@ -88,6 +88,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
  typedef TensorEvalToOp<ArgType> XprType;
  typedef typename ArgType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
  typedef typename XprType::Index Index;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = true,
@@ -104,10 +108,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
  }

  typedef typename XprType::Index Index;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
@@ -138,6 +138,13 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // We assume that evalPacket or evalScalar is called to perform the
    // assignment and account for the cost of the write here.
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; }

 private:

@@ -101,6 +101,11 @@ struct TensorEvaluator
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
                        internal::unpacket_traits<PacketReturnType>::size);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }

 protected:
@@ -219,6 +224,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -237,6 +243,12 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
    return m_functor.template packetOp<Index, PacketReturnType>(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
                        internal::unpacket_traits<PacketReturnType>::size);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:
@@ -270,6 +282,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -293,6 +306,12 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
    return m_argImpl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:
@@ -330,6 +349,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -358,6 +378,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
    return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
    return m_leftImpl.costPerCoeff(vectorized) +
           m_rightImpl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:
@@ -398,6 +426,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
  typedef typename XprType::Index Index;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -435,6 +464,13 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
    m_elseImpl.template packet<LoadMode>(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    return m_condImpl.costPerCoeff(vectorized) +
           m_thenImpl.costPerCoeff(vectorized)
               .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:
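Design note (my reading): the select evaluator combines branch costs with cwiseMax rather than a sum, since each coefficient evaluates only one of the two branches; the max is a conservative per-component bound, with the condition's own cost added on top. A small sketch:

// Illustration: then-cost (4, 0, 1) vs. else-cost (8, 0, 0.5).
TensorOpCost then_cost(4, 0, 1), else_cost(8, 0, 0.5);
TensorOpCost branch_bound = then_cost.cwiseMax(else_cost);
// branch_bound == [bytes_loaded = 8, bytes_stored = 0, compute_cycles = 1]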

@@ -129,6 +129,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
  typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
  typedef OutputScalar CoeffReturnType;
  typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@@ -176,7 +177,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
    }
  }


  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_data) {
      m_device.deallocate(m_data);
@@ -189,11 +189,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
    return m_data[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
  packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }

@@ -83,6 +83,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
  typedef TensorForcedEvalOp<ArgType> XprType;
  typedef typename ArgType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = true,
@@ -95,10 +99,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
    : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
  { }

  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
@@ -132,6 +132,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }

 private:

@@ -594,6 +594,8 @@ template <> class UniformRandomGenerator<std::complex<double> > {
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
  enum {
    // Rough estimate.
    Cost = 100 * NumTraits<Scalar>::MulCost,
    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
  };
};
@@ -774,6 +776,8 @@ template <typename T> class NormalRandomGenerator {
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // Rough estimate.
    Cost = 100 * NumTraits<Scalar>::MulCost,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};
@@ -807,6 +811,15 @@ class GaussianGenerator {
  array<T, NumDims> m_two_sigmas;
};

template <typename T, typename Index, size_t NumDims>
struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
  enum {
    Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
                      functor_traits<scalar_quotient_op<T, T> >::Cost) +
           functor_traits<scalar_exp_op<T> >::Cost,
    PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
  };
};

} // end namespace internal
} // end namespace Eigen

@@ -145,6 +145,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): This is just a placeholder. Define interface to make
    // generators return their cost.
    return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
                              TensorOpCost::MulCost<Scalar>());
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:

@@ -159,6 +159,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
                          Device> Self;
  typedef TensorEvaluator<ArgType, Device> Impl;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@@ -307,9 +310,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -362,15 +362,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
      return packetWithPossibleZero(index);
    }

    const Index indices[2] = {index, index + packetSize - 1};
    const Index indices[2] = {index, index + PacketSize - 1};
    const Index patchIndex = indices[0] / m_fastPatchStride;
    if (patchIndex != indices[1] / m_fastPatchStride) {
      return packetWithPossibleZero(index);
@@ -434,12 +433,24 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  Index rowInflateStride() const { return m_row_inflate_strides; }
  Index colInflateStride() const { return m_col_inflate_strides; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // We conservatively estimate the cost for the code path where the computed
    // index is inside the original image and
    // TensorEvaluator<ArgType, Device>::CoordAccess is false.
    const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
                                6 * TensorOpCost::MulCost<Index>() +
                                8 * TensorOpCost::MulCost<Index>();
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);

@@ -81,6 +81,10 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -123,11 +127,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
    }
  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;


  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -190,18 +189,30 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
                                           3 * TensorOpCost::MulCost<Index>() +
                                           2 * TensorOpCost::AddCost<Index>());
    const double input_size = m_impl.dimensions().TotalSize();
    const double output_size = m_dimensions.TotalSize();
    if (output_size == 0)
      return TensorOpCost();
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
                        compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
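Note (illustration, not from the patch): the bytes_loaded term above is scaled by input_size / output_size because inflation inserts zeros between real inputs, so only a fraction of output coefficients ever read memory.

// Quick check of the arithmetic: inflating a length-10 vector with stride 3
// produces (10 - 1) * 3 + 1 == 28 outputs, only 10 of which read the input,
// so the amortized load per output coefficient is sizeof(T) * 10.0 / 28.0.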
|
||||
|
@ -155,6 +155,10 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
    return m_impl.template packet<LoadMode>(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return m_impl.costPerCoeff(vectorized);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@ -142,6 +142,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
    return m_impl.template packet<LoadMode>(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return m_impl.costPerCoeff(vectorized);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@ -449,6 +453,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
  }


  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
    Scalar* result = m_impl.data();
    if (result) {
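Editorial aside on the three hunks above (not part of the commit): layout swap and reshape only reinterpret indices, so they forward the wrapped evaluator's estimate unchanged, while slicing charges a flat NumDims cycles of offset arithmetic per coefficient. A minimal composition sketch, assuming TensorOpCost's operator+:

    // Hypothetical leaf: one float load and one cycle per coefficient.
    TensorOpCost leaf(sizeof(float), 0, 1.0);
    TensorOpCost reshaped = leaf;                              // pass-through
    TensorOpCost sliced = reshaped + TensorOpCost(0, 0, 3.0);  // NumDims == 3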
@ -87,6 +87,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<PaddingDimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@ -129,10 +133,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
    }
  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
@ -224,21 +224,51 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
    return m_impl.coeff(inputIndex);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    TensorOpCost cost = m_impl.costPerCoeff(vectorized);
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i)
        updateCostPerDimension(cost, i, i == 0);
    } else {
      for (int i = NumDims - 1; i >= 0; --i)
        updateCostPerDimension(cost, i, i == NumDims - 1);
    }
    return cost;
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
  void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
    const double in = static_cast<double>(m_impl.dimensions()[i]);
    const double out = in + m_padding[i].first + m_padding[i].second;
    if (out == 0)
      return;
    const double reduction = in / out;
    cost *= reduction;
    if (first) {
      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
                                 reduction * (1 * TensorOpCost::AddCost<Index>()));
    } else {
      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
                                 2 * TensorOpCost::MulCost<Index>() +
                                 reduction * (2 * TensorOpCost::MulCost<Index>() +
                                              1 * TensorOpCost::DivCost<Index>()));
    }
  }

 protected:

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index initialIndex = index;
    Index inputIndex = 0;
    for (int i = NumDims - 1; i > 0; --i) {
      const Index first = index;
      const Index last = index + packetSize - 1;
      const Index last = index + PacketSize - 1;
      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
      const Index lastPaddedRight = m_outputStrides[i+1];
@ -263,7 +293,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
      }
    }

    const Index last = index + packetSize - 1;
    const Index last = index + PacketSize - 1;
    const Index first = index;
    const Index lastPaddedLeft = m_padding[0].first;
    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
@ -288,16 +318,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    const Index initialIndex = index;
    Index inputIndex = 0;

    for (int i = 0; i < NumDims - 1; ++i) {
      const Index first = index;
      const Index last = index + packetSize - 1;
      const Index last = index + PacketSize - 1;
      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
      const Index lastPaddedRight = m_outputStrides[i];
@ -322,7 +351,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
      }
    }

    const Index last = index + packetSize - 1;
    const Index last = index + PacketSize - 1;
    const Index first = index;
    const Index lastPaddedLeft = m_padding[NumDims-1].first;
    const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
@ -347,9 +376,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
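Editorial aside on updateCostPerDimension above (not part of the commit): for each dimension the wrapped evaluator is consulted only for the in/out fraction of output coefficients that fall inside the input, so the accumulated cost is scaled by that ratio before the per-dimension bounds checks and stride arithmetic are added. A worked sketch for one non-first dimension, assuming TensorOpCost's compound operators:

    // Hypothetical: 8 input coefficients padded to 8 + 2 + 2 = 12 outputs.
    TensorOpCost cost(sizeof(float), 0, 1.0);  // stand-in for m_impl's estimate
    const double reduction = 8.0 / 12.0;       // in / out
    cost *= reduction;                         // only 2/3 of outputs hit the input
    cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
                               2 * TensorOpCost::MulCost<Index>() +
                               reduction * (2 * TensorOpCost::MulCost<Index>() +
                                            1 * TensorOpCost::DivCost<Index>()));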
@ -85,6 +85,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;


  enum {
    IsAligned = false,
@ -137,9 +141,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@ -183,12 +184,11 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
    Index indices[2] = {index, index + packetSize - 1};
    Index indices[2] = {index, index + PacketSize - 1};
    Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
                             indices[1] / m_outputStrides[output_stride_index]};
    Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
@ -229,15 +229,15 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
    inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
    inputIndices[1] += (patchIndices[1] + patchOffsets[1]);

    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
      return rslt;
    }
    else {
      EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
      EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
      values[0] = m_impl.coeff(inputIndices[0]);
      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
      for (int i = 1; i < packetSize-1; ++i) {
      values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
      for (int i = 1; i < PacketSize-1; ++i) {
        values[i] = coeff(index+i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@ -245,6 +245,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
                                           TensorOpCost::MulCost<Index>() +
                                           2 * TensorOpCost::AddCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
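The shape of the patch estimate above recurs in the evaluators that follow: the argument's cost plus a fixed budget of index arithmetic per output coefficient. The trailing (vectorized, PacketSize) pair matters; assuming the five-argument TensorOpCost constructor from TensorCostModel.h amortizes cycles over the packet when vectorized, the same bookkeeping is billed at a fraction of the scalar rate:

    // Illustrative only: 16 cycles of index bookkeeping per coefficient.
    const double cycles = 16.0;
    TensorOpCost scalar_path(0, 0, cycles, /*vectorized=*/false, /*packet_size=*/4);
    TensorOpCost simd_path(0, 0, cycles, /*vectorized=*/true, /*packet_size=*/4);
    // Expected: scalar_path.compute_cycles() == 16, simd_path.compute_cycles() == 4.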
@ -411,6 +411,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@ -495,9 +498,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
    m_impl.evalSubExprsIfNeeded(NULL);

@ -584,16 +584,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < packetSize; ++i) {
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
@ -602,18 +601,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < packetSize; ++i) {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < packetSize; ++i) {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
@ -621,6 +620,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
             TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
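Editorial aside on the reduction estimate above (not part of the commit): each output coefficient consumes num_values_to_reduce inputs, so the argument's per-coefficient cost is multiplied by that count and one reducer application is charged per reduced value; a full reduction that was already materialized (RunningFullReduction && m_result) is billed as a single load. A minimal sketch, assuming TensorOpCost's scalar multiplication:

    // Hypothetical: summing 256 values per output coefficient.
    const double n = 256.0;
    TensorOpCost arg(sizeof(float), 0, 1.0);  // stand-in for m_impl's estimate
    const double reducer_cost = 1.0;          // stand-in for functor_traits<Op>::Cost
    TensorOpCost sum = arg * n +
        TensorOpCost(0, 0, n * reducer_cost, /*vectorized=*/true, /*packet_size=*/4);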
@ -104,6 +104,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<ReverseDimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@ -135,10 +139,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
    }
  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Dimensions& dimensions() const { return m_dimensions; }

@ -195,21 +195,33 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    // TODO(ndjaitly): write a better packing routine that uses
    // local structure.
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
                                                            values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
                                                            values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                     2 * TensorOpCost::MulCost<Index>() +
                                     TensorOpCost::DivCost<Index>());
    for (int i = 0; i < NumDims; ++i) {
      if (m_reverse[i]) {
        compute_cost += 2 * TensorOpCost::AddCost<Index>();
      }
    }
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
@ -246,6 +258,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Dimensions& dimensions() const { return this->m_dimensions; }
@ -256,14 +269,13 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x) {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    // This code is pilfered from TensorMorphing.h
    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
    EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    for (int i = 0; i < packetSize; ++i) {
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index+i) = values[i];
    }
  }
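Editorial aside on the reverse estimate above (not part of the commit): each reversed dimension adds two index additions on top of the usual div/mul/add bookkeeping, and since packet() gathers its coefficients one at a time, the extra cycles are constructed with vectorized=false so they are not amortized over the packet:

    // Hypothetical: 3 dimensions, 2 of them reversed.
    double compute_cost = 3 * (2 * TensorOpCost::AddCost<Index>() +
                               2 * TensorOpCost::MulCost<Index>() +
                               TensorOpCost::DivCost<Index>());
    compute_cost += 2 * (2 * TensorOpCost::AddCost<Index>());  // reversed dims
    TensorOpCost extra(0, 0, compute_cost, false /*vectorized*/, /*packet_size=*/4);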
@ -104,6 +104,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@ -145,9 +148,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@ -166,18 +166,25 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
@ -219,6 +226,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@ -230,9 +240,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
      : Base(op, device)
  { }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
@ -241,12 +248,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    for (int i = 0; i < packetSize; ++i) {
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index+i) = values[i];
    }
  }
@ -103,6 +103,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
@ -142,10 +146,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
    }
  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@ -164,12 +164,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    Index inputIndices[] = {0, 0};
    Index indices[] = {index, index + packetSize - 1};
    Index indices[] = {index, index + PacketSize - 1};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStrides[i];
@ -193,15 +192,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
      inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
      inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
    }
    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
      return rslt;
    }
    else {
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
      values[0] = m_impl.coeff(inputIndices[0]);
      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
      for (int i = 1; i < packetSize-1; ++i) {
      values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
      for (int i = 1; i < PacketSize-1; ++i) {
        values[i] = coeff(index+i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@ -209,6 +208,14 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:
@ -266,6 +273,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
  {
@ -275,12 +283,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < this->dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());

    Index inputIndices[] = {0, 0};
    Index indices[] = {index, index + packetSize - 1};
    Index indices[] = {index, index + PacketSize - 1};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / this->m_outputStrides[i];
@ -304,15 +311,15 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
      inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
      inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
    }
    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
      this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
    }
    else {
      EIGEN_ALIGN_MAX Scalar values[packetSize];
      EIGEN_ALIGN_MAX Scalar values[PacketSize];
      internal::pstore<Scalar, PacketReturnType>(values, x);
      this->m_impl.coeffRef(inputIndices[0]) = values[0];
      this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
      for (int i = 1; i < packetSize-1; ++i) {
      this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
      for (int i = 1; i < PacketSize-1; ++i) {
        this->coeffRef(index+i) = values[i];
      }
    }
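Editorial aside on writePacket above (not part of the commit): a packet store stays a packet only when the remapped input indices of the first and last lanes are exactly PacketSize - 1 apart; otherwise the packet is spilled to an aligned stack buffer and written back coefficient by coefficient. Condensed from the hunk above:

    // Fallback path: spill the packet, then do scalar write-backs.
    EIGEN_ALIGN_MAX Scalar values[PacketSize];
    internal::pstore<Scalar, PacketReturnType>(values, x);
    for (int i = 0; i < PacketSize; ++i) {
      coeffRef(index + i) = values[i];
    }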
@ -171,6 +171,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
  static const int NumDims = NumInputDims + 1;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
@ -336,9 +339,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@ -408,16 +408,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
        m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
      return packetWithPossibleZero(index);
    }

    const Index indices[2] = {index, index + packetSize - 1};
    const Index indices[2] = {index, index + PacketSize - 1};
    const Index patchIndex = indices[0] / m_fastPatchStride;
    if (patchIndex != indices[1] / m_fastPatchStride) {
      return packetWithPossibleZero(index);
@ -495,6 +494,14 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
    return packetWithPossibleZero(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double compute_cost =
        10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
        8 * TensorOpCost::AddCost<Index>();
    return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@ -518,9 +525,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
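Editorial aside on the volume patch hunk above (not part of the commit): unlike the other evaluators in this series, its costPerCoeff returns only the fixed index arithmetic of locating a patch and does not fold in m_impl.costPerCoeff(). A hypothetical variant that also billed the wrapped evaluator would read:

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
    costPerCoeff(bool vectorized) const {
      const double compute_cost =
          10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
          8 * TensorOpCost::AddCost<Index>();
      // Hypothetical: also charge for reading the wrapped tensor.
      return m_impl.costPerCoeff(vectorized) +
             TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }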