Pulled latest updates from trunk

Benoit Steiner 2015-04-01 23:24:11 -07:00
commit 74e558cfa8
52 changed files with 1088 additions and 627 deletions

View File

@ -17,7 +17,7 @@
*
* These iterative solvers are associated with some preconditioners:
* - IdentityPreconditioner - not really useful
* - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
* - DiagonalPreconditioner - also called Jacobi preconditioner, works well on diagonally dominant matrices.
* - IncompleteLUT - incomplete LU factorization with dual thresholding
*
* Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
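For context, a minimal sketch of how these solver/preconditioner pairs are combined in practice. The solver and preconditioner names are the module's own; the 3x3 diagonally dominant system is made up for illustration:

#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <iostream>

int main()
{
  // Build a small symmetric, diagonally dominant system A*x = b.
  Eigen::SparseMatrix<double> A(3,3);
  A.insert(0,0) = 4; A.insert(0,1) = 1;
  A.insert(1,0) = 1; A.insert(1,1) = 4; A.insert(1,2) = 1;
  A.insert(2,1) = 1; A.insert(2,2) = 4;
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(3);

  // The Jacobi (diagonal) preconditioner suits diagonally dominant matrices.
  Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower|Eigen::Upper,
                           Eigen::DiagonalPreconditioner<double> > cg;
  cg.compute(A);
  Eigen::VectorXd x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations() << ", error: " << cg.error() << "\n";
  return 0;
}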

View File

@ -300,9 +300,10 @@ template<typename Derived>
bool DenseBase<Derived>::isApproxToConstant
(const Scalar& val, const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
for(Index i = 0; i < rows(); ++i)
if(!internal::isApprox(this->coeff(i, j), val, prec))
if(!internal::isApprox(self.coeff(i, j), val, prec))
return false;
return true;
}
@ -484,9 +485,10 @@ DenseBase<Derived>::Zero()
template<typename Derived>
bool DenseBase<Derived>::isZero(const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
for(Index i = 0; i < rows(); ++i)
if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec))
return false;
return true;
}
@ -719,18 +721,19 @@ template<typename Derived>
bool MatrixBase<Derived>::isIdentity
(const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
{
for(Index i = 0; i < rows(); ++i)
{
if(i == j)
{
if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec))
return false;
}
else
{
if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec))
return false;
}
}
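These internals back the public fuzzy-comparison API; the new nested_eval temporary makes sure an expression argument (a sum, a product, ...) is evaluated once up front instead of once per coefficient. A small usage sketch:

#include <Eigen/Dense>
#include <cassert>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Identity(4,4);
  assert(m.isIdentity());
  // The (m - m) expression is materialized once by nested_eval,
  // then scanned coefficient by coefficient.
  assert((m - m).isZero());
  assert(Eigen::MatrixXd::Constant(3,3, 2.5).isApproxToConstant(2.5));
  return 0;
}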

View File

@ -35,22 +35,22 @@ void check_static_allocation_size()
}
template<typename T, int Size, typename Packet = typename packet_traits<T>::type,
bool Match = bool((Size%unpacket_traits<Packet>::size)==0),
bool TryHalf = bool(int(unpacket_traits<Packet>::size) > Size)
bool Match = bool((Size%unpacket_traits<Packet>::size)==0),
bool TryHalf = bool(int(unpacket_traits<Packet>::size) > 1)
&& bool(int(unpacket_traits<Packet>::size) > int(unpacket_traits<typename unpacket_traits<Packet>::half>::size)) >
struct compute_default_alignment
{
enum { value = 0 };
};
template<typename T, int Size, typename Packet>
struct compute_default_alignment<T, Size, Packet, true, false> // Match
template<typename T, int Size, typename Packet, bool TryHalf>
struct compute_default_alignment<T, Size, Packet, true, TryHalf> // Match
{
enum { value = sizeof(T) * unpacket_traits<Packet>::size };
};
template<typename T, int Size, typename Packet>
struct compute_default_alignment<T, Size, Packet, false, true>
struct compute_default_alignment<T, Size, Packet, false, true> // Try-half
{
// current packet too large, try with an half-packet
enum { value = compute_default_alignment<T, Size, typename unpacket_traits<Packet>::half>::value };
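A standalone analogue of the fixed recursion, with plain integer packet sizes standing in for Eigen's unpacket_traits (assumption: the half packet is simply PacketSize/2). The corrected TryHalf condition (packet size > 1 instead of > Size) lets the fallback terminate at scalar granularity:

// Sketch only; not Eigen's actual traits machinery.
template<int Size, int PacketSize,
         bool Match   = ((Size % PacketSize) == 0),
         bool TryHalf = (PacketSize > 1)>
struct alignment_sketch { enum { value = 0 }; };

template<int Size, int PacketSize, bool TryHalf>
struct alignment_sketch<Size, PacketSize, true, TryHalf>   // Match
{ enum { value = PacketSize }; };

template<int Size, int PacketSize>
struct alignment_sketch<Size, PacketSize, false, true>     // Try-half
{ enum { value = alignment_sketch<Size, PacketSize/2>::value }; };

// alignment_sketch<6,4>::value == 2: 4 does not divide 6, but its half does.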

View File

@ -224,13 +224,13 @@ bool MatrixBase<Derived>::isOrthogonal
template<typename Derived>
bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
{
typename Derived::Nested nested(derived());
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index i = 0; i < cols(); ++i)
{
if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
return false;
for(Index j = 0; j < i; ++j)
if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec))
return false;
}
return true;
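As with the fuzzy tests above, isUnitary now materializes its expression once via nested_eval before checking the columns for unit norm and pairwise orthogonality. A usage sketch (the Q factor of a Householder QR is orthogonal by construction):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(4,4);
  Eigen::MatrixXd Q = Eigen::HouseholderQR<Eigen::MatrixXd>(m).householderQ();
  std::cout << std::boolalpha << Q.isUnitary() << "\n";  // true, up to precision
  return 0;
}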

View File

@ -328,6 +328,7 @@ struct hypot_impl
p = _y;
qp = _x / p;
}
if(p==RealScalar(0)) return RealScalar(0);
return p * sqrt(RealScalar(1) + qp*qp);
}
};
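The added guard matters when both inputs are zero: p is then 0 and qp was computed as 0/0 = NaN, so without the early return the result would be 0*sqrt(NaN) = NaN instead of 0. A minimal standalone sketch of the same scaled algorithm:

#include <cmath>
#include <cassert>

// Divide by the larger magnitude so qp*qp cannot overflow, and guard
// p == 0 so that hypot(0,0) is 0, not NaN.
double hypot_sketch(double x, double y)
{
  double ax = std::abs(x), ay = std::abs(y);
  double p  = (ax > ay) ? ax : ay;
  double q  = (ax > ay) ? ay : ax;
  if (p == 0.0) return 0.0;   // the guard added by this patch
  double qp = q / p;          // qp <= 1 by construction
  return p * std::sqrt(1.0 + qp*qp);
}

int main() { assert(hypot_sketch(0.0, 0.0) == 0.0); return 0; }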

View File

@ -409,7 +409,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
CoeffReadCost = (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
: (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic
: InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,
@ -484,7 +485,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
{
PacketScalar res;
typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
Unroll ? InnerSize-1 : Dynamic,
Unroll ? InnerSize : Dynamic,
LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl;
PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
@ -527,7 +528,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res);
}
};
@ -537,12 +538,12 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
{
etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
@ -551,7 +552,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
{
@ -559,14 +560,31 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(0);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
{
res = pset1<Packet>(0);
}
};
template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
eigen_assert(innerDim>0 && "you are using a non initialized matrix");
res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
for(Index i = 1; i < innerDim; ++i)
res = pset1<Packet>(0);
for(Index i = 0; i < innerDim; ++i)
res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
}
};
@ -576,9 +594,8 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
{
static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
{
eigen_assert(innerDim>0 && "you are using a non initialized matrix");
res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
for(Index i = 1; i < innerDim; ++i)
res = pset1<Packet>(0);
for(Index i = 0; i < innerDim; ++i)
res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
}
};
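The convention change is easiest to see in a scalar analogue: the unrolling parameter now counts the remaining terms rather than naming the last index, which gives a natural base case for an empty (inner-size-0) product, and the Dynamic paths start from a zero accumulator for the same reason. A sketch, not the actual Eigen kernel:

// N counts remaining terms; N == 0 means "empty product", which is 0.
template<int N>
struct dot_unrolled
{
  template<typename T>
  static T run(const T* a, const T* b)
  { return dot_unrolled<N-1>::run(a, b) + a[N-1] * b[N-1]; }
};

template<>
struct dot_unrolled<0>
{
  template<typename T>
  static T run(const T*, const T*) { return T(0); }
};

// dot_unrolled<3>::run(a, b) == a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
// dot_unrolled<0>::run(a, b) == 0  (previously this case hit an assert)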
@ -678,8 +695,7 @@ public:
//_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit
//(int(MatrixFlags)&int(DiagFlags)&AlignedBit),
Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0)
};
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)

View File

@ -200,17 +200,82 @@ DenseBase<Derived>::reverse() const
* In most cases it is probably better to simply use the reversed expression
* of a matrix. However, when reversing the matrix data itself is really needed,
* then this "in-place" version is probably the right choice because it provides
* the following additional features:
* the following additional benefits:
* - less error prone: doing the same operation with .reverse() requires special care:
* \code m = m.reverse().eval(); \endcode
* - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
* - this API enables reverse operations without the need for a temporary
* - it allows future optimizations (cache friendliness, etc.)
*
* \sa reverse() */
* \sa VectorwiseOp::reverseInPlace(), reverse() */
template<typename Derived>
inline void DenseBase<Derived>::reverseInPlace()
{
derived() = derived().reverse().eval();
if(cols()>rows())
{
Index half = cols()/2;
leftCols(half).swap(rightCols(half).reverse());
if((cols()%2)==1)
{
Index half2 = rows()/2;
col(half).head(half2).swap(col(half).tail(half2).reverse());
}
}
else
{
Index half = rows()/2;
topRows(half).swap(bottomRows(half).reverse());
if((rows()%2)==1)
{
Index half2 = cols()/2;
row(half).head(half2).swap(row(half).tail(half2).reverse());
}
}
}
namespace internal {
template<int Direction>
struct vectorwise_reverse_inplace_impl;
template<>
struct vectorwise_reverse_inplace_impl<Vertical>
{
template<typename ExpressionType>
static void run(ExpressionType &xpr)
{
Index half = xpr.rows()/2;
xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
}
};
template<>
struct vectorwise_reverse_inplace_impl<Horizontal>
{
template<typename ExpressionType>
static void run(ExpressionType &xpr)
{
Index half = xpr.cols()/2;
xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
}
};
} // end namespace internal
/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this.
*
* In most cases it is probably better to simply use the reversed expression
* of a matrix. However, when reversing the matrix data itself is really needed,
* then this "in-place" version is probably the right choice because it provides
* the following additional benefits:
* - less error prone: doing the same operation with .reverse() requires special care:
* \code m = m.reverse().eval(); \endcode
* - this API enables reverse operations without the need for a temporary
*
* \sa DenseBase::reverseInPlace(), reverse() */
template<typename ExpressionType, int Direction>
void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
{
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
}
} // end namespace Eigen
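A usage sketch of the in-place reverses touched here (the colwise/rowwise variants are the ones added by this commit):

#include <Eigen/Dense>
#include <cassert>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(5,4), ref = m;

  m.reverseInPlace();                    // whole-matrix reverse, via swaps
  assert(m.isApprox(ref.reverse().eval()));

  m = ref;
  m.colwise().reverseInPlace();          // reverse each column in place
  assert(m.isApprox(ref.colwise().reverse().eval()));

  m = ref;
  m.rowwise().reverseInPlace();          // reverse each row in place
  assert(m.isApprox(ref.rowwise().reverse().eval()));
  return 0;
}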

View File

@ -38,13 +38,17 @@ public:
template<int StoreMode, int LoadMode>
void assignPacket(Index row, Index col)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col));
PacketScalar tmp = m_src.template packet<LoadMode>(row,col);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode>(row,col));
m_dst.template writePacket<StoreMode>(row,col,tmp);
}
template<int StoreMode, int LoadMode>
void assignPacket(Index index)
{
m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index));
PacketScalar tmp = m_src.template packet<LoadMode>(index);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode>(index));
m_dst.template writePacket<StoreMode>(index,tmp);
}
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
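The swapPacket functor hook (removed in the functor diff below) is replaced by spelling the swap out in the kernel itself: load the source packet, write the destination packet into the source, then write the saved packet into the destination. Schematically:

// Three-step swap, one SIMD packet at a time (schematic, not the real kernel).
template<typename Packet>
inline void swap_packets(Packet& dst, Packet& src)
{
  Packet tmp = src;
  src = dst;
  dst = tmp;
}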

View File

@ -562,6 +562,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
void normalize() {
m_matrix = this->normalized();
}
inline void reverseInPlace();
/////////// Geometry module ///////////

View File

@ -150,14 +150,6 @@ template<typename Scalar> struct swap_assign_op {
swap(a,const_cast<Scalar&>(b));
#endif
}
template<int LhsAlignment, int RhsAlignment, typename Packet>
EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const
{
Packet tmp = internal::ploadt<Packet,RhsAlignment>(b);
internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a));
internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp);
}
};
template<typename Scalar>
struct functor_traits<swap_assign_op<Scalar> > {

View File

@ -249,10 +249,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
actual_lm = l2;
max_mc = 576;
}
Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
if (mc > Traits::mr) mc -= mc % Traits::mr;
else if (mc==0) return;
m = (m%mc)==0 ? mc
: (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
}
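A hedged numeric walk-through of the mc rounding above, with made-up cache numbers:

// Suppose actual_lm = 1 MiB, k = 256, sizeof(LhsScalar) = 8, Traits::mr = 8:
//   mc = min(1048576 / (3*256*8), 576) = min(170, 576) = 170
//   170 > mr, so mc -= 170 % 8  ->  mc = 168, a multiple of mr.
// If mc had come out 0, the heuristic gives up and returns early.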

View File

@ -79,6 +79,14 @@ template <typename LhsScalar,
typename RhsScalar>
bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads)
{
if (num_threads > 1) {
// We don't currently have lookup tables recorded for multithread performance,
// and we have confirmed experimentally that our single-thread-recorded LUTs are
// poor for multithread performance, and our LUTs don't currently contain
// any annotation about multithread status (FIXME - we need that).
// So for now, we just early-return here.
return false;
}
return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads);
}

View File

@ -213,7 +213,8 @@
#endif
/// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android
#if defined(__ANDROID__)
// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
#if defined(__ANDROID__) || defined(ANDROID)
#define EIGEN_OS_ANDROID 1
#else
#define EIGEN_OS_ANDROID 0

View File

@ -417,7 +417,7 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
{
Scalar t0 = m_matT.coeff(i+1, i);
Scalar t1 = m_matT.coeff(i, i+1);
Scalar maxval = numext::maxi(abs(p),numext::maxi(abs(t0),abs(t1)));
Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1)));
t0 /= maxval;
t1 /= maxval;
Scalar p0 = p/maxval;
@ -608,7 +608,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
}
// Overflow control
Scalar t = numext::maxi(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
if ((eps * t) * t > Scalar(1))
m_matT.block(i, n-1, size-i, 2) /= t;
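The explicit <Scalar> arguments added here work around template argument deduction: numext::maxi deduces a single type parameter from both arguments, so if the two arguments have different (or proxy) types deduction fails; pinning the parameter converts both sides instead. The same pattern with std::max as a stand-in:

#include <algorithm>

double pick_larger(float a, double b)
{
  // std::max(a, b) would not compile: T is deduced as both float and double.
  return std::max<double>(a, b);  // explicit argument: both sides convert
}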

View File

@ -240,10 +240,10 @@ namespace Eigen {
m_S.coeffRef(i,j) = Scalar(0.0);
m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint());
m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint());
// update Q
if (m_computeQZ)
m_Q.applyOnTheRight(i-1,i,G);
}
// update Q
if (m_computeQZ)
m_Q.applyOnTheRight(i-1,i,G);
// kill T(i,i-1)
if(m_T.coeff(i,i-1)!=Scalar(0))
{
@ -251,10 +251,10 @@ namespace Eigen {
m_T.coeffRef(i,i-1) = Scalar(0.0);
m_S.applyOnTheRight(i,i-1,G);
m_T.topRows(i).applyOnTheRight(i,i-1,G);
// update Z
if (m_computeQZ)
m_Z.applyOnTheLeft(i,i-1,G.adjoint());
}
// update Z
if (m_computeQZ)
m_Z.applyOnTheLeft(i,i-1,G.adjoint());
}
}
}

View File

@ -161,8 +161,8 @@ class QuaternionBase : public RotationBase<Derived, 3>
bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
{ return coeffs().isApprox(other.coeffs(), prec); }
/** return the result vector of \a v through the rotation*/
EIGEN_STRONG_INLINE Vector3 _transformVector(Vector3 v) const;
/** return the result vector of \a v through the rotation*/
EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
/** \returns \c *this with scalar type casted to \a NewScalarType
*
@ -462,7 +462,7 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const Quaterni
*/
template <class Derived>
EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
QuaternionBase<Derived>::_transformVector(Vector3 v) const
QuaternionBase<Derived>::_transformVector(const Vector3& v) const
{
// Note that this algorithm comes from the optimization by hand
// of the conversion to a Matrix followed by a Matrix/Vector product.
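_transformVector now takes its argument by const reference instead of by value. It is an internal helper; the public entry point is quaternion-times-vector, sketched here:

#include <Eigen/Geometry>
#include <cmath>
#include <iostream>

int main()
{
  // A 90-degree rotation about Z maps the X axis onto the Y axis.
  Eigen::Quaterniond q(Eigen::AngleAxisd(M_PI/2, Eigen::Vector3d::UnitZ()));
  Eigen::Vector3d v(1, 0, 0);
  Eigen::Vector3d w = q * v;           // dispatches to _transformVector
  std::cout << w.transpose() << "\n";  // ~ (0 1 0); the norm is preserved
  return 0;
}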

View File

@ -84,6 +84,8 @@ public:
typedef Matrix<RealScalar, Dynamic, 1> VectorType;
typedef Array<RealScalar, Dynamic, 1> ArrayXr;
typedef Array<Index,1,Dynamic> ArrayXi;
typedef Ref<ArrayXr> ArrayRef;
typedef Ref<ArrayXi> IndicesRef;
/** \brief Default Constructor.
*
@ -159,21 +161,23 @@ private:
void allocate(Index rows, Index cols, unsigned int computationOptions);
void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
void computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, VectorType& singVals, ArrayXr& shifts, ArrayXr& mus);
void perturbCol0(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat);
void computeSingVecs(const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V);
void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals, ArrayRef shifts, ArrayRef mus);
void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat);
void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V);
void deflation43(Index firstCol, Index shift, Index i, Index size);
void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
void copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naivev);
static void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1);
static RealScalar secularEq(RealScalar x, const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift);
void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1);
static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift);
protected:
MatrixXr m_naiveU, m_naiveV;
MatrixXr m_computed;
Index m_nRec;
ArrayXr m_workspace;
ArrayXi m_workspaceI;
int m_algoswap;
bool m_isTranspose, m_compU, m_compV;
@ -212,6 +216,9 @@ void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computati
else m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 );
if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize);
m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3);
m_workspaceI.resize(3*m_diagSize);
}// end allocate
template<typename MatrixType>
@ -223,6 +230,19 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
allocate(matrix.rows(), matrix.cols(), computationOptions);
using std::abs;
//**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
if(matrix.cols() < m_algoswap)
{
// FIXME this line involves temporaries
JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
if(computeU()) m_matrixU = jsvd.matrixU();
if(computeV()) m_matrixV = jsvd.matrixV();
m_singularValues = jsvd.singularValues();
m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
m_isInitialized = true;
return *this;
}
//**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
RealScalar scale = matrix.cwiseAbs().maxCoeff();
if(scale==RealScalar(0)) scale = RealScalar(1);
@ -231,11 +251,13 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
else copy = matrix/scale;
//**** step 1 - Bidiagonalization
// FIXME this line involves temporaries
internal::UpperBidiagonalization<MatrixX> bid(copy);
//**** step 2 - Divide & Conquer
m_naiveU.setZero();
m_naiveV.setZero();
// FIXME this line involves a temporary matrix
m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
m_computed.template bottomRows<1>().setZero();
divide(0, m_diagSize - 1, 0, 0, 0);
@ -257,6 +279,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
break;
}
}
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
// std::cout << "m_naiveU\n" << m_naiveU << "\n\n";
// std::cout << "m_naiveV\n" << m_naiveV << "\n\n";
@ -279,14 +302,14 @@ void BDCSVD<MatrixType>::copyUV(const HouseholderU &householderU, const Househol
Index Ucols = m_computeThinU ? m_diagSize : householderU.cols();
m_matrixU = MatrixX::Identity(householderU.cols(), Ucols);
m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
householderU.applyThisOnTheLeft(m_matrixU);
householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer
}
if (computeV())
{
Index Vcols = m_computeThinV ? m_diagSize : householderV.cols();
m_matrixV = MatrixX::Identity(householderV.cols(), Vcols);
m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
householderV.applyThisOnTheLeft(m_matrixV);
householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer
}
}
@ -307,7 +330,10 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
// If the matrices are large enough, let's exploit the sparse structure of A by
// splitting it in half (wrt n1), and packing the non-zero columns.
Index n2 = n - n1;
MatrixXr A1(n1,n), A2(n2,n), B1(n,n), B2(n,n);
Map<MatrixXr> A1(m_workspace.data() , n1, n);
Map<MatrixXr> A2(m_workspace.data()+ n1*n, n2, n);
Map<MatrixXr> B1(m_workspace.data()+ n*n, n, n);
Map<MatrixXr> B2(m_workspace.data()+2*n*n, n, n);
Index k1=0, k2=0;
for(Index j=0; j<n; ++j)
{
@ -329,7 +355,11 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
}
else
A *= B; // FIXME this requires a temporary
{
Map<MatrixXr,Aligned> tmp(m_workspace.data(),n,n);
tmp.noalias() = A*B;
A = tmp;
}
}
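This is the pattern used throughout the BDCSVD changes in this commit: heap-allocated temporaries are replaced by Map views carved out of one preallocated workspace. A minimal sketch of the technique (the caller is assumed to have sized the workspace; names are illustrative):

#include <Eigen/Dense>

void use_workspace(Eigen::ArrayXd& workspace, Eigen::Index n1, Eigen::Index n2, Eigen::Index n)
{
  // The maps alias disjoint slices of a single buffer, so no heap
  // allocation happens here, unlike declaring MatrixXd temporaries.
  Eigen::Map<Eigen::MatrixXd> A1(workspace.data(),        n1, n);
  Eigen::Map<Eigen::MatrixXd> A2(workspace.data() + n1*n, n2, n);
  Eigen::Map<Eigen::MatrixXd> B1(workspace.data() + n*n,  n,  n);
  A1.setZero(); A2.setZero(); B1.setZero();
}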
// The divide algorithm is done "in place": we are always working on subsets of the same matrix. The divide method takes as arguments the
@ -360,7 +390,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
// matrices.
if (n < m_algoswap)
{
JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0)) ;
// FIXME this line involves temporaries
JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
if (m_compU)
m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
else
@ -438,7 +469,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
}
else
{
RealScalar q1 = (m_naiveU(0, firstCol + k));
RealScalar q1 = m_naiveU(0, firstCol + k);
// we shift Q1 to the right
for (Index i = firstCol + k - 1; i >= firstCol; i--)
m_naiveU(0, i + 1) = m_naiveU(0, i);
@ -491,8 +522,14 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
assert(VofSVD.allFinite());
#endif
if (m_compU) structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
else m_naiveU.middleCols(firstCol, n + 1) *= UofSVD; // FIXME this requires a temporary, and exploit that there are 2 rows at compile time
if (m_compU)
structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
else
{
Map<Matrix<RealScalar,2,Dynamic>,Aligned> tmp(m_workspace.data(),2,n+1);
tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD;
m_naiveU.middleCols(firstCol, n + 1) = tmp;
}
if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2);
@ -517,10 +554,9 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
template <typename MatrixType>
void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
{
// TODO Get rid of these copies (?)
// FIXME at least preallocate them
ArrayXr col0 = m_computed.col(firstCol).segment(firstCol, n);
ArrayXr diag = m_computed.block(firstCol, firstCol, n, n).diagonal();
ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
ArrayRef diag = m_workspace.head(n);
diag(0) = 0;
// Allocate space for singular values and vectors
@ -539,13 +575,14 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
Index actual_n = n;
while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
Index m = 0; // size of the deflated problem
ArrayXi perm(actual_n);
for(Index k=0;k<actual_n;++k)
if(col0(k)!=0)
perm(m++) = k;
perm.conservativeResize(m);
m_workspaceI(m++) = k;
Map<ArrayXi> perm(m_workspaceI.data(),m);
ArrayXr shifts(n), mus(n), zhat(n);
Map<ArrayXr> shifts(m_workspace.data()+1*n, n);
Map<ArrayXr> mus(m_workspace.data()+2*n, n);
Map<ArrayXr> zhat(m_workspace.data()+3*n, n);
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "computeSVDofM using:\n";
@ -622,8 +659,8 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
// Reverse order so that the singular values are in increasing order
// Because of deflation, the zero singular values are already at the end
singVals.head(actual_n).reverseInPlace();
U.leftCols(actual_n) = U.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary
if (m_compV) V.leftCols(actual_n) = V.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary
U.leftCols(actual_n).rowwise().reverseInPlace();
if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) );
@ -634,7 +671,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
}
template <typename MatrixType>
typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift)
typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift)
{
Index m = perm.size();
RealScalar res = 1;
@ -647,8 +684,8 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar
}
template <typename MatrixType>
void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm,
VectorType& singVals, ArrayXr& shifts, ArrayXr& mus)
void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm,
VectorType& singVals, ArrayRef shifts, ArrayRef mus)
{
using std::abs;
using std::swap;
@ -703,7 +740,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right;
// measure everything relative to shift
ArrayXr diagShifted = diag - shift;
Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
diagShifted = diag - shift;
// initial guess
RealScalar muPrev, muCur;
@ -730,7 +768,7 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
// rational interpolation: fit a function of the form a / mu + b through the two previous
// iterates and use its zero to compute the next iterate
bool useBisection = fPrev*fCur>0;
while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
{
++m_numIters;
@ -773,7 +811,10 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
}
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
#endif
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(!(fLeft * fRight<0))
@ -781,14 +822,13 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
#endif
eigen_internal_assert(fLeft * fRight < 0);
while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted)))
while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
{
RealScalar midShifted = (leftShifted + rightShifted) / 2;
RealScalar fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
if (fLeft * fMid < 0)
{
rightShifted = midShifted;
fRight = fMid;
}
else
{
@ -816,8 +856,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia
// zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
template <typename MatrixType>
void BDCSVD<MatrixType>::perturbCol0
(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals,
const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat)
(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat)
{
using std::sqrt;
Index n = col0.size();
@ -865,8 +905,8 @@ void BDCSVD<MatrixType>::perturbCol0
// compute singular vectors
template <typename MatrixType>
void BDCSVD<MatrixType>::computeSingVecs
(const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals,
const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V)
(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V)
{
Index n = zhat.size();
Index m = perm.size();
@ -991,7 +1031,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag;
RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag);
RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
assert(m_naiveU.allFinite());
@ -1047,7 +1087,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
// Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
// First, compute the respective permutation.
Index *permutation = new Index[length]; // FIXME avoid repeated dynamic memory allocation
Index *permutation = m_workspaceI.data();
{
permutation[0] = 0;
Index p = 1;
@ -1084,8 +1124,8 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
}
// Current index of each col, and current column of each index
Index *realInd = new Index[length]; // FIXME avoid repeated dynamic memory allocation
Index *realCol = new Index[length]; // FIXME avoid repeated dynamic memory allocation
Index *realInd = m_workspaceI.data()+length;
Index *realCol = m_workspaceI.data()+2*length;
for(int pos = 0; pos< length; pos++)
{
@ -1115,9 +1155,6 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
realInd[J] = realI;
realInd[i] = pi;
}
delete[] permutation;
delete[] realInd;
delete[] realCol;
}
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";

View File

@ -425,12 +425,13 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
// If d!=0, then t/d cannot overflow because the magnitude of the
// entries forming d are not too small compared to the ones forming t.
RealScalar u = t / d;
rot1.s() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u));
rot1.c() = rot1.s() * u;
RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
rot1.s() = RealScalar(1) / tmp;
rot1.c() = u / tmp;
}
m.applyOnTheLeft(0,1,rot1);
j_right->makeJacobi(m,0,1);
*j_left = rot1 * j_right->transpose();
*j_left = rot1 * j_right->transpose();
}
template<typename _MatrixType, int QRPreconditioner>
@ -680,6 +681,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
// limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
// FIXME What about considering any denormal number as zero, using:
// const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
// Scaling factor to reduce over/under-flows
@ -719,8 +722,9 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
// if this 2x2 sub-matrix is not diagonal already...
// notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
// keep us iterating forever. Similarly, small denormal numbers are considered zero.
RealScalar threshold = numext::maxi(considerAsZero, precision * numext::maxi(abs(m_workMatrix.coeff(p,p)),
abs(m_workMatrix.coeff(q,q))));
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero,
precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)),
abs(m_workMatrix.coeff(q,q))));
// We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
{

View File

@ -30,16 +30,16 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
std::memset(mask,0,sizeof(bool)*rows);
typename evaluator<Lhs>::type lhsEval(lhs);
typename evaluator<Rhs>::type rhsEval(rhs);
// Estimate the number of non-zero entries:
// given a rhs column containing Y non-zeros, we assume that the respective Y columns
// of the lhs differ on average by one non-zero; thus the number of non-zeros for
// the product of a rhs column with the lhs is X+Y, where X is the average number
// of non-zeros per column of the lhs.
// Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
typename evaluator<Lhs>::type lhsEval(lhs);
typename evaluator<Rhs>::type rhsEval(rhs);
Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
res.setZero();
res.reserve(Index(estimated_nnz_prod));

View File

@ -90,7 +90,8 @@ class sparse_matrix_block_impl
typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
public:
enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
typedef SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> > Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
protected:
typedef typename Base::IndexVector IndexVector;
enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
@ -198,20 +199,9 @@ public:
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
inline const StorageIndex* innerNonZeroPtr() const
{ return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
{ return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
inline StorageIndex* innerNonZeroPtr()
{ return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
Index nonZeros() const
{
if(m_matrix.isCompressed())
return ( (m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
- (m_matrix.outerIndexPtr()[m_outerStart]));
else if(m_outerSize.value()==0)
return 0;
else
return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
}
{ return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); }
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
@ -233,7 +223,7 @@ public:
const Scalar& lastCoeff() const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
eigen_assert(nonZeros()>0);
eigen_assert(Base::nonZeros()>0);
if(m_matrix.isCompressed())
return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
else
@ -339,17 +329,6 @@ SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
}
namespace internal {
template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
| // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
// revert to || as soon as not needed anymore.
(BlockRows==1 && !XprType::IsRowMajor)>
class GenericSparseBlockInnerIteratorImpl;
}
/** Generic implementation of sparse Block expression.
* Real-only.
*/
@ -415,8 +394,11 @@ public:
Index blockCols() const { return m_blockCols.value(); }
protected:
friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
// friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
friend class ReverseInnerIterator;
friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
Index nonZeros() const { return Dynamic; }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
@ -429,94 +411,6 @@ public:
};
namespace internal {
template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,false> : public Block<XprType, BlockRows, BlockCols, InnerPanel>::_MatrixTypeNested::InnerIterator
{
typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
enum {
IsRowMajor = BlockType::IsRowMajor
};
typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
typedef typename BlockType::StorageIndex StorageIndex;
typedef typename _MatrixTypeNested::InnerIterator Base;
const BlockType& m_block;
Index m_end;
public:
EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer)
: Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
m_block(block),
m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
{
while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
Base::operator++();
}
inline Index index() const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
inline Index outer() const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
inline Index row() const { return Base::row() - m_block.m_startRow.value(); }
inline Index col() const { return Base::col() - m_block.m_startCol.value(); }
inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
};
// Row vector of a column-major sparse matrix or column of a row-major one.
template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,true>
{
typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
enum {
IsRowMajor = BlockType::IsRowMajor
};
typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
typedef typename BlockType::StorageIndex StorageIndex;
typedef typename BlockType::Scalar Scalar;
const BlockType& m_block;
Index m_outerPos;
Index m_innerIndex;
Scalar m_value;
Index m_end;
public:
explicit EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer = 0)
:
m_block(block),
m_outerPos( (IsRowMajor ? block.m_startCol.value() : block.m_startRow.value()) - 1), // -1 so that operator++ finds the first non-zero entry
m_innerIndex(IsRowMajor ? block.m_startRow.value() : block.m_startCol.value()),
m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
{
EIGEN_UNUSED_VARIABLE(outer);
eigen_assert(outer==0);
++(*this);
}
inline Index index() const { return m_outerPos - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
inline Index outer() const { return 0; }
inline Index row() const { return IsRowMajor ? 0 : index(); }
inline Index col() const { return IsRowMajor ? index() : 0; }
inline Scalar value() const { return m_value; }
inline GenericSparseBlockInnerIteratorImpl& operator++()
{
// search next non-zero entry
while(++m_outerPos<m_end)
{
typename XprType::InnerIterator it(m_block.m_matrix, m_outerPos);
// search for the key m_innerIndex in the current outer-vector
while(it && it.index() < m_innerIndex) ++it;
if(it && it.index()==m_innerIndex)
{
m_value = it.value();
break;
}
}
return *this;
}
inline operator bool() const { return m_outerPos < m_end; }
};
template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased >
@ -548,9 +442,16 @@ struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBa
explicit unary_evaluator(const XprType& op)
: m_argImpl(op.nestedExpression()), m_block(op)
{}
inline Index nonZerosEstimate() const {
Index nnz = m_block.nonZeros();
if(nnz<0)
return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size();
return nnz;
}
protected:
typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
typename evaluator<ArgType>::nestedType m_argImpl;
const XprType &m_block;
@ -595,6 +496,7 @@ public:
: m_eval(aEval),
m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) - 1), // -1 so that operator++ finds the first non-zero entry
m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
m_value(0),
m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
{
EIGEN_UNUSED_VARIABLE(outer);

View File

@ -35,6 +35,25 @@ class SparseCompressedBase
class InnerIterator;
class ReverseInnerIterator;
protected:
typedef typename Base::IndexVector IndexVector;
Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
public:
/** \returns the number of non zero coefficients */
inline Index nonZeros() const
{
if(isCompressed())
return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0];
else if(derived().outerSize()==0)
return 0;
else
return innerNonZeros().sum();
}
/** \returns a const pointer to the array of values.
* This function is aimed at interoperability with other libraries.
* \sa innerIndexPtr(), outerIndexPtr() */
@ -165,6 +184,10 @@ struct evaluator<SparseCompressedBase<Derived> >
evaluator() : m_matrix(0) {}
explicit evaluator(const Derived &mat) : m_matrix(&mat) {}
inline Index nonZerosEstimate() const {
return m_matrix->nonZeros();
}
operator Derived&() { return m_matrix->const_cast_derived(); }
operator const Derived&() const { return *m_matrix; }
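nonZeros() moves up into SparseCompressedBase, and evaluators gain a separate nonZerosEstimate(): exact for materialized matrices (as here), a cheap bound for lazy expressions in the file sections that follow. The public API is unchanged; a small sketch:

#include <Eigen/Sparse>
#include <iostream>

int main()
{
  Eigen::SparseMatrix<double> a(10,10);
  a.insert(0,0) = 1.0;
  a.insert(5,7) = 2.0;
  a.makeCompressed();
  std::cout << a.nonZeros() << "\n";  // 2, exact, now served by the base class
  // An expression such as 2*a is not evaluated to count its non-zeros:
  // its evaluator just forwards the estimate of a.
  std::cout << (2*a).eval().nonZeros() << "\n";  // 2
  return 0;
}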

View File

@ -121,6 +121,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate();
}
protected:
const BinaryOp m_functor;
@ -198,6 +202,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate());
}
protected:
const BinaryOp m_functor;
@ -243,7 +251,7 @@ public:
EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
protected:
const LhsEvaluator &m_lhsEval;
RhsIterator m_rhsIter;
@ -262,6 +270,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return m_rhsImpl.nonZerosEstimate();
}
protected:
const BinaryOp m_functor;
@ -308,7 +320,7 @@ public:
EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
protected:
LhsIterator m_lhsIter;
const RhsEvaluator &m_rhsEval;
@ -327,6 +339,10 @@ public:
m_lhsImpl(xpr.lhs()),
m_rhsImpl(xpr.rhs())
{ }
inline Index nonZerosEstimate() const {
return m_lhsImpl.nonZerosEstimate();
}
protected:
const BinaryOp m_functor;

View File

@ -30,6 +30,10 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
};
explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {}
inline Index nonZerosEstimate() const {
return m_argImpl.nonZerosEstimate();
}
protected:
typedef typename evaluator<ArgType>::InnerIterator EvalIterator;

View File

@ -105,9 +105,6 @@ class SparseMapBase<Derived,ReadOnlyAccessors>
return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
}
/** \returns the number of non zero coefficients */
inline Index nonZeros() const { return m_nnz; }
inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
: m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),

View File

@ -95,6 +95,7 @@ class SparseMatrix
public:
typedef SparseCompressedBase<SparseMatrix> Base;
using Base::isCompressed;
using Base::nonZeros;
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
@ -122,9 +123,6 @@ class SparseMatrix
StorageIndex* m_outerIndex;
StorageIndex* m_innerNonZeros; // optional, if null then the data is compressed
Storage m_data;
Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
public:
@ -252,14 +250,6 @@ class SparseMatrix
memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
}
/** \returns the number of non zero coefficients */
inline Index nonZeros() const
{
if(m_innerNonZeros)
return innerNonZeros().sum();
return convert_index(Index(m_data.size()));
}
/** Preallocates \a reserveSize non zeros.
*
* Precondition: the matrix must be in compressed mode. */

View File

@ -149,9 +149,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
/** \returns the number of coefficients, which is \a rows()*cols().
* \sa rows(), cols(). */
inline Index size() const { return rows() * cols(); }
/** \returns the number of nonzero coefficients which is in practice the number
* of stored coefficients. */
inline Index nonZeros() const { return derived().nonZeros(); }
/** \returns true if either the number of rows or the number of columns is equal to 1.
* In other words, this function returns
* \code rows()==1 || cols()==1 \endcode

View File

@ -33,14 +33,6 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
// allocate a temporary buffer
AmbiVector<Scalar,StorageIndex> tempVector(rows);
// estimate the number of non zero entries
// given a rhs column containing Y non zeros, we assume that the respective Y columns
// of the lhs differs in average of one non zeros, thus the number of non zeros for
// the product of a rhs column with the lhs is X+Y where X is the average number of non zero
// per column of the lhs.
// Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
// mimics a resizeByInnerOuter:
if(ResultType::IsRowMajor)
res.resize(cols, rows);
@ -49,6 +41,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
typename evaluator<Lhs>::type lhsEval(lhs);
typename evaluator<Rhs>::type rhsEval(rhs);
// Estimate the number of non-zero entries:
// given a rhs column containing Y non-zeros, we assume that the respective Y columns
// of the lhs differ on average by one non-zero; thus the number of non-zeros for
// the product of a rhs column with the lhs is X+Y, where X is the average number
// of non-zeros per column of the lhs.
// Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
res.reserve(estimated_nnz_prod);
double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols());

View File

@ -40,15 +40,11 @@ namespace internal {
};
}
// Implement nonZeros() for transpose. I'm not sure that's the best approach for that.
// Perhaps it should be implemented in Transpose<> itself.
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
: public internal::SparseTransposeImpl<MatrixType>
{
protected:
typedef internal::SparseTransposeImpl<MatrixType> Base;
public:
inline Index nonZeros() const { return Base::derived().nestedExpression().nonZeros(); }
};
namespace internal {
@ -61,6 +57,10 @@ struct unary_evaluator<Transpose<ArgType>, IteratorBased>
typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
public:
typedef Transpose<ArgType> XprType;
inline Index nonZerosEstimate() const {
return m_argImpl.nonZerosEstimate();
}
class InnerIterator : public EvalIterator
{

View File

@ -50,13 +50,6 @@ protected:
template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
inline Index nonZeros() const {
// FIXME HACK number of nonZeros is required for product logic
// this returns only an upper bound (but should be OK for most purposes)
return derived().nestedExpression().nonZeros();
}
};
@ -191,6 +184,10 @@ public:
explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()) {}
inline Index nonZerosEstimate() const {
return m_argImpl.nonZerosEstimate();
}
class InnerIterator : public EvalIterator
{
typedef EvalIterator Base;

View File

@ -442,6 +442,10 @@ struct evaluator<SparseVector<_Scalar,_Options,_Index> >
explicit evaluator(const SparseVectorType &mat) : m_matrix(mat) {}
inline Index nonZerosEstimate() const {
return m_matrix.nonZeros();
}
operator SparseVectorType&() { return m_matrix.const_cast_derived(); }
operator const SparseVectorType&() const { return m_matrix; }

View File

@ -302,6 +302,7 @@ class SuperLUBase : public SparseSolverBase<Derived>
typedef Matrix<Scalar,Dynamic,1> Vector;
typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
typedef Map<PermutationMatrix<Dynamic,Dynamic,int> > PermutationMap;
typedef SparseMatrix<Scalar> LUMatrixType;
public:
@ -459,10 +460,11 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
typedef typename Base::RealScalar RealScalar;
typedef typename Base::StorageIndex StorageIndex;
typedef typename Base::IntRowVectorType IntRowVectorType;
typedef typename Base::IntColVectorType IntColVectorType;
typedef typename Base::IntColVectorType IntColVectorType;
typedef typename Base::PermutationMap PermutationMap;
typedef typename Base::LUMatrixType LUMatrixType;
typedef TriangularView<LUMatrixType, Lower|UnitDiag> LMatrixType;
typedef TriangularView<LUMatrixType, Upper> UMatrixType;
typedef TriangularView<LUMatrixType, Upper> UMatrixType;
public:
using Base::_solve_impl;
@ -774,6 +776,8 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
det *= m_u.valuePtr()[lastId];
}
}
if(PermutationMap(m_p.data(),m_p.size()).determinant()*PermutationMap(m_q.data(),m_q.size()).determinant()<0)
det = -det;
if(m_sluEqued!='N')
return det/m_sluRscale.prod()/m_sluCscale.prod();
else

View File

@ -26,7 +26,7 @@ macro(_metis_check_version)
string(REGEX MATCH "define[ \t]+METIS_VER_SUBMINOR[ \t]+([0-9]+)" _metis_subminor_version_match "${_metis_version_header}")
set(METIS_SUBMINOR_VERSION "${CMAKE_MATCH_1}")
if(NOT METIS_MAJOR_VERSION)
message(WARNING "Could not determine Metis version. Assuming version 4.0.0")
message(STATUS "Could not determine Metis version. Assuming version 4.0.0")
set(METIS_VERSION 4.0.0)
else()
set(METIS_VERSION ${METIS_MAJOR_VERSION}.${METIS_MINOR_VERSION}.${METIS_SUBMINOR_VERSION})

View File

@ -10,9 +10,10 @@ if(QT4_FOUND)
target_link_libraries(Tutorial_sparse_example ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${QT_QTCORE_LIBRARY} ${QT_QTGUI_LIBRARY})
add_custom_command(
TARGET Tutorial_sparse_example
POST_BUILD
COMMAND Tutorial_sparse_example ARGS ${CMAKE_CURRENT_BINARY_DIR}/../html/Tutorial_sparse_example.jpeg
TARGET Tutorial_sparse_example
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/../html/
COMMAND Tutorial_sparse_example ARGS ${CMAKE_CURRENT_BINARY_DIR}/../html/Tutorial_sparse_example.jpeg
)
add_dependencies(all_examples Tutorial_sparse_example)

View File

@ -332,3 +332,8 @@ endif(EIGEN_TEST_NVCC)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF)
if(EIGEN_TEST_BUILD_DOCUMENTATION)
add_dependencies(buildtests doc)
endif()

View File

@ -24,7 +24,7 @@ template<typename MatrixType> void reverse(const MatrixType& m)
// this test relies a lot on Random.h, and there's not much more that we can do
// to test it, hence I consider that we will have tested Random.h
MatrixType m1 = MatrixType::Random(rows, cols);
MatrixType m1 = MatrixType::Random(rows, cols), m2;
VectorType v1 = VectorType::Random(rows);
MatrixType m1_r = m1.reverse();
@@ -96,6 +96,26 @@ template<typename MatrixType> void reverse(const MatrixType& m)
m1.reverse()(r, c) = x;
VERIFY_IS_APPROX(x, m1(rows - 1 - r, cols - 1 - c));
m2 = m1;
m2.reverseInPlace();
VERIFY_IS_APPROX(m2,m1.reverse().eval());
m2 = m1;
m2.col(0).reverseInPlace();
VERIFY_IS_APPROX(m2.col(0),m1.col(0).reverse().eval());
m2 = m1;
m2.row(0).reverseInPlace();
VERIFY_IS_APPROX(m2.row(0),m1.row(0).reverse().eval());
m2 = m1;
m2.rowwise().reverseInPlace();
VERIFY_IS_APPROX(m2,m1.rowwise().reverse().eval());
m2 = m1;
m2.colwise().reverseInPlace();
VERIFY_IS_APPROX(m2,m1.colwise().reverse().eval());
/*
m1.colwise().reverse()(r, c) = x;
@@ -113,11 +133,11 @@ void test_array_reverse()
CALL_SUBTEST_2( reverse(Matrix2f()) );
CALL_SUBTEST_3( reverse(Matrix4f()) );
CALL_SUBTEST_4( reverse(Matrix4d()) );
CALL_SUBTEST_5( reverse(MatrixXcf(3, 3)) );
CALL_SUBTEST_6( reverse(MatrixXi(6, 3)) );
CALL_SUBTEST_7( reverse(MatrixXcd(20, 20)) );
CALL_SUBTEST_5( reverse(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_6( reverse(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_7( reverse(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_8( reverse(Matrix<float, 100, 100>()) );
CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(6,3)) );
CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
#ifdef EIGEN_TEST_PART_3
Vector4f x; x << 1, 2, 3, 4;

View File

@@ -17,6 +17,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
typedef Matrix<Scalar, Rows, 1> VectorType;
typedef Matrix<Scalar, 1, Cols> RowVectorType;
typedef Matrix<Scalar, Rows, Rows> SquareMatrixType;
typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
typedef DiagonalMatrix<Scalar, Rows> LeftDiagonalMatrix;
typedef DiagonalMatrix<Scalar, Cols> RightDiagonalMatrix;
typedef Matrix<Scalar, Rows==Dynamic?Dynamic:2*Rows, Cols==Dynamic?Dynamic:2*Cols> BigMatrix;
@@ -64,6 +65,13 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
VERIFY_IS_APPROX( (((v1+v2).asDiagonal() * (m1+m2))(i,j)) , (v1+v2)(i) * (m1+m2)(i,j) );
VERIFY_IS_APPROX( ((m1 * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * m1(i,j) );
VERIFY_IS_APPROX( (((m1+m2) * (rv1+rv2).asDiagonal())(i,j)) , (rv1+rv2)(j) * (m1+m2)(i,j) );
if(rows>1)
{
DynMatrixType tmp = m1.topRows(rows/2), res;
VERIFY_IS_APPROX( (res = m1.topRows(rows/2) * rv1.asDiagonal()), tmp * rv1.asDiagonal() );
VERIFY_IS_APPROX( (res = v1.head(rows/2).asDiagonal()*m1.topRows(rows/2)), v1.head(rows/2).asDiagonal()*tmp );
}
BigMatrix big;
big.setZero(2*rows, 2*cols);
@@ -93,6 +101,17 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() );
}
template<int>
void bug987()
{
Matrix3Xd points = Matrix3Xd::Random(3, 3);
Vector2d diag = Vector2d::Random();
Matrix2Xd tmp1 = points.topRows<2>(), res1, res2;
VERIFY_IS_APPROX( res1 = diag.asDiagonal() * points.topRows<2>(), res2 = diag.asDiagonal() * tmp1 );
Matrix2d tmp2 = points.topLeftCorner<2,2>();
VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() );
}
void test_diagonalmatrices()
{
for(int i = 0; i < g_repeat; i++) {
@@ -106,4 +125,5 @@ void test_diagonalmatrices()
CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
CALL_SUBTEST_10( bug987<0>() );
}

View File

@@ -95,6 +95,9 @@
namespace Eigen
{
static std::vector<std::string> g_test_stack;
// level == 0 <=> abort if a test fails
// level >= 1 <=> print a warning message to std::cerr if a test fails
static int g_test_level = 0;
static int g_repeat;
static unsigned int g_seed;
static bool g_has_set_repeat, g_has_set_seed;
@@ -229,6 +232,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
{
if (!condition)
{
if(Eigen::g_test_level>0)
std::cerr << "WARNING: ";
std::cerr << "Test " << testname << " failed in " << file << " (" << line << ")"
<< std::endl << " " << condition_as_string << std::endl;
std::cerr << "Stack:\n";
@@ -236,7 +241,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
for(int i=test_stack_size-1; i>=0; --i)
std::cerr << " - " << Eigen::g_test_stack[i] << "\n";
std::cerr << "\n";
abort();
if(Eigen::g_test_level==0)
abort();
}
}
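
A minimal usage sketch of the new test level, following the pattern the SVD tests in this commit adopt (a and b stand for any values under test):

    ++g_test_level;          // a failure below prints "WARNING: ..." to std::cerr instead of aborting
    VERIFY_IS_APPROX(a, b);  // non-critical check
    --g_test_level;          // restore abort-on-failure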

View File

@@ -113,6 +113,9 @@ void mat_mat_scalar_scalar_product()
template <typename MatrixType>
void zero_sized_objects(const MatrixType& m)
{
typedef typename MatrixType::Scalar Scalar;
const int PacketSize = internal::packet_traits<Scalar>::size;
const int PacketSize1 = PacketSize>1 ? PacketSize-1 : 1;
Index rows = m.rows();
Index cols = m.cols();
@@ -132,9 +135,41 @@ void zero_sized_objects(const MatrixType& m)
res = b*a;
VERIFY(res.rows()==0 && res.cols()==cols);
}
{
Matrix<Scalar,PacketSize,0> a;
Matrix<Scalar,0,1> b;
Matrix<Scalar,PacketSize,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) );
}
{
Matrix<Scalar,PacketSize1,0> a;
Matrix<Scalar,0,1> b;
Matrix<Scalar,PacketSize1,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) );
}
{
Matrix<Scalar,PacketSize,Dynamic> a(PacketSize,0);
Matrix<Scalar,Dynamic,1> b(0,1);
Matrix<Scalar,PacketSize,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize,1) );
}
{
Matrix<Scalar,PacketSize1,Dynamic> a(PacketSize1,0);
Matrix<Scalar,Dynamic,1> b(0,1);
Matrix<Scalar,PacketSize1,1> res;
VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(PacketSize1,1) );
VERIFY_IS_APPROX( (res=a.lazyProduct(b)), MatrixType::Zero(PacketSize1,1) );
}
}
template<int>
void bug_127()
{
// Bug 127
@@ -159,6 +194,7 @@ void bug_127()
a*b;
}
template<int>
void unaligned_objects()
{
// Regression test for the bug reported here:
@@ -188,6 +224,29 @@ void unaligned_objects()
}
}
template<typename T>
EIGEN_DONT_INLINE
Index test_compute_block_size(Index m, Index n, Index k)
{
Index mc(m), nc(n), kc(k);
internal::computeProductBlockingSizes<T,T>(kc, mc, nc);
return kc+mc+nc;
}
template<typename T>
Index compute_block_size()
{
Index ret = 0;
ret += test_compute_block_size<T>(0,1,1);
ret += test_compute_block_size<T>(1,0,1);
ret += test_compute_block_size<T>(1,1,0);
ret += test_compute_block_size<T>(0,0,1);
ret += test_compute_block_size<T>(0,1,0);
ret += test_compute_block_size<T>(1,0,0);
ret += test_compute_block_size<T>(0,0,0);
return ret;
}
void test_product_extra()
{
for(int i = 0; i < g_repeat; i++) {
@@ -198,6 +257,9 @@ void test_product_extra()
CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
CALL_SUBTEST_5( bug_127() );
CALL_SUBTEST_6( unaligned_objects() );
CALL_SUBTEST_5( bug_127<0>() );
CALL_SUBTEST_6( unaligned_objects<0>() );
CALL_SUBTEST_7( compute_block_size<float>() );
CALL_SUBTEST_7( compute_block_size<double>() );
CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
}

View File

@@ -25,6 +25,22 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
MatrixType A = MatrixType::Random(dim,dim),
B = MatrixType::Random(dim,dim);
// Regression test for bug 985: Randomly set rows or columns to zero
Index k=internal::random<Index>(0, dim-1);
switch(internal::random<int>(0,10)) {
case 0:
A.row(k).setZero(); break;
case 1:
A.col(k).setZero(); break;
case 2:
B.row(k).setZero(); break;
case 3:
B.col(k).setZero(); break;
default:
break;
}
RealQZ<MatrixType> qz(A,B);
VERIFY_IS_EQUAL(qz.info(), Success);

View File

@@ -67,6 +67,9 @@ template<typename SparseMatrixType> void sparse_product()
VERIFY_IS_APPROX(m4 = m2*m3/s1, refMat4 = refMat2*refMat3/s1);
VERIFY_IS_APPROX(m4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1);
VERIFY_IS_APPROX(m4 = s2*m2*m3*s1, refMat4 = s2*refMat2*refMat3*s1);
VERIFY_IS_APPROX(m4 = (m2+m2)*m3, refMat4 = (refMat2+refMat2)*refMat3);
VERIFY_IS_APPROX(m4 = m2*m3.leftCols(cols/2), refMat4 = refMat2*refMat3.leftCols(cols/2));
VERIFY_IS_APPROX(m4 = m2*(m3+m3).leftCols(cols/2), refMat4 = refMat2*(refMat3+refMat3).leftCols(cols/2));
VERIFY_IS_APPROX(m4=(m2*m3).pruned(0), refMat4=refMat2*refMat3);
VERIFY_IS_APPROX(m4=(m2t.transpose()*m3).pruned(0), refMat4=refMat2t.transpose()*refMat3);
@@ -194,7 +197,7 @@ template<typename SparseMatrixType> void sparse_product()
VERIFY_IS_APPROX(d3=d1*m2.transpose(), refM3=d1*refM2.transpose());
}
// test self-adjoint and traingular-view products
// test self-adjoint and triangular-view products
{
DenseMatrix b = DenseMatrix::Random(rows, rows);
DenseMatrix x = DenseMatrix::Random(rows, rows);

View File

@@ -49,18 +49,39 @@ void svd_compare_to_full(const MatrixType& m,
unsigned int computationOptions,
const SvdType& referenceSvd)
{
typedef typename MatrixType::Index Index;
typedef typename MatrixType::RealScalar RealScalar;
Index rows = m.rows();
Index cols = m.cols();
Index diagSize = (std::min)(rows, cols);
RealScalar prec = test_precision<RealScalar>();
SvdType svd(m, computationOptions);
VERIFY_IS_APPROX(svd.singularValues(), referenceSvd.singularValues());
if(computationOptions & (ComputeFullV|ComputeThinV))
{
VERIFY( (svd.matrixV().adjoint()*svd.matrixV()).isIdentity(prec) );
VERIFY_IS_APPROX( svd.matrixV().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint(),
referenceSvd.matrixV().leftCols(diagSize) * referenceSvd.singularValues().asDiagonal() * referenceSvd.matrixV().leftCols(diagSize).adjoint());
}
if(computationOptions & (ComputeFullU|ComputeThinU))
{
VERIFY( (svd.matrixU().adjoint()*svd.matrixU()).isIdentity(prec) );
VERIFY_IS_APPROX( svd.matrixU().leftCols(diagSize) * svd.singularValues().cwiseAbs2().asDiagonal() * svd.matrixU().leftCols(diagSize).adjoint(),
referenceSvd.matrixU().leftCols(diagSize) * referenceSvd.singularValues().cwiseAbs2().asDiagonal() * referenceSvd.matrixU().leftCols(diagSize).adjoint());
}
// The following checks are not critical.
// For instance, with Divide&Conquer SVD, if only the factor 'V' is computed then a different matrix-matrix product implementation will be used
// and the resulting 'V' factor might be significantly different when the SVD decomposition is not unique, especially with single precision float.
++g_test_level;
if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU());
if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(svd.matrixU(), referenceSvd.matrixU().leftCols(diagSize));
if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV());
if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(svd.matrixV().cwiseAbs(), referenceSvd.matrixV().cwiseAbs());
if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(svd.matrixV(), referenceSvd.matrixV().leftCols(diagSize));
--g_test_level;
}
//
@@ -85,33 +106,48 @@ void svd_least_square(const MatrixType& m, unsigned int computationOptions)
SvdType svd(m, computationOptions);
if(internal::is_same<RealScalar,double>::value) svd.setThreshold(1e-8);
else if(internal::is_same<RealScalar,float>::value) svd.setThreshold(1e-4);
else if(internal::is_same<RealScalar,float>::value) svd.setThreshold(2e-4);
SolutionType x = svd.solve(rhs);
// evaluate normal equation which works also for least-squares solutions
if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size())
{
// This test is not stable with single precision.
// This is probably because squaring m significantly affects the precision.
VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs);
}
RealScalar residual = (m*x-rhs).norm();
// Check that there is no significantly better solution in the neighborhood of x
RealScalar rhs_norm = rhs.norm();
if(!test_isMuchSmallerThan(residual,rhs.norm()))
{
// ^^^ If the residual is very small, then we have an exact solution, so we are already good.
// evaluate normal equation which works also for least-squares solutions
if(internal::is_same<RealScalar,double>::value || svd.rank()==m.diagonal().size())
{
using std::sqrt;
// This test is not stable with single precision.
// This is probably because squaring m significantly affects the precision.
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
VERIFY_IS_APPROX(m.adjoint()*(m*x),m.adjoint()*rhs);
if(internal::is_same<RealScalar,float>::value) --g_test_level;
}
// Check that there is no significantly better solution in the neighborhood of x
for(Index k=0;k<x.rows();++k)
{
using std::abs;
SolutionType y(x);
y.row(k) = (1.+2*NumTraits<RealScalar>::epsilon())*x.row(k);
RealScalar residual_y = (m*y-rhs).norm();
VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) --g_test_level;
y.row(k) = (1.-2*NumTraits<RealScalar>::epsilon())*x.row(k);
residual_y = (m*y-rhs).norm();
VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) --g_test_level;
}
}
}

View File

@@ -82,8 +82,10 @@ template<typename MatrixType> void swap(const MatrixType& m)
void test_swap()
{
int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization
CALL_SUBTEST_2( swap(Matrix4d()) ); // fixed size, possible vectorization
CALL_SUBTEST_3( swap(MatrixXd(3,3)) ); // dyn size, no vectorization
CALL_SUBTEST_4( swap(MatrixXf(30,30)) ); // dyn size, possible vectorization
CALL_SUBTEST_3( swap(MatrixXd(s,s)) ); // dyn size, no vectorization
CALL_SUBTEST_4( swap(MatrixXf(s,s)) ); // dyn size, possible vectorization
TEST_SET_BUT_UNUSED_VARIABLE(s)
}

View File

@@ -9,7 +9,17 @@
#include "main.h"
typedef Matrix<float,8,1> Vector8f;
typedef Matrix<float, 6,1> Vector6f;
typedef Matrix<float, 8,1> Vector8f;
typedef Matrix<float, 12,1> Vector12f;
typedef Matrix<double, 5,1> Vector5d;
typedef Matrix<double, 6,1> Vector6d;
typedef Matrix<double, 7,1> Vector7d;
typedef Matrix<double, 8,1> Vector8d;
typedef Matrix<double, 9,1> Vector9d;
typedef Matrix<double,10,1> Vector10d;
typedef Matrix<double,12,1> Vector12d;
struct TestNew1
{
@@ -85,6 +95,9 @@ void unalignedassert()
construct_at_boundary<Vector2f>(4);
construct_at_boundary<Vector3f>(4);
construct_at_boundary<Vector4f>(16);
construct_at_boundary<Vector6f>(4);
construct_at_boundary<Vector8f>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Vector12f>(16);
construct_at_boundary<Matrix2f>(16);
construct_at_boundary<Matrix3f>(4);
construct_at_boundary<Matrix4f>(EIGEN_ALIGN_BYTES);
@@ -92,6 +105,13 @@ void unalignedassert()
construct_at_boundary<Vector2d>(16);
construct_at_boundary<Vector3d>(4);
construct_at_boundary<Vector4d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Vector5d>(4);
construct_at_boundary<Vector6d>(16);
construct_at_boundary<Vector7d>(4);
construct_at_boundary<Vector8d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Vector9d>(4);
construct_at_boundary<Vector10d>(16);
construct_at_boundary<Vector12d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Matrix2d>(EIGEN_ALIGN_BYTES);
construct_at_boundary<Matrix3d>(4);
construct_at_boundary<Matrix4d>(EIGEN_ALIGN_BYTES);
@@ -115,7 +135,14 @@ void unalignedassert()
if(EIGEN_ALIGN_BYTES>=16)
{
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12f>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector6d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector10d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12d>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8));
VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4i>(8));
}

View File

@@ -81,8 +81,8 @@
#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"

View File

@@ -266,16 +266,16 @@ array<t, n> repeat(t v) {
}
template<std::size_t I, class Head, class Tail>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) {
return get<I, type_list<Head, Tail> >::value;
}
template<std::size_t I, class Head, class Tail>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) {
return get<I, type_list<Head, Tail> >::value;
}
template <class NList>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
return arg_prod<NList>::value;
}

View File

@@ -1157,7 +1157,41 @@ in TensorFunctors.h for information on how to implement a reduction operator.
## Convolutions
TBD: convolve(const KernelDerived& kernel, const Dimensions& dims)
### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
Returns a tensor that is the output of the convolution of the input tensor with the kernel,
along the specified dimensions of the input tensor. For each dimension that takes part in the
convolution, the size of the corresponding output dimension is reduced according to the formula:
output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size).
The sizes of the dimensions that take no part in the convolution remain unchanged.

Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the
convolution is computed (for ColMajor the first dimension has the shortest stride, whereas for RowMajor the last
dimension does).
// Compute convolution along the second and third dimension.
Tensor<float, 4, DataLayout> input(3, 3, 7, 11);
Tensor<float, 2, DataLayout> kernel(2, 2);
Tensor<float, 4, DataLayout> output(3, 2, 6, 11);
input.setRandom();
kernel.setRandom();
Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution.
output = input.convolve(kernel, dims);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 6; ++k) {
for (int l = 0; l < 11; ++l) {
const float result = output(i,j,k,l);
const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
input(i,j+1,k+0,l) * kernel(1,0) +
input(i,j+0,k+1,l) * kernel(0,1) +
input(i,j+1,k+1,l) * kernel(1,1);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
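
For the example above, the formula gives the declared output shape: along dimension 1 the size shrinks to 3 - 2 + 1 = 2, along dimension 2 to 7 - 2 + 1 = 6, and dimensions 0 and 3, which take no part in the convolution, keep their sizes 3 and 11, hence output(3, 2, 6, 11).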
## Geometrical Operations

View File

@@ -21,8 +21,8 @@ namespace Eigen {
*/
namespace internal {
template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper {
template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
class IndexMapper {
public:
IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
const array<Index, NumKernelDims>& indices) {
@@ -38,13 +38,19 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM
array<Index, NumDims> inputStrides;
array<Index, NumDims> outputStrides;
for (int i = 0; i < NumDims; ++i) {
if (i > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
inputStrides[0] = 1;
outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
} else {
inputStrides[0] = 1;
outputStrides[0] = 1;
}
} else {
inputStrides[NumDims - 1] = 1;
outputStrides[NumDims - 1] = 1;
for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
}
}
@@ -52,13 +58,20 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM
array<Index, NumDims> cudaOutputDimensions;
array<Index, NumDims> tmp = dimensions;
array<Index, NumDims> ordering;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
for (int i = 0; i < NumKernelDims; ++i) {
ordering[i] = indices[i];
const Index index = i + offset;
ordering[index] = indices[i];
tmp[indices[i]] = -1;
cudaInputDimensions[i] = input_dims[ordering[i]];
cudaOutputDimensions[i] = dimensions[ordering[i]];
cudaInputDimensions[index] = input_dims[indices[i]];
cudaOutputDimensions[index] = dimensions[indices[i]];
}
int written = NumKernelDims;
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? NumKernelDims
: 0;
for (int i = 0; i < NumDims; ++i) {
if (tmp[i] >= 0) {
ordering[written] = i;
@@ -73,61 +86,123 @@ template <typename Index, typename InputDims, size_t NumKernelDims> class IndexM
m_outputStrides[i] = outputStrides[ordering[i]];
}
for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) {
m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1];
m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1];
} else {
m_cudaInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
m_cudaOutputStrides[i] =
m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
} else {
m_cudaInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1;
}
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (i + 1 < offset) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
m_cudaOutputStrides[i] =
m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
} else {
m_cudaInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1;
}
}
}
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
Index inputIndex = 0;
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaInputStrides[d];
inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaInputStrides[d];
inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d];
}
inputIndex += p * m_inputStrides[NumKernelDims];
} else {
int limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaInputStrides[d];
inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d];
}
inputIndex += p * m_inputStrides[limit];
}
inputIndex += p * m_inputStrides[NumKernelDims];
return inputIndex;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
Index outputIndex = 0;
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d];
}
outputIndex += p * m_outputStrides[NumKernelDims];
} else {
int limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d];
}
outputIndex += p * m_outputStrides[limit];
}
outputIndex += p * m_outputStrides[NumKernelDims];
return outputIndex;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
return i * m_inputStrides[0];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
return i * m_outputStrides[0];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
return i * m_inputStrides[0] + j*m_inputStrides[1];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
return i * m_outputStrides[0] + j * m_outputStrides[1];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
k * m_inputStrides[offset + 2];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2];
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
k * m_outputStrides[offset + 2];
}
private:
@@ -237,35 +312,61 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
m_inputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
}
} else {
m_inputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
}
}
m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i > 0) {
m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
} else {
m_kernelStride[0] = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i > 0) {
m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
} else {
m_kernelStride[0] = 1;
}
m_indexStride[i] = m_inputStride[index];
}
m_indexStride[i] = m_inputStride[index];
}
m_outputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
m_outputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
}
} else {
for (int i = NumKernelDims - 1; i >= 0; --i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i < NumKernelDims - 1) {
m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
} else {
m_kernelStride[NumKernelDims - 1] = 1;
}
m_indexStride[i] = m_inputStride[index];
}
m_outputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
}
}
}
@@ -310,13 +411,24 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
Index indices[2] = {index, index+PacketSize-1};
Index startInputs[2] = {0, 0};
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStride[i];
const Index idx1 = indices[1] / m_outputStride[i];
startInputs[0] += idx0 * m_inputStride[i];
startInputs[1] += idx1 * m_inputStride[i];
indices[0] -= idx0 * m_outputStride[i];
indices[1] -= idx1 * m_outputStride[i];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStride[i];
const Index idx1 = indices[1] / m_outputStride[i];
startInputs[0] += idx0 * m_inputStride[i];
startInputs[1] += idx1 * m_inputStride[i];
indices[0] -= idx0 * m_outputStride[i];
indices[1] -= idx1 * m_outputStride[i];
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / m_outputStride[i];
const Index idx1 = indices[1] / m_outputStride[i];
startInputs[0] += idx0 * m_inputStride[i];
startInputs[1] += idx1 * m_inputStride[i];
indices[0] -= idx0 * m_outputStride[i];
indices[1] -= idx1 * m_outputStride[i];
}
}
startInputs[0] += indices[0];
startInputs[1] += indices[1];
@@ -344,10 +456,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
Index startInput = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStride[i];
startInput += idx * m_inputStride[i];
index -= idx * m_outputStride[i];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStride[i];
startInput += idx * m_inputStride[i];
index -= idx * m_outputStride[i];
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStride[i];
startInput += idx * m_inputStride[i];
index -= idx * m_outputStride[i];
}
}
startInput += index;
return startInput;
@@ -378,7 +498,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
EIGEN_STRONG_INLINE void preloadKernel() {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
// expression that needs to be evaluated)
const Scalar* in_place = m_kernelImpl.data();
@@ -431,11 +551,14 @@ struct GetKernelSize<Dynamic> {
}
};
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) {
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) {
extern __shared__ float s[];
const int first_x = blockIdx.x * maxX;
@@ -453,7 +576,7 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
}
__syncthreads();
@@ -476,9 +599,15 @@ __global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::In
}
};
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) {
extern __shared__ float s[];
const int first_x = blockIdx.x * maxX;
@@ -538,9 +667,15 @@ __global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::In
}
};
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) {
__global__ void EigenConvolutionKernel3D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const size_t numPlanes, const size_t numX,
const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
extern __shared__ float s[];
// Load inputs to shared memory
@@ -622,8 +757,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
@@ -712,10 +845,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const int numX = dimensions()[m_indices[0]];
const int numP = dimensions().TotalSize() / numX;
int maxX;
dim3 block_size;
if (m_indices[0] == 0) {
const int single_stride_dim =
static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: m_inputImpl.dimensions().rank() - 1;
if (m_indices[0] == single_stride_dim) {
// Maximum the reuse
const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
maxX = (std::min<int>)(inner_dim, numX);
@@ -747,7 +884,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const array<Index, 1> indices(m_indices[0]);
const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
switch(kernel_size) {
case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
@@ -765,11 +903,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
case 2: {
const int kernel_size_x = m_kernelImpl.dimensions()[0];
const int kernel_size_y = m_kernelImpl.dimensions()[1];
const int idxX =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
const int idxY =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
const int numX = dimensions()[m_indices[0]];
const int numY = dimensions()[m_indices[1]];
const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numP = dimensions().TotalSize() / (numX*numY);
const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
@@ -798,9 +940,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
//cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 2> indices(m_indices[0], m_indices[1]);
const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]);
internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
m_kernelImpl.dimensions()[idxY]);
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
switch (kernel_size_x) {
case 4: {
switch (kernel_size_y) {
@@ -837,13 +981,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
case 3: {
const int kernel_size_x = m_kernelImpl.dimensions()[0];
const int kernel_size_y = m_kernelImpl.dimensions()[1];
const int kernel_size_z = m_kernelImpl.dimensions()[2];
const int idxX =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
const int idxY =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
const int idxZ =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
const int numX = dimensions()[m_indices[0]];
const int numY = dimensions()[m_indices[1]];
const int numZ = dimensions()[m_indices[2]];
const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numZ = dimensions()[m_indices[idxZ]];
const int numP = dimensions().TotalSize() / (numX*numY*numZ);
const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
@@ -860,16 +1011,20 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
assert(shared_mem <= maxSharedMem);
//cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]);
const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]);
internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
m_indices[idxZ]);
const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
m_kernelImpl.dimensions()[idxY],
m_kernelImpl.dimensions()[idxZ]);
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break;
}
default: {
assert(false && "not supported yet");
EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
}
}
}

View File

@@ -32,8 +32,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
@@ -44,8 +43,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
@@ -56,8 +54,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
Difference difference(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
Assign assign(m_expression, difference);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
@@ -76,8 +73,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
return *this;
}
@@ -88,8 +84,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
return *this;
}
@@ -100,8 +95,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
Difference difference(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
Assign assign(m_expression, difference);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
return *this;
}
@@ -122,7 +116,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
return *this;
}
@@ -133,7 +127,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
return *this;
}
@@ -144,14 +138,13 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
Difference difference(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
Assign assign(m_expression, difference);
static const bool Vectorize = TensorEvaluator<const Assign, GpuDevice>::PacketAccess;
internal::TensorExecutor<const Assign, GpuDevice, Vectorize>::run(assign, m_device);
internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
return *this;
}
protected:
const GpuDevice& m_device;
ExpressionType m_expression;
ExpressionType& m_expression;
};
#endif
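
A minimal usage sketch of the device-assignment interface these operators implement (DefaultDevice is assumed here for brevity; the ThreadPoolDevice and GpuDevice specializations are driven the same way):

    #include <unsupported/Eigen/CXX11/Tensor>

    Eigen::Tensor<float, 2> a(3, 3), b(3, 3), c(3, 3);
    a.setRandom();
    b.setRandom();
    Eigen::DefaultDevice dev;
    // operator= builds a TensorAssignOp and hands it to TensorExecutor for this device
    c.device(dev) = a + b;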

View File

@@ -145,39 +145,39 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
Sizes() { }
template <typename DenseIndex>
explicit Sizes(const array<DenseIndex, Base::count>& indices) {
explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex... indices) { }
explicit Sizes(std::initializer_list<std::size_t> l) {
template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
explicit Sizes(std::initializer_list<std::size_t>) {
// todo: add assertion
}
#else
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
#endif
template <typename T> Sizes& operator = (const T& other) {
template <typename T> Sizes& operator = (const T&) {
// to do: check the size of other
return *this;
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this);
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this);
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
}
};
@@ -343,7 +343,7 @@ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::s
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
};
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) {
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
}

View File

@@ -352,11 +352,12 @@ template<typename IfArgType, typename ThenArgType, typename ElseArgType, typenam
struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
{
typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* &
TensorEvaluator<IfArgType>::PacketAccess*/,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
internal::packet_traits<Scalar>::HasBlend,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
@@ -373,7 +374,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename internal::traits<XprType>::Packet PacketReturnType;
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
@@ -403,7 +403,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
internal::Selector<PacketSize> select;
for (Index i = 0; i < PacketSize; ++i) {
select.select[i] = m_condImpl.coeff(index+i);

View File

@@ -14,15 +14,16 @@
using Eigen::Tensor;
using Eigen::DefaultDevice;
template <int DataLayout>
static void test_evals()
{
Tensor<float, 2> input(3, 3);
Tensor<float, 1> kernel(2);
Tensor<float, 2, DataLayout> input(3, 3);
Tensor<float, 1, DataLayout> kernel(2);
input.setRandom();
kernel.setRandom();
Tensor<float, 2> result(2,3);
Tensor<float, 2, DataLayout> result(2,3);
result.setZero();
Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
@@ -41,16 +42,16 @@ static void test_evals()
VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5
}
template <int DataLayout>
static void test_expr()
{
Tensor<float, 2> input(3, 3);
Tensor<float, 2> kernel(2, 2);
Tensor<float, 2, DataLayout> input(3, 3);
Tensor<float, 2, DataLayout> kernel(2, 2);
input.setRandom();
kernel.setRandom();
Tensor<float, 2> result(2,2);
Eigen::array<ptrdiff_t, 2> dims{{0, 1}};
Tensor<float, 2, DataLayout> result(2,2);
Eigen::array<ptrdiff_t, 2> dims({0, 1});
result = input.convolve(kernel, dims);
VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
@@ -63,10 +64,10 @@ static void test_expr()
input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
}
template <int DataLayout>
static void test_modes() {
Tensor<float, 1> input(3);
Tensor<float, 1> kernel(3);
Tensor<float, 1, DataLayout> input(3);
Tensor<float, 1, DataLayout> kernel(3);
input(0) = 1.0f;
input(1) = 2.0f;
input(2) = 3.0f;
@@ -74,13 +75,13 @@ static void test_modes() {
kernel(1) = 1.0f;
kernel(2) = 0.0f;
const Eigen::array<ptrdiff_t, 1> dims{{0}};
const Eigen::array<ptrdiff_t, 1> dims({0});
Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
// Emulate VALID mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(0, 0);
Tensor<float, 1> valid(1);
Tensor<float, 1, DataLayout> valid(1);
valid = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(valid.dimension(0), 1);
VERIFY_IS_APPROX(valid(0), 2.5f);
@@ -88,7 +89,7 @@ static void test_modes() {
// Emulate SAME mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(1, 1);
Tensor<float, 1> same(3);
Tensor<float, 1, DataLayout> same(3);
same = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(same.dimension(0), 3);
VERIFY_IS_APPROX(same(0), 1.0f);
@@ -98,7 +99,7 @@ static void test_modes() {
// Emulate FULL mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(2, 2);
Tensor<float, 1> full(5);
Tensor<float, 1, DataLayout> full(5);
full = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(full.dimension(0), 5);
VERIFY_IS_APPROX(full(0), 0.0f);
@@ -108,18 +109,18 @@ static void test_modes() {
VERIFY_IS_APPROX(full(4), 1.5f);
}
template <int DataLayout>
static void test_strides() {
Tensor<float, 1> input(13);
Tensor<float, 1> kernel(3);
Tensor<float, 1, DataLayout> input(13);
Tensor<float, 1, DataLayout> kernel(3);
input.setRandom();
kernel.setRandom();
const Eigen::array<ptrdiff_t, 1> dims{{0}};
const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}};
const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}};
const Eigen::array<ptrdiff_t, 1> dims({0});
const Eigen::array<ptrdiff_t, 1> stride_of_3({3});
const Eigen::array<ptrdiff_t, 1> stride_of_2({2});
Tensor<float, 1> result;
Tensor<float, 1, DataLayout> result;
result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
VERIFY_IS_EQUAL(result.dimension(0), 2);
@@ -129,13 +130,14 @@ static void test_strides() {
input(12)*kernel(2)));
}
void test_cxx11_tensor_convolution()
{
CALL_SUBTEST(test_evals());
CALL_SUBTEST(test_expr());
CALL_SUBTEST(test_modes());
CALL_SUBTEST(test_strides());
CALL_SUBTEST(test_evals<ColMajor>());
CALL_SUBTEST(test_evals<RowMajor>());
CALL_SUBTEST(test_expr<ColMajor>());
CALL_SUBTEST(test_expr<RowMajor>());
CALL_SUBTEST(test_modes<ColMajor>());
CALL_SUBTEST(test_modes<RowMajor>());
CALL_SUBTEST(test_strides<ColMajor>());
CALL_SUBTEST(test_strides<RowMajor>());
}


@ -117,11 +117,10 @@ void test_cuda_elementwise()
}
}
void test_cuda_reduction()
{
Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113));
Tensor<float, 2> out(Eigen::array<int, 2>(72,97));
Tensor<float, 4> in1(72,53,97,113);
Tensor<float, 2> out(72,97);
in1.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
@ -138,8 +137,8 @@ void test_cuda_reduction()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113));
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
array<int, 2> reduction_axis;
reduction_axis[0] = 1;
@ -156,10 +155,10 @@ void test_cuda_reduction()
for (int k = 0; k < 53; ++k) {
for (int l = 0; l < 113; ++l) {
expected =
std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l)));
std::max<float>(expected, in1(i, k, j, l));
}
}
VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected);
VERIFY_IS_APPROX(out(i,j), expected);
}
}
}
@ -170,7 +169,7 @@ static void test_cuda_contraction()
// with these dimensions, the output has 300 * 140 = 42000 elements, which is
// more than 30 * 1024 = 30720, the number of threads in blocks on a 15-SM
// GK110 GPU
Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31));
Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
@ -196,12 +195,9 @@ static void test_cuda_contraction()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> >
gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1));
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
MapXf m_left(t_left.data(), 300, 93);
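// t_left (6,50,3,31) is viewed as a 300x93 matrix: 6*50 = 300 rows, 3*31 = 93 contracted columns.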
@ -226,11 +222,12 @@ static void test_cuda_contraction()
}
}
template<int DataLayout>
static void test_cuda_convolution_1d()
{
Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
Tensor<float, 1> kernel(Eigen::array<int, 1>(4));
Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137));
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 1, DataLayout> kernel(4);
Tensor<float, 4, DataLayout> out(74,34,11,137);
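// Convolving dimension 1 (size 37) with the length-4 kernel yields 37 - 4 + 1 = 34 outputs.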
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
@ -252,9 +249,9 @@ static void test_cuda_convolution_1d()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
Eigen::array<int, 1> dims(1);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
@ -266,11 +263,9 @@ static void test_cuda_convolution_1d()
for (int j = 0; j < 34; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(Eigen::array<int, 4>(i,j,k,l));
const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) +
input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) +
input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) +
input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3));
const float result = out(i,j,k,l);
const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
@ -278,12 +273,11 @@ static void test_cuda_convolution_1d()
}
}
static void test_cuda_convolution_2d()
static void test_cuda_convolution_inner_dim_col_major_1d()
{
Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4));
Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137));
Tensor<float, 4, ColMajor> input(74,9,11,7);
Tensor<float, 1, ColMajor> kernel(4);
Tensor<float, 4, ColMajor> out(71,9,11,7);
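// The first dimension is innermost in column-major order; convolving it (size 74)
// with the length-4 kernel yields 74 - 4 + 1 = 71 outputs.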
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
@ -305,9 +299,110 @@ static void test_cuda_convolution_2d()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137));
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
Eigen::array<int, 1> dims(0);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 71; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 7; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
static void test_cuda_convolution_inner_dim_row_major_1d()
{
Tensor<float, 4, RowMajor> input(7,9,11,74);
Tensor<float, 1, RowMajor> kernel(4);
Tensor<float, 4, RowMajor> out(7,9,11,71);
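// The last dimension is innermost in row-major order; convolving it (size 74)
// with the length-4 kernel yields 74 - 4 + 1 = 71 outputs.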
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
cudaMalloc((void**)(&d_input), input_bytes);
cudaMalloc((void**)(&d_kernel), kernel_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
Eigen::array<int, 1> dims(3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 71; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
template<int DataLayout>
static void test_cuda_convolution_2d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 2, DataLayout> kernel(3,4);
Tensor<float, 4, DataLayout> out(74,35,8,137);
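// Convolving dimensions 1 and 2 with the 3x4 kernel yields 37 - 3 + 1 = 35
// and 11 - 4 + 1 = 8 outputs respectively.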
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
cudaMalloc((void**)(&d_input), input_bytes);
cudaMalloc((void**)(&d_kernel), kernel_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
Eigen::array<int, 2> dims(1,2);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
@ -319,32 +414,32 @@ static void test_cuda_convolution_2d()
for (int j = 0; j < 35; ++j) {
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(Eigen::array<int, 4>(i,j,k,l));
const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) +
input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) +
input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) +
input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) +
input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) +
input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) +
input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) +
input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) +
input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) +
input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) +
input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) +
input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3));
VERIFY_IS_APPROX(result, expected);
const float result = out(i,j,k,l);
const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
input(i,j+1,k+0,l) * kernel(1,0) +
input(i,j+2,k+0,l) * kernel(2,0) +
input(i,j+0,k+1,l) * kernel(0,1) +
input(i,j+1,k+1,l) * kernel(1,1) +
input(i,j+2,k+1,l) * kernel(2,1) +
input(i,j+0,k+2,l) * kernel(0,2) +
input(i,j+1,k+2,l) * kernel(1,2) +
input(i,j+2,k+2,l) * kernel(2,2) +
input(i,j+0,k+3,l) * kernel(0,3) +
input(i,j+1,k+3,l) * kernel(1,3) +
input(i,j+2,k+3,l) * kernel(2,3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
template<int DataLayout>
static void test_cuda_convolution_3d()
{
Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17));
Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2));
Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17));
Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2);
Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17));
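// Convolving dimensions 1, 2 and 3 with the 3x4x2 kernel yields 37 - 3 + 1 = 35,
// 11 - 4 + 1 = 8, and 137 - 2 + 1 = 136 outputs respectively.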
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
@ -366,9 +461,9 @@ static void test_cuda_convolution_3d()
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2));
Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
Eigen::array<int, 3> dims(1,2,3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
@ -381,31 +476,31 @@ static void test_cuda_convolution_3d()
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 136; ++l) {
for (int m = 0; m < 17; ++m) {
const float result = out(Eigen::array<int, 5>(i,j,k,l,m));
const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) +
input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) +
input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) +
input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) +
input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) +
input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) +
input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) +
input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) +
input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) +
input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) +
input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) +
input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) +
input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) +
input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) +
input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) +
input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) +
input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) +
input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) +
input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) +
input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) +
input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) +
input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) +
input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) +
input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1));
const float result = out(i,j,k,l,m);
const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
VERIFY_IS_APPROX(result, expected);
}
}
@ -414,91 +509,6 @@ static void test_cuda_convolution_3d()
}
}
static float* CudaCopyFloat(float* data, int size) {
const int nbytes = size * sizeof(float);
float* result = NULL;
if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) {
return NULL;
} else {
if (data != NULL) {
cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice);
}
return result;
}
}
static void test_cuda_constant_broadcast()
{
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Tensor<float, 1> t1(10);
for (int i = 0; i < 10; ++i) {
t1(i) = 10.0f * i;
}
float* t1_cuda = CudaCopyFloat(t1.data(), t1.size());
Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10);
Tensor<float, 1> t2(1);
t2 = t2.constant(20.0f);
float* t2_cuda = CudaCopyFloat(t2.data(), t2.size());
Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1);
float* t3_cuda = CudaCopyFloat(NULL, 10);
Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10);
t3_gpu.device(gpu_device) =
t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10));
Eigen::Tensor<float, 1> t3(10);
cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float),
cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; ++i) {
VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
}
}
void test_cuda_cast()
{
Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97));
Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
in.setRandom();
std::size_t in_bytes = in.size() * sizeof(double);
std::size_t out_bytes = out.size() * sizeof(float);
double* d_in;
float* d_out;
cudaMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
gpu_out.device(gpu_device) = gpu_in.template cast<float>();
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
for (int k = 0; k < 97; ++k) {
VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k))));
}
}
}
}
void test_cxx11_tensor_cuda()
{
CALL_SUBTEST(test_cuda_elementwise_small());
@ -506,9 +516,12 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST(test_cuda_reduction());
CALL_SUBTEST(test_cuda_contraction<ColMajor>());
CALL_SUBTEST(test_cuda_contraction<RowMajor>());
CALL_SUBTEST(test_cuda_convolution_1d());
CALL_SUBTEST(test_cuda_convolution_2d());
CALL_SUBTEST(test_cuda_convolution_3d());
CALL_SUBTEST(test_cuda_constant_broadcast());
CALL_SUBTEST(test_cuda_cast());
CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>());
CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>());
CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d());
CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d());
CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>());
CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>());
CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>());
CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>());
}