Pull the latest updates from trunk

Benoit Steiner 2016-10-05 14:54:36 -07:00
commit ae1385c7e4
59 changed files with 1930 additions and 1250 deletions

View File

@@ -411,6 +411,10 @@ using std::ptrdiff_t;
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#include "src/Core/arch/CUDA/Complex.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"

View File

@@ -87,6 +87,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 # include "../plugins/CommonCwiseUnaryOps.h"
 # include "../plugins/MatrixCwiseUnaryOps.h"
 # include "../plugins/ArrayCwiseUnaryOps.h"
@@ -97,6 +98,7 @@ template<typename Derived> class ArrayBase
 # include EIGEN_ARRAYBASE_PLUGIN
 # endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 /** Special case of the template operator=, in order to prevent the compiler
   * from generating a default operator= (issue hit with g++ 4.1)

View File

@@ -820,7 +820,8 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
     : m_data(const_cast<PointerType>(map.data())),
-      m_xpr(map)
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
   {
     EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
@@ -830,32 +831,32 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::ploadt<PacketType, LoadMode>(ptr);
   }
@@ -863,14 +864,14 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index row, Index col, const PacketType& x)
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
@@ -878,12 +879,17 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
 protected:
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
   PointerType m_data;
-  const XprType& m_xpr;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 template<typename PlainObjectType, int MapOptions, typename StrideType>
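Note: the evaluator now caches the inner and outer strides in internal::variable_if_dynamic members instead of keeping a reference to the whole Map expression. A minimal stand-alone sketch of the idea behind that helper (hypothetical names, not Eigen's actual implementation): when the value is fixed at compile time nothing is stored and value() folds to a constant; only the Dynamic case pays for a data member.

    #include <cassert>

    enum { Dynamic = -1 };

    // Compile-time case: no data member, value() is a constant.
    template <typename T, int Value>
    struct variable_if_dynamic_sketch {
      explicit variable_if_dynamic_sketch(T v) { assert(v == T(Value)); (void)v; }
      static T value() { return T(Value); }
    };

    // Runtime (Dynamic) case: the value is actually stored.
    template <typename T>
    struct variable_if_dynamic_sketch<T, Dynamic> {
      explicit variable_if_dynamic_sketch(T v) : m_value(v) {}
      T value() const { return m_value; }
      T m_value;
    };

    int main() {
      variable_if_dynamic_sketch<long, 1> fixed(1);      // stride known statically
      variable_if_dynamic_sketch<long, Dynamic> dyn(7);  // stride known at runtime
      return int(fixed.value() + dyn.value()) == 8 ? 0 : 1;
    }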

View File

@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
   * and yields faster code than the random access version.
   *
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -389,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
   * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
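The corrected comment is easy to check against LinSpaced itself; a small host-side example of the documented semantics (sequential values a(0) .. a(size-1), and the size==1 special case returning 'high'):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd a = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);
      std::cout << a.transpose() << "\n";  // 0 0.25 0.5 0.75 1
      Eigen::VectorXd b = Eigen::VectorXd::LinSpaced(1, 0.0, 1.0);
      std::cout << b << "\n";              // size 1 => contains 'high', i.e. 1
      return 0;
    }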

View File

@@ -558,12 +558,15 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC void reverseInPlace();
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 # include "../plugins/BlockMethods.h"
 # ifdef EIGEN_DENSEBASE_PLUGIN
 # include EIGEN_DENSEBASE_PLUGIN
 # endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 // disable the use of evalTo for dense objects with a nice compilation error
 template<typename Dest>

View File

@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
   typedef internal::blas_traits<Rhs> RhsBlasTraits;
   typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+  typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
   ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
   ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
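Note on the fallback branch above: when the stack cannot be aligned, the buffer is over-allocated and the returned pointer is rounded to the next EIGEN_MAX_ALIGN_BYTES boundary, exactly as in data(). A self-contained sketch of that pointer arithmetic, with ALIGN standing in for EIGEN_MAX_ALIGN_BYTES:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const std::uintptr_t ALIGN = 16;  // must be a power of two
      unsigned char raw[64 + ALIGN];    // extra bytes so an aligned address fits
      // Round down to a multiple of ALIGN, then step past the original address:
      unsigned char* aligned = reinterpret_cast<unsigned char*>(
          (reinterpret_cast<std::uintptr_t>(raw) & ~(ALIGN - 1)) + ALIGN);
      std::printf("offset into raw: %d, misalignment: %d\n",
                  int(aligned - raw),
                  int(reinterpret_cast<std::uintptr_t>(aligned) % ALIGN));  // misalignment is 0
      return 0;
    }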

View File

@@ -50,7 +50,7 @@ public:
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}

View File

@@ -97,6 +97,19 @@ struct real_default_impl<Scalar,true>
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
 template<typename Scalar>
 struct real_retval
 {
@@ -132,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
 template<typename Scalar>
 struct imag_retval
 {
@@ -1049,12 +1075,12 @@ double abs(const double &x) { return ::fabs(x); }
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const std::complex<float>& x) {
-  return ::hypotf(real(x), imag(x));
+  return ::hypotf(x.real(), x.imag());
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const std::complex<double>& x) {
-  return ::hypot(real(x), imag(x));
+  return ::hypot(x.real(), x.imag());
 }
 #endif
@@ -1312,11 +1338,12 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
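The abs overloads above now call hypot on the real and imaginary parts directly; a quick host-side check that this agrees with std::abs on complex numbers:

    #include <cassert>
    #include <cmath>
    #include <complex>

    int main() {
      std::complex<double> z(3.0, -4.0);
      // |3 - 4i| = 5, whether computed via hypot or std::abs
      assert(std::abs(std::hypot(z.real(), z.imag()) - std::abs(z)) < 1e-12);
      return 0;
    }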

View File

@@ -29,8 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  const T x = pmax(minus_9, pmin(plus_9, a_x));
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  // step such that if a_x is nan, x will be either 9 or -9,
+  // and tanh will return 1 or -1 instead of nan.
+  // This is supposed to be fixed in gcc6.3,
+  // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
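The new comment is about NaN propagation through the clamp. A scalar illustration of the failure mode it describes: fmin/fmax are specified to return the non-NaN operand, so a NaN input leaves the clamp as +/-9 and tanh then produces +/-1 instead of NaN. The packet code relies on pmax/pmin propagating the NaN here, which the cited GCC bug broke.

    #include <cmath>
    #include <cstdio>

    int main() {
      float nan = std::nanf("");
      // fmin(9, NaN) == 9 and fmax(-9, 9) == 9: the NaN silently disappears.
      float clamped = std::fmax(-9.f, std::fmin(9.f, nan));
      std::printf("clamped = %f, tanh = %f\n", clamped, std::tanh(clamped));
      return 0;
    }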

View File

@@ -98,7 +98,7 @@ template<typename Derived> class MatrixBase
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
     EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
     typedef typename Base::PlainObject PlainObject;
@@ -121,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 # include "../plugins/CommonCwiseUnaryOps.h"
 # include "../plugins/CommonCwiseBinaryOps.h"
 # include "../plugins/MatrixCwiseUnaryOps.h"
@@ -129,6 +130,7 @@ template<typename Derived> class MatrixBase
 # include EIGEN_MATRIXBASE_PLUGIN
 # endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -328,15 +330,11 @@ template<typename Derived> class MatrixBase
 /////////// LU module ///////////
-    EIGEN_DEVICE_FUNC
     inline const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> partialPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> lu() const;
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
     template<typename ResultType>
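diagonalSize() returns min(rows(), cols()); numext::mini is the device-compatible stand-in for std::min, which cannot be called from CUDA device code. Usage is unchanged:

    #include <Eigen/Dense>
    #include <cassert>

    int main() {
      Eigen::MatrixXd m(4, 7);
      assert(m.diagonalSize() == 4);     // min(4, 7)
      assert(m.diagonal().size() == 4);  // the main diagonal has that length
      return 0;
    }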

View File

@@ -265,7 +265,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
   for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }
 // Row major result
@@ -278,7 +278,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
   for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }
 template<typename Lhs, typename Rhs>
@@ -437,6 +437,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes= " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes= " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment= " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment= " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs= " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs= " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner= " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor= " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment= " << Alignment << "\n";
+    std::cerr << "Flags= " << Flags << "\n";
+#endif
   }
   // Everything below here is taken from CoeffBasedProduct.h
@@ -503,8 +515,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
     RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
-    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
               : 0,
     /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
@@ -590,7 +602,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
   }
 };
@@ -600,7 +612,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res = pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
   }
 };
@@ -609,7 +621,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
   }
 };
@@ -618,7 +630,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
@@ -627,7 +639,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
@@ -636,7 +648,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
@@ -645,7 +657,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
   }
@@ -656,7 +668,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }

View File

@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+// clang-format off
+namespace Eigen {
+namespace internal {
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+// Sum
+template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
+// Difference
+template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
+// Product
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasMul
+  };
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+// Quotient
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+  };
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+#endif
+} // end namespace internal
+} // end namespace Eigen
+#endif // EIGEN_COMPLEX_CUDA_H
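These functors expand the complex arithmetic into real operations so no non-constexpr std::complex operator is called in device code. A host-side sanity check that the manual product expansion matches operator*:

    #include <cassert>
    #include <complex>

    int main() {
      std::complex<float> a(1.f, 2.f), b(3.f, -4.f);
      // (a_re + a_im i)(b_re + b_im i), expanded as in scalar_product_op above:
      float re = a.real()*b.real() - a.imag()*b.imag();  // 11
      float im = a.real()*b.imag() + a.imag()*b.real();  // 2
      assert(std::complex<float>(re, im) == a * b);
      return 0;
    }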

View File

@@ -41,15 +41,15 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
   return *reinterpret_cast<const half2*>(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[1]);
 }
@@ -57,17 +57,17 @@ template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[0]);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __ldg((const half2*)from);
 #else
@@ -76,7 +76,7 @@ template<>
 }
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,27 +84,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Ei
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
   return __halves2half2(from[0*stride], from[1*stride]);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
   to[stride*0] = __low2half(from);
   to[stride*1] = __high2half(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
   return __low2half(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
   result.x = a.x & 0x7FFF7FFF;
   return result;
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+__device__ EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
   __half a1 = __low2half(kernel.packet[0]);
   __half a2 = __high2half(kernel.packet[0]);
@@ -114,7 +114,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -123,7 +123,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
 #else
@@ -137,7 +137,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
 #else
@@ -151,7 +151,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hneg2(a);
 #else
@@ -161,9 +161,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
 #else
@@ -177,7 +177,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
@@ -193,7 +193,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -203,7 +203,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -213,7 +213,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
   return __halves2half2(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -223,7 +223,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
   return __halves2half2(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
 #else
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -245,7 +245,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(c
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -257,7 +257,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(c
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
 #else
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(c
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = log1pf(a1);
@@ -277,29 +277,29 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2
 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
   return h2log(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
   return h2exp(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
   return h2sqrt(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
   return h2rsqrt(a);
 }
 #else
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = logf(a1);
@@ -307,7 +307,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = expf(a1);
@@ -315,7 +315,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = sqrtf(a1);
@@ -323,7 +323,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = rsqrtf(a1);

View File

@@ -434,15 +434,16 @@ public:
   template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
     // let gcc allocate the register in which to store the result of the pmul
     // (in the case where there is no FMA) gcc fails to figure out how to avoid
     // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
   }
@@ -457,9 +458,6 @@ public:
     r = pmadd(c,alpha,r);
   }
-protected:
-  // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-  // conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 template<typename RealScalar, bool _ConjLhs>

View File

@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
 {
   typedef typename Dest::Scalar ResScalar;
   typedef typename Rhs::Scalar RhsScalar;
-  typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+  typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
   eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());

View File

@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
   typedef internal::blas_traits<Rhs> RhsBlasTraits;
   typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+  typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
   typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
   typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

View File

@@ -14,12 +14,13 @@
 // 4512 - assignment operator could not be generated
 // 4522 - 'class' : multiple assignment operators specified
 // 4700 - uninitialized local variable 'xyz' used
+// 4714 - function marked as __forceinline not inlined
 // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
 // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
 #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
 #pragma warning( push )
 #endif
-#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 #elif defined __INTEL_COMPILER
 // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -67,6 +68,8 @@
   #pragma diag_suppress 2669
   #pragma diag_suppress 2670
   #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
   #endif
 #endif // not EIGEN_WARNINGS_DISABLED

View File

@@ -13,7 +13,7 @@
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 93
+#define EIGEN_MINOR_VERSION 94
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -954,8 +954,8 @@ namespace Eigen {
 # define EIGEN_CATCH(X) catch (X)
 #else
 # ifdef __CUDA_ARCH__
-# define EIGEN_THROW_X(X) asm("trap;") return {}
-# define EIGEN_THROW asm("trap;"); return {}
+# define EIGEN_THROW_X(X) asm("trap;")
+# define EIGEN_THROW asm("trap;")
 # else
 # define EIGEN_THROW_X(X) std::abort()
 # define EIGEN_THROW std::abort()

View File

@@ -275,6 +275,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
     destruct_elements_of_array(ptr, i);
     EIGEN_THROW;
   }
+  return NULL;
 }
 /*****************************************************************************
@@ -305,6 +306,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
     aligned_free(result);
     EIGEN_THROW;
   }
+  return result;
 }
 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
@@ -320,6 +322,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
   }
+  return result;
 }
 /** \internal Deletes objects constructed with aligned_new
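These added returns pair with the Macros.h change above: EIGEN_THROW no longer smuggles a `return {}` inside the macro, so each helper must now return on every control path itself. A host-only sketch of the resulting pattern, with the hypothetical SKETCH_THROW standing in for EIGEN_THROW:

    #include <cstdlib>

    #define SKETCH_THROW std::abort()   // a device build would use asm("trap;")

    int* make_buffer(bool fail) {
      int* result = new int[4];
      if (fail) {
        delete[] result;
        SKETCH_THROW;
      }
      return result;  // explicit return on the normal path, as in the patch
    }

    int main() { delete[] make_buffer(false); return 0; }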

View File

@@ -671,6 +671,14 @@ struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
 template<bool Vectorized>
 struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
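A rough sanity check of the arithmetic in the new specialization, assuming NumTraits<float>::MulCost == NumTraits<float>::AddCost == 1, so the generic template above gives scalar_div_cost<float>::value == 8:

    #include <cassert>

    int main() {
      const int MulCost = 1, AddCost = 1;
      const int float_div_cost = 8 * MulCost;          // generic scalar_div_cost<T>
      const int complex_div_cost = 2 * float_div_cost  // the 2*scalar_div_cost<T> term
                                 + 6 * MulCost         // the 6*MulCost term
                                 + 3 * AddCost;        // the 3*AddCost term
      assert(complex_div_cost == 25);
      return 0;
    }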

View File

@@ -158,7 +158,8 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 /** Set \c *this from a \b unit quaternion.
-  * The resulting axis is normalized.
+  *
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   *
   * This function implicitly normalizes the quaternion \a q.
   */
@@ -167,12 +168,16 @@ template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
   using std::atan2;
+  using std::abs;
   Scalar n = q.vec().norm();
   if(n<NumTraits<Scalar>::epsilon())
     n = q.vec().stableNorm();
-  if (n > Scalar(0))
+  if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, q.w());
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
+    if(q.w() < 0)
+      n = -n;
     m_axis = q.vec() / n;
   }
   else
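With this change, q and -q (which encode the same rotation) should recover the same angle in [0, pi], since abs(q.w()) is used and the axis sign is flipped instead. A small check:

    #include <Eigen/Geometry>
    #include <iostream>

    int main() {
      Eigen::Quaterniond q(Eigen::AngleAxisd(3.0, Eigen::Vector3d::UnitZ()));
      Eigen::Quaterniond mq(-q.w(), -q.x(), -q.y(), -q.z());  // same rotation as q
      Eigen::AngleAxisd a1(q), a2(mq);
      std::cout << a1.angle() << " vs " << a2.angle() << "\n";  // both 3, in [0, pi]
      return 0;
    }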

View File

@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,i), coeff(k,i));
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
       {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
         res[1] = -atan2(s2, coeff(i,i));
       }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,k), coeff(k,k));
       Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         res[1] = atan2(-coeff(i,k), -c2);
       }
       else
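The rewritten branch keeps res[0] in its documented half-range; a quick round trip through eulerAngles (the first recovered angle lies in [0, pi], the other two in [-pi, pi]):

    #include <Eigen/Geometry>
    #include <iostream>

    int main() {
      Eigen::Matrix3d R;
      R = Eigen::AngleAxisd(0.5,  Eigen::Vector3d::UnitX())
        * Eigen::AngleAxisd(1.0,  Eigen::Vector3d::UnitY())
        * Eigen::AngleAxisd(-0.7, Eigen::Vector3d::UnitZ());
      Eigen::Vector3d ea = R.eulerAngles(0, 1, 2);  // recover the X-Y-Z angles
      std::cout << ea.transpose() << "\n";          // 0.5 1 -0.7
      return 0;
    }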

View File

@@ -402,6 +402,18 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, Homog
   }
 };
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
 template<typename Lhs,typename Rhs>
 struct homogeneous_left_product_refactoring_helper
 {

View File

@@ -464,7 +464,7 @@ public:
   operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
@@ -578,7 +578,7 @@ public:
     return res;
   }
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
   template<typename Derived>
   inline Transform& operator=(const RotationBase<Derived,Dim>& r);
@@ -853,7 +853,7 @@ Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &oth
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }

View File

@@ -130,8 +130,10 @@ public:
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template<typename Derived>
+  inline typename internal::enable_if<Derived::IsVectorAtCompileTime,VectorType>::type
+  operator* (const MatrixBase<Derived>& vec) const
+  { return m_coeffs + vec.derived(); }
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
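
Since the operator is now a constrained template, any fixed-size vector expression binds to it directly instead of first being materialized into a plain VectorType temporary. A minimal usage sketch (ours):

#include <Eigen/Geometry>
using namespace Eigen;

int main()
{
  Translation3d t(Vector3d(1.0, 2.0, 3.0));
  Vector3d v(4.0, 5.0, 6.0);
  // The right-hand side is a CwiseBinaryOp expression, not a Vector3d; the
  // SFINAE constraint (Derived::IsVectorAtCompileTime) admits it as-is.
  Vector3d w = t * (2.0 * v + Vector3d::Ones());
  return w == Vector3d(10.0, 13.0, 16.0) ? 0 : 1;
}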

View File

@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
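
The guard works because a Householder transformation H = I - tau * v * v^* degenerates to the identity when tau is zero, so the rank-one update can be skipped outright instead of being applied with a zero coefficient. A minimal sketch (ours) of the now-cheap no-op case:

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXd A = MatrixXd::Random(4, 4), B = A;
  VectorXd essential = VectorXd::Random(3);  // v = [1; essential]
  VectorXd workspace(4);
  // tau == 0 means H = I; the update is now skipped entirely.
  B.applyHouseholderOnTheLeft(essential, 0.0, workspace.data());
  return B.isApprox(A) ? 0 : 1;
}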

View File

@@ -156,7 +156,7 @@ template<typename _MatrixType> class FullPivLU
      *
      * \sa permutationQ()
      */
-    inline const PermutationPType& permutationP() const
+    EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       return m_p;
@@ -406,8 +406,8 @@ template<typename _MatrixType> class FullPivLU
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>

View File

@@ -665,10 +665,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  // FIXME What about considerering any denormal numbers as zero, using:
-  // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().maxCoeff();
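
For scale (our numbers, double precision): the two thresholds differ by roughly fifteen orders of magnitude, so every subnormal input is now flushed to zero, which is what breaks the infinite-loop scenario of bug 286.

#include <iostream>
#include <limits>

int main()
{
  // Old threshold: twice the smallest subnormal double (~9.9e-324).
  std::cout << 2 * std::numeric_limits<double>::denorm_min() << "\n";
  // New threshold: the smallest normalized double (~2.2e-308).
  std::cout << (std::numeric_limits<double>::min)() << "\n";
  return 0;
}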

View File

@@ -141,6 +141,15 @@ template<typename Derived> class SparseMatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
 # include "../plugins/CommonCwiseUnaryOps.h"
 # include "../plugins/CommonCwiseBinaryOps.h"
 # include "../plugins/MatrixCwiseUnaryOps.h"
@@ -149,8 +158,10 @@ template<typename Derived> class SparseMatrixBase
 # ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #  include EIGEN_SPARSEMATRIXBASE_PLUGIN
 # endif
-#  undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }

File diff suppressed because it is too large

View File

@@ -36,8 +36,10 @@ typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> Negati
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns an expression of the opposite of \c *this
-  */
+/// \returns an expression of the opposite of \c *this
+///
+EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
+///
 EIGEN_DEVICE_FUNC
 inline const NegativeReturnType
 operator-() const { return NegativeReturnType(derived()); }
@@ -45,13 +47,15 @@ operator-() const { return NegativeReturnType(derived()); }
 template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
 
-/** \returns an expression of *this with the \a Scalar type casted to
-  * \a NewScalar.
-  *
-  * The template parameter \a NewScalar is the type we are casting the scalars to.
-  *
-  * \sa class CwiseUnaryOp
-  */
+/// \returns an expression of \c *this with the \a Scalar type casted to
+/// \a NewScalar.
+///
+/// The template parameter \a NewScalar is the type we are casting the scalars to.
+///
+EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
+///
+/// \sa class CwiseUnaryOp
+///
 template<typename NewType>
 EIGEN_DEVICE_FUNC
 typename CastXpr<NewType>::Type
@@ -60,9 +64,11 @@ cast() const
   return typename CastXpr<NewType>::Type(derived());
 }
 
-/** \returns an expression of the complex conjugate of \c *this.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint() */
+/// \returns an expression of the complex conjugate of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
 EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
@@ -70,39 +76,45 @@ conjugate() const
   return ConjugateReturnType(derived());
 }
 
-/** \returns a read-only expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a read-only expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline RealReturnType
 real() const { return RealReturnType(derived()); }
 
-/** \returns an read-only expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns an read-only expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline const ImagReturnType
 imag() const { return ImagReturnType(derived()); }
 
-/** \brief Apply a unary operator coefficient-wise
-  * \param[in] func Functor implementing the unary operator
-  * \tparam CustomUnaryOp Type of \a func
-  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp_ptrfun.cpp
-  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
-  *
-  * Genuine functors allow for more possibilities, for instance it may contain a state.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \brief Apply a unary operator coefficient-wise
+/// \param[in] func Functor implementing the unary operator
+/// \tparam CustomUnaryOp Type of \a func
+/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+///
+/// Example:
+/// \include class_CwiseUnaryOp_ptrfun.cpp
+/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+///
+/// Genuine functors allow for more possibilities, for instance it may contain a state.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
+///
+/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomUnaryOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
@@ -111,17 +123,19 @@ unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
   return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
 }
 
-/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The template parameter \a CustomUnaryOp is the type of the functor
-  * of the custom unary operator.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
+///
+/// \sa unaryExpr, binaryExpr class CwiseUnaryOp
+///
 template<typename CustomViewOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryView<CustomViewOp, const Derived>
@@ -130,16 +144,20 @@ unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
   return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
 }
 
-/** \returns a non const expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a non const expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline NonConstRealReturnType
 real() { return NonConstRealReturnType(derived()); }
 
-/** \returns a non const expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns a non const expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline NonConstImagReturnType
 imag() { return NonConstImagReturnType(derived()); }

View File

@@ -11,63 +11,75 @@
 // This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
 // This include MatrixBase and SparseMatrixBase.
 
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
 
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs.out
-  *
-  * \sa cwiseAbs2()
-  */
+/// \returns an expression of the coefficient-wise absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
+///
+/// \sa cwiseAbs2()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbsReturnType
 cwiseAbs() const { return CwiseAbsReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs2.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs2.out
-  *
-  * \sa cwiseAbs()
-  */
+/// \returns an expression of the coefficient-wise squared absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs2.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs2.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
+///
+/// \sa cwiseAbs()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
 cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include MatrixBase_cwiseSqrt.cpp
-  * Output: \verbinclude MatrixBase_cwiseSqrt.out
-  *
-  * \sa cwisePow(), cwiseSquare()
-  */
+/// \returns an expression of the coefficient-wise square root of *this.
+///
+/// Example: \include MatrixBase_cwiseSqrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseSqrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
+///
+/// \sa cwisePow(), cwiseSquare()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSqrtReturnType
 cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise signum of *this.
-  *
-  * Example: \include MatrixBase_cwiseSign.cpp
-  * Output: \verbinclude MatrixBase_cwiseSign.out
-  *
-  */
+/// \returns an expression of the coefficient-wise signum of *this.
+///
+/// Example: \include MatrixBase_cwiseSign.cpp
+/// Output: \verbinclude MatrixBase_cwiseSign.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSignReturnType
 cwiseSign() const { return CwiseSignReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
-  *
-  * \sa cwiseProduct()
-  */
+/// \returns an expression of the coefficient-wise inverse of *this.
+///
+/// Example: \include MatrixBase_cwiseInverse.cpp
+/// Output: \verbinclude MatrixBase_cwiseInverse.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
+///
+/// \sa cwiseProduct()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseInverseReturnType
 cwiseInverse() const { return CwiseInverseReturnType(derived()); }

View File

@ -1,10 +1,13 @@
find_package(BLAZE) find_package(BLAZE)
find_package(Boost) find_package(Boost COMPONENTS system)
if (BLAZE_FOUND AND Boost_FOUND) if (BLAZE_FOUND AND Boost_FOUND)
include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
btl_add_bench(btl_blaze main.cpp) btl_add_bench(btl_blaze main.cpp)
# Note: The newest blaze version requires C++14.
# Ideally, we should set this depending on the version of Blaze we found
set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
if(BUILD_btl_blaze) if(BUILD_btl_blaze)
target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a ) target_link_libraries(btl_blaze ${Boost_LIBRARIES})
endif() endif()
endif () endif ()

View File

@@ -53,6 +53,33 @@ showing that the program works as expected:
 
 This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
 
+
+\section NullaryExpr_Indexing Example 2: indexing rows and columns
+
+The goal here is to mimic MatLab's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
+
+\snippet nullary_indexing.out main1
+
+To this end, let us first write a nullary-functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
+
+\snippet nullary_indexing.cpp functor
+
+Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:
+
+\snippet nullary_indexing.cpp function
+
+Finally, here is an example of how this function can be used:
+
+\snippet nullary_indexing.cpp main1
+
+This straightforward implementation is already quite powerful as the row or column index arrays can also be expressions to perform offsetting, modulo, striding, reverse, etc.
+
+\snippet nullary_indexing.cpp main2
+
+and the output is:
+
+\snippet nullary_indexing.out main2
+
 */
 
 }

View File

@@ -1612,7 +1612,10 @@ EXPAND_AS_DEFINED = EIGEN_MAKE_TYPEDEFS \
                          EIGEN_EMPTY \
                          EIGEN_EULER_ANGLES_TYPEDEFS \
                          EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
-                         EIGEN_EULER_SYSTEM_TYPEDEF
+                         EIGEN_EULER_SYSTEM_TYPEDEF \
+                         EIGEN_DOC_UNARY_ADDONS \
+                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
+                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
 # doxygen's preprocessor will remove all references to function-like macros

View File

@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
     )
   add_dependencies(all_examples ${example})
 endforeach(example_src)
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+  ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
+endif()

View File

@@ -0,0 +1,52 @@
#include <Eigen/Core>
#include <iostream>
using namespace Eigen;
// [circulant_func]
template<class ArgType>
class circulant_functor {
const ArgType &m_vec;
public:
circulant_functor(const ArgType& arg) : m_vec(arg) {}
const typename ArgType::Scalar& operator() (Index row, Index col) const {
Index index = row - col;
if (index < 0) index += m_vec.size();
return m_vec(index);
}
};
// [circulant_func]
// [square]
template<class ArgType>
struct circulant_helper {
typedef Matrix<typename ArgType::Scalar,
ArgType::SizeAtCompileTime,
ArgType::SizeAtCompileTime,
ColMajor,
ArgType::MaxSizeAtCompileTime,
ArgType::MaxSizeAtCompileTime> MatrixType;
};
// [square]
// [makeCirculant]
template <class ArgType>
CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
{
typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));
}
// [makeCirculant]
// [main]
int main()
{
Eigen::VectorXd vec(4);
vec << 1, 2, 4, 8;
Eigen::MatrixXd mat;
mat = makeCirculant(vec);
std::cout << mat << std::endl;
}
// [main]

View File

@@ -0,0 +1,66 @@
#include <Eigen/Core>
#include <iostream>
using namespace Eigen;
// [functor]
template<class ArgType, class RowIndexType, class ColIndexType>
class indexing_functor {
const ArgType &m_arg;
const RowIndexType &m_rowIndices;
const ColIndexType &m_colIndices;
public:
typedef Matrix<typename ArgType::Scalar,
RowIndexType::SizeAtCompileTime,
ColIndexType::SizeAtCompileTime,
ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
RowIndexType::MaxSizeAtCompileTime,
ColIndexType::MaxSizeAtCompileTime> MatrixType;
indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
: m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
{}
const typename ArgType::Scalar& operator() (Index row, Index col) const {
return m_arg(m_rowIndices[row], m_colIndices[col]);
}
};
// [functor]
// [function]
template <class ArgType, class RowIndexType, class ColIndexType>
CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
{
typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
typedef typename Func::MatrixType MatrixType;
return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
}
// [function]
int main()
{
std::cout << "[main1]\n";
Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
Array3i ri(1,2,1);
ArrayXi ci(6); ci << 3,2,1,0,0,2;
Eigen::MatrixXi B = indexing(A, ri, ci);
std::cout << "A =" << std::endl;
std::cout << A << std::endl << std::endl;
std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
std::cout << B << std::endl;
std::cout << "[main1]\n";
std::cout << "[main2]\n";
B = indexing(A, ri+1, ci);
std::cout << "A(ri+1,ci) =" << std::endl;
std::cout << B << std::endl << std::endl;
#if __cplusplus >= 201103L
B = indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
std::cout << B << std::endl << std::endl;
#endif
std::cout << "[main2]\n";
}

View File

@@ -355,7 +355,7 @@ if(CUDA_FOUND)
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")

View File

@@ -417,6 +417,7 @@ void cholesky_faillure_cases()
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
     VERIFY(ldlt.info()==NumericalIssue);
   }
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
   {
     mat.resize(3,3);
     mat << -1, -3, 3,
@@ -426,6 +427,7 @@ void cholesky_faillure_cases()
     VERIFY(ldlt.info()==NumericalIssue);
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
   }
+#endif
   {
     mat.resize(3,3);
     mat << 1, 2, 3,

View File

@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
     VERIFY( !m.allFinite() );
     VERIFY( m.hasNaN() );
   }
-  m(4) /= T(0.0);
+  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
+  m(4) /= hidden_zero;
   if(dryrun)
   {
     std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
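
Our reading of the hidden_zero trick: the product of two smallest normalized values underflows to +0.0 at run time, so no literal division by zero appears in the source for a compiler to warn about or constant-fold, yet m(4) still ends up infinite. A standalone sketch:

#include <iostream>
#include <limits>

int main()
{
  // Underflows to +0.0 at run time; no literal zero divisor in the source.
  double hidden_zero = (std::numeric_limits<double>::min)()
                     * (std::numeric_limits<double>::min)();
  std::cout << 1.0 / hidden_zero << "\n";  // prints: inf
  return 0;
}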

View File

@@ -111,6 +111,8 @@ template<typename Scalar,int Size> void homogeneous(void)
   VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
   VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );
+
+  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
 }
 
 void test_geo_homogeneous()

View File

@@ -334,6 +334,9 @@ template<typename Scalar, int Mode, int Options> void transformations()
   t0.scale(v0);
   t1 *= AlignedScaling3(v0);
   VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
+  t1 = t1 * v0.asDiagonal();
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
 
   // transformation * translation
   t0.translate(v0);
   t1 = t1 * Translation3(v0);
@@ -482,6 +485,79 @@ template<typename Scalar, int Mode, int Options> void transformations()
     Rotation2D<Scalar> r2(r1); // copy ctor
     VERIFY_IS_APPROX(r2.angle(),s0);
   }
+
+  {
+    Transform3 t32(Matrix4::Random()), t33, t34;
+    t34 = t33 = t32;
+    t32.scale(v0);
+    t33*=AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+    t33 = t34 * AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+  }
 }
 
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity_left(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( q*(a1*v), (q*a1)*v );
+  VERIFY_IS_APPROX( q*(a2*v), (q*a2)*v );
+  VERIFY_IS_APPROX( q*(p*h).hnormalized(), ((q*p)*h).hnormalized() );
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity2(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( a1*(q*v), (a1*q)*v );
+  VERIFY_IS_APPROX( a2*(q*v), (a2*q)*v );
+  VERIFY_IS_APPROX( p *(q*v).homogeneous(), (p *q)*v.homogeneous() );
+
+  transform_associativity_left(a1, a2,p, q, v, h);
+}
+
+template<typename Scalar, int Dim, int Options,typename RotationType>
+void transform_associativity(const RotationType& R)
+{
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  typedef Matrix<Scalar,Dim+1,1> HVectorType;
+  typedef Matrix<Scalar,Dim,Dim> LinearType;
+  typedef Matrix<Scalar,Dim+1,Dim+1> MatrixType;
+  typedef Transform<Scalar,Dim,AffineCompact,Options> AffineCompactType;
+  typedef Transform<Scalar,Dim,Affine,Options> AffineType;
+  typedef Transform<Scalar,Dim,Projective,Options> ProjectiveType;
+  typedef DiagonalMatrix<Scalar,Dim> ScalingType;
+  typedef Translation<Scalar,Dim> TranslationType;
+
+  AffineCompactType A1c; A1c.matrix().setRandom();
+  AffineCompactType A2c; A2c.matrix().setRandom();
+  AffineType A1(A1c);
+  AffineType A2(A2c);
+  ProjectiveType P1; P1.matrix().setRandom();
+  VectorType v1 = VectorType::Random();
+  VectorType v2 = VectorType::Random();
+  HVectorType h1 = HVectorType::Random();
+  Scalar s1 = internal::random<Scalar>();
+  LinearType L = LinearType::Random();
+  MatrixType M = MatrixType::Random();
+
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2c, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, v1.asDiagonal(), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, ScalingType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(s1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, TranslationType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity_left(A1c, A1, P1, L, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, R, v2, h1) );
+
+  VERIFY_IS_APPROX( A1*(M*h1), (A1*M)*h1 );
+  VERIFY_IS_APPROX( A1c*(M*h1), (A1c*M)*h1 );
+  VERIFY_IS_APPROX( P1*(M*h1), (P1*M)*h1 );
+
+  VERIFY_IS_APPROX( M*(A1*h1), (M*A1)*h1 );
+  VERIFY_IS_APPROX( M*(A1c*h1), (M*A1c)*h1 );
+  VERIFY_IS_APPROX( M*(P1*h1), ((M*P1)*h1) );
+}
+
 template<typename Scalar> void transform_alignment()
@@ -562,5 +638,8 @@ void test_geo_transformations()
     CALL_SUBTEST_7(( transform_products<double,3,RowMajor|AutoAlign>() ));
     CALL_SUBTEST_7(( transform_products<float,2,AutoAlign>() ));
+
+    CALL_SUBTEST_8(( transform_associativity<double,2,ColMajor>(Rotation2D<double>(internal::random<double>()*double(EIGEN_PI))) ));
+    CALL_SUBTEST_8(( transform_associativity<double,3,ColMajor>(Quaterniond::UnitRandom()) ));
   }
 }

View File

@@ -365,6 +365,7 @@ template<typename Scalar> void packetmath_real()
   }
 
   if (PacketTraits::HasTanh) {
+    // NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
     packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
     h.store(data2, internal::ptanh(h.load(data1)));

View File

@@ -256,6 +256,51 @@ Index compute_block_size()
   return ret;
 }
 
+template<int>
+void bug_1308()
+{
+  int n = 10;
+  MatrixXd r(n,n);
+  VectorXd v = VectorXd::Random(n);
+  r = v * RowVectorXd::Ones(n);
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
+  r = VectorXd::Ones(n) * v.transpose();
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
+
+  Matrix4d ones44 = Matrix4d::Ones();
+  Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+  typedef Matrix<double,4,4,RowMajor> RMatrix4d;
+  RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+//  RowVector4d r4;
+  m44.setOnes();
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
+}
+
 void test_product_extra()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -268,8 +313,10 @@ void test_product_extra()
   }
   CALL_SUBTEST_5( bug_127<0>() );
   CALL_SUBTEST_5( bug_817<0>() );
+  CALL_SUBTEST_5( bug_1308<0>() );
   CALL_SUBTEST_6( unaligned_objects<0>() );
   CALL_SUBTEST_7( compute_block_size<float>() );
   CALL_SUBTEST_7( compute_block_size<double>() );
   CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
 }

View File

@@ -12,6 +12,7 @@
 #include <Eigen/LU>
 
 // regression test for bug 447
+template<int>
 void product1x1()
 {
   Matrix<float,1,3> matAstatic;
@@ -209,15 +210,34 @@ void test_linear_but_not_vectorizable()
   }
 }
 
+template<int Rows>
+void bug_1311()
+{
+  Matrix< double, Rows, 2 > A;  A.setRandom();
+  Vector2d b = Vector2d::Random() ;
+  Matrix<double,Rows,1> res;
+  res.noalias() = 1. * (A * b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = 1.*A * b;
+  VERIFY_IS_APPROX(res, A*b);
+
+  res.noalias() = (1.*A).lazyProduct(b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+}
+
 void test_product_small()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
-    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
+    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
+    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );
-    CALL_SUBTEST_6( product1x1() );
+    CALL_SUBTEST_6( product1x1<0>() );
 
     CALL_SUBTEST_11( test_lazy_l1<float>() );
     CALL_SUBTEST_12( test_lazy_l2<float>() );
@@ -238,6 +258,9 @@ void test_product_small()
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));
+
+    CALL_SUBTEST_6( bug_1311<3>() );
+    CALL_SUBTEST_6( bug_1311<5>() );
   }
 
 #ifdef EIGEN_TEST_PART_6

View File

@@ -7,6 +7,16 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+template<typename T>
+Array<T,4,1> four_denorms();
+
+template<>
+Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
+template<>
+Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+template<typename T>
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
+
 template<typename MatrixType>
 void svd_fill_random(MatrixType &m, int Option = 0)
 {
@@ -55,7 +65,8 @@ void svd_fill_random(MatrixType &m, int Option = 0)
   }
 
   Matrix<Scalar,Dynamic,1> samples(9);
-  samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
+  samples << 0, four_denorms<RealScalar>(),
+             -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
 
   if(Option==Symmetric)
   {

View File

@@ -61,8 +61,9 @@ typedef unsigned __int64 uint64_t;
 #ifdef EIGEN_USE_GPU
 #include <iostream>
 #include <cuda_runtime.h>
-#if defined(__CUDACC__)
-#include <curand_kernel.h>
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
 #endif
 #endif
 
@@ -81,6 +82,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorDimensions.h"
 #include "src/Tensor/TensorInitializer.h"
 #include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
 #include "src/Tensor/TensorUInt128.h"
 #include "src/Tensor/TensorIntDiv.h"
 #include "src/Tensor/TensorGlobalFunctions.h"

View File

@@ -51,12 +51,15 @@ class TensorOpCost {
         internal::scalar_cast_op<SrcType, TargetType> >::Cost;
   }
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
       : bytes_loaded_(bytes_loaded),
         bytes_stored_(bytes_stored),
         compute_cycles_(compute_cycles) {}
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
                bool vectorized, double packet_size)
       : bytes_loaded_(bytes_loaded),

View File

@@ -42,7 +42,21 @@ static bool m_devicePropInitialized = false;
 
 static void initializeDeviceProp() {
   if (!m_devicePropInitialized) {
-    if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutex don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support c++11 we
+    // can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+    static std::atomic<bool> first(true);
+    if (first.exchange(false)) {
+#else
+    static bool first = true;
+    if (first) {
+      first = false;
+#endif
+      // We're the first thread to reach this point.
       int num_devices;
       cudaError_t status = cudaGetDeviceCount(&num_devices);
       if (status != cudaSuccess) {
@@ -63,7 +77,19 @@ static void initializeDeviceProp() {
           assert(status == cudaSuccess);
         }
       }
+#if __cplusplus >= 201103L
+      std::atomic_thread_fence(std::memory_order_release);
+#endif
       m_devicePropInitialized = true;
+    } else {
+      // Wait for the other thread to inititialize the properties.
+      while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        sleep(1);
+      }
     }
   }
 }
@@ -168,39 +194,20 @@ struct GpuDevice {
     return stream_->stream();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return stream_->allocate(num_bytes);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     stream_->deallocate(buffer);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
     return stream_->semaphore();
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -210,30 +217,22 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -242,21 +241,21 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE size_t numThreads() const {
     // FIXME
     return 32;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     // FIXME
     return 48*1024;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
     // We won't try to take advantage of the l2 cache for the time being, and
     // there is no l3 cache on cuda devices.
     return firstLevelCacheSize();
@@ -276,56 +275,26 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
     return stream_->deviceProperties().multiProcessorCount;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
   }
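
The initialization pattern above, reduced to its skeleton (our sketch, C++11 path only): exactly one thread wins the exchange and performs the work; the release fence pairs with the acquire fences of the waiting threads so the published data is visible once the flag reads true. Like the original, it leaves the flag a plain bool, which nvcc tolerates but strict C++11 would call a data race.

#include <atomic>
#include <unistd.h>

static bool resource_ready = false;  // stands in for m_devicePropInitialized

void init_once() {
  static std::atomic<bool> first(true);
  if (first.exchange(false)) {
    // ... expensive one-time initialization goes here ...
    std::atomic_thread_fence(std::memory_order_release);
    resource_ready = true;
  } else {
    // Losing threads wait until the winner publishes the result.
    while (!resource_ready) {
      std::atomic_thread_fence(std::memory_order_acquire);
      sleep(1);
    }
  }
}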

View File

@@ -234,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
   const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const Index step_size = blockDim.x * gridDim.x;
 
-  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
-  // complex types such as evaluators we should really conform to the C++
-  // standard and call a proper copy constructor.
-  Evaluator eval(memcopied_eval);
-
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
   EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
 }

View File

@@ -99,7 +99,8 @@ template <typename T> struct SumReducer
   static const bool IsStateful = false;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -145,7 +146,8 @@ template <typename T> struct MeanReducer
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
     scalarCount_++;
   }
   template <typename Packet>
@@ -190,25 +192,25 @@ struct reducer_traits<MeanReducer<T>, Device> {
 template <typename T, bool IsMax = true, bool IsInteger = true>
 struct MinMaxBottomValue {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::lowest();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, true, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
    return -Eigen::NumTraits<T>::infinity();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, true> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
    return Eigen::NumTraits<T>::highest();
  }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
    return Eigen::NumTraits<T>::infinity();
  }
 };
@@ -439,448 +441,6 @@ struct reducer_traits<ArgMinTupleReducer<T>, Device> {
 };
// Random number generation
namespace {
#ifdef __CUDA_ARCH__
__device__ int get_random_seed() {
return clock();
}
#else
static inline int get_random_seed() {
#ifdef _WIN32
SYSTEMTIME st;
GetSystemTime(&st);
return st.wSecond + 1000 * st.wMilliseconds;
#elif defined __APPLE__
return static_cast<int>(mach_absolute_time());
#else
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return static_cast<int>(ts.tv_nsec);
#endif
}
#endif
}
#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
// We're not compiling a cuda kernel
template <typename T> class UniformRandomGenerator {
public:
static const bool PacketAccess = true;
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
if (!deterministic) {
srand(get_random_seed());
}
}
UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
}
T operator()() const {
return random<T>();
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = random<T>();
}
return internal::pload<PacketType>(values);
}
private:
bool m_deterministic;
};
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
template <> class UniformRandomGenerator<float> {
public:
static const bool PacketAccess = true;
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
if (!deterministic) {
m_generator->seed(get_random_seed());
}
}
UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
m_generator = new std::mt19937();
m_generator->seed(other() * UINT_MAX);
m_deterministic = other.m_deterministic;
}
~UniformRandomGenerator() {
delete m_generator;
}
float operator()() const {
return m_distribution(*m_generator);
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX float values[packetSize];
for (int k = 0; k < packetSize; ++k) {
values[k] = this->operator()();
}
return internal::pload<PacketType>(values);
}
private:
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
// Make sure m_deterministic comes first to match the layout of the cpu
// version of the code.
bool m_deterministic;
std::mt19937* m_generator;
mutable std::uniform_real_distribution<float> m_distribution;
};
template <> class UniformRandomGenerator<double> {
public:
static const bool PacketAccess = true;
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
if (!deterministic) {
m_generator->seed(get_random_seed());
}
}
UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
m_generator = new std::mt19937();
m_generator->seed(other() * UINT_MAX);
m_deterministic = other.m_deterministic;
}
~UniformRandomGenerator() {
delete m_generator;
}
double operator()() const {
return m_distribution(*m_generator);
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX double values[packetSize];
for (int k = 0; k < packetSize; ++k) {
values[k] = this->operator()();
}
return internal::pload<PacketType>(values);
}
private:
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
// Make sure m_deterministic comes first to match the layout of the cpu
// version of the code.
bool m_deterministic;
std::mt19937* m_generator;
mutable std::uniform_real_distribution<double> m_distribution;
};
#endif
#else
// We're compiling a cuda kernel
template <typename T> class UniformRandomGenerator;
template <> class UniformRandomGenerator<float> {
public:
static const bool PacketAccess = true;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ float operator()() const {
return curand_uniform(&m_state);
}
template<typename PacketType>
__device__ float4 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_uniform4(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class UniformRandomGenerator<double> {
public:
static const bool PacketAccess = true;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ double operator()() const {
return curand_uniform_double(&m_state);
}
template<typename PacketType>
__device__ double2 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_uniform2_double(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class UniformRandomGenerator<std::complex<float> > {
public:
static const bool PacketAccess = false;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<float> operator()() const {
float4 vals = curand_uniform4(&m_state);
return std::complex<float>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class UniformRandomGenerator<std::complex<double> > {
public:
static const bool PacketAccess = false;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<double> operator()() const {
double2 vals = curand_uniform2_double(&m_state);
return std::complex<double>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
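// Illustration only, not part of the library: a minimal hypothetical kernel
// showing how these device-side generators are meant to be driven. Each
// thread constructs its own generator; the constructor seeds a per-thread
// Philox state from blockIdx/threadIdx, so threads draw decorrelated values.
__global__ void example_fill_uniform(float* out, int n) {
  UniformRandomGenerator<float> gen(/*deterministic=*/true);
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = gen();
  }
}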
#endif
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
enum {
// Rough estimate.
Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
};
};
#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
// We're not compiling a cuda kernel
template <typename T> class NormalRandomGenerator {
public:
static const bool PacketAccess = true;
NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
if (!deterministic) {
m_generator->seed(get_random_seed());
}
}
NormalRandomGenerator(const NormalRandomGenerator& other)
: m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
m_generator->seed(other() * UINT_MAX);
}
~NormalRandomGenerator() {
delete m_generator;
}
T operator()() const {
return m_distribution(*m_generator);
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = m_distribution(*m_generator);
}
return internal::pload<PacketType>(values);
}
private:
// No assignment
NormalRandomGenerator& operator = (const NormalRandomGenerator&);
bool m_deterministic;
mutable std::normal_distribution<T> m_distribution;
std::mt19937* m_generator;
};
#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
// We're compiling a cuda kernel
template <typename T> class NormalRandomGenerator;
template <> class NormalRandomGenerator<float> {
public:
static const bool PacketAccess = true;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ float operator()() const {
return curand_normal(&m_state);
}
template<typename PacketType>
__device__ float4 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_normal4(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class NormalRandomGenerator<double> {
public:
static const bool PacketAccess = true;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ double operator()() const {
return curand_normal_double(&m_state);
}
template<typename PacketType>
__device__ double2 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_normal2_double(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class NormalRandomGenerator<std::complex<float> > {
public:
static const bool PacketAccess = false;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<float> operator()() const {
float4 vals = curand_normal4(&m_state);
return std::complex<float>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class NormalRandomGenerator<std::complex<double> > {
public:
static const bool PacketAccess = false;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<double> operator()() const {
double2 vals = curand_normal2_double(&m_state);
return std::complex<double>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
#else
template <typename T> class NormalRandomGenerator {
public:
static const bool PacketAccess = false;
NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
private:
bool m_deterministic;
};
#endif
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
enum {
// Rough estimate.
Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
};
};
template <typename T, typename Index, size_t NumDims>
class GaussianGenerator {
public:
@ -895,7 +455,7 @@ class GaussianGenerator {
}
}
EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
T tmp = T(0);
for (size_t i = 0; i < NumDims; ++i) {
T offset = coordinates[i] - m_means[i];

View File

@ -0,0 +1,276 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
namespace Eigen {
namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#ifdef __CUDA_ARCH__
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);
return clock64() +
blockIdx.x * blockDim.x + threadIdx.x +
gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
#elif defined _WIN32
// Use the current time as a baseline.
SYSTEMTIME st;
GetSystemTime(&st);
int time = st.wSecond + 1000 * st.wMilliseconds;
// Mix in a random number to make sure that we get different seeds if
// we try to generate seeds faster than the clock resolution.
// We need 2 random values since the generator only generates 16 bits at
// a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
uint rnd1 = ::rand();
uint rnd2 = ::rand();
uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
return rnd;
#elif defined __APPLE__
// Same approach as for win32, except that the random number generator
// is better (see https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
uint64_t rnd = ::random() ^ mach_absolute_time();
return rnd;
#else
// Augment the current time with pseudo random number generation
// to ensure that we get different seeds if we try to generate seeds
// faster than the clock resolution.
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
uint64_t rnd = ::random() ^ ts.tv_nsec;
return rnd;
#endif
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
// TODO: Unify with the implementation in the non blocking thread pool.
uint64_t current = *state;
// Update the internal state
*state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
// Generate the random output (using the PCG-XSH-RS scheme)
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
seed = seed ? seed : get_random_seed();
return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}
} // namespace
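// Illustration only, not part of the commit: a small helper exercising the
// PCG-XSH-RS scheme above. Seeding with a fixed non-zero value makes the
// 32-bit output stream fully reproducible; each draw advances the 64-bit
// state through the same LCG step used by PCG_XSH_RS_state.
inline void example_pcg_stream() {
  uint64_t state = PCG_XSH_RS_state(42);           // fixed seed -> deterministic
  unsigned first = PCG_XSH_RS_generator(&state);   // first 32-bit draw
  unsigned second = PCG_XSH_RS_generator(&state);  // differs from the first
  EIGEN_UNUSED_VARIABLE(first);
  EIGEN_UNUSED_VARIABLE(second);
}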
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeUniform(uint64_t* state) {
unsigned rnd = PCG_XSH_RS_generator(state);
return static_cast<T>(rnd);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
Eigen::half result;
// Generate 10 random bits for the mantissa
unsigned rnd = PCG_XSH_RS_generator(state);
result.x = static_cast<uint16_t>(rnd & 0x3ffu);
// Set the exponent
result.x |= (static_cast<uint16_t>(15) << 10);
// Return the final result
return result - Eigen::half(1.0f);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float RandomToTypeUniform<float>(uint64_t* state) {
typedef union {
uint32_t raw;
float fp;
} internal;
internal result;
// Generate 23 random bits for the mantissa
const unsigned rnd = PCG_XSH_RS_generator(state);
result.raw = rnd & 0x7fffffu;
// Set the exponent
result.raw |= (static_cast<uint32_t>(127) << 23);
// Return the final result
return result.fp - 1.0f;
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double RandomToTypeUniform<double>(uint64_t* state) {
typedef union {
uint64_t raw;
double dp;
} internal;
internal result;
result.raw = 0;
// Generate 52 random bits for the mantissa
// First generate the upper 20 bits
unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
// Then generate the lower 32 bits
unsigned rnd2 = PCG_XSH_RS_generator(state);
result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
// Set the exponent
result.raw |= (static_cast<uint64_t>(1023) << 52);
// Return the final result
return result.dp - 1.0;
}
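// Illustration only: the exponent/mantissa trick used by the float and double
// specializations above, spelled out as a standalone helper. Filling the 23
// mantissa bits while forcing the biased exponent to 127 builds a float in
// [1, 2); subtracting 1.0f then maps it to [0, 1) without any division.
inline float example_bits_to_unit_float(uint32_t rnd) {
  union { uint32_t raw; float fp; } u;
  u.raw = (rnd & 0x7fffffu) | (static_cast<uint32_t>(127) << 23);
  return u.fp - 1.0f;  // 0x000000 -> 0.0f, 0x7fffff -> just under 1.0f
}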
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
return std::complex<float>(RandomToTypeUniform<float>(state),
RandomToTypeUniform<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
return std::complex<double>(RandomToTypeUniform<double>(state),
RandomToTypeUniform<double>(state));
}
template <typename T> class UniformRandomGenerator {
public:
static const bool PacketAccess = true;
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
const UniformRandomGenerator& other) {
m_state = other.m_state;
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
uint64_t local_state = m_state + i;
T result = RandomToTypeUniform<T>(&local_state);
m_state = local_state;
return result;
}
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeUniform<T>(&local_state);
}
m_state = local_state;
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
};
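// Illustration only: how an evaluator-style caller drives the generator
// above. Each call derives a local state from m_state plus the coefficient
// index, then stores the advanced state back, which keeps the scalar and
// packet paths consistent without any locking.
inline void example_uniform_draws() {
  UniformRandomGenerator<float> gen(/*seed=*/123);  // non-zero -> reproducible
  float c0 = gen(0);  // draw for coefficient 0
  float c1 = gen(1);  // draw for coefficient 1
  EIGEN_UNUSED_VARIABLE(c0);
  EIGEN_UNUSED_VARIABLE(c1);
}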
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
enum {
// Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
Cost = 12 * NumTraits<Scalar>::AddCost *
((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
};
};
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeNormal(uint64_t* state) {
// Use the ratio-of-uniforms method to generate numbers following a normal
// distribution. See for example Numerical Recipes chapter 7.3.9 for the
// details.
T u, v, q;
do {
u = RandomToTypeUniform<T>(state);
v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
const T x = u - T(0.449871);
const T y = numext::abs(v) + T(0.386595);
q = x*x + y * (T(0.196)*y - T(0.25472)*x);
} while (q > T(0.27597) &&
(q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
return v/u;
}
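// Illustration only: a host-side sanity check for the ratio-of-uniforms
// sampler above. Over many draws the sample mean should approach 0 and the
// sample variance 1; the loose 0.05 bounds are an assumption for this sketch,
// not a test shipped with the library.
inline bool example_normal_moments_ok() {
  uint64_t state = PCG_XSH_RS_state(7);  // fixed seed for reproducibility
  double sum = 0.0, sum_sq = 0.0;
  const int n = 100000;
  for (int i = 0; i < n; ++i) {
    const double x = RandomToTypeNormal<double>(&state);
    sum += x;
    sum_sq += x * x;
  }
  const double mean = sum / n;
  const double var = sum_sq / n - mean * mean;
  return numext::abs(mean) < 0.05 && numext::abs(var - 1.0) < 0.05;
}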
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
return std::complex<float>(RandomToTypeNormal<float>(state),
RandomToTypeNormal<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
return std::complex<double>(RandomToTypeNormal<double>(state),
RandomToTypeNormal<double>(state));
}
template <typename T> class NormalRandomGenerator {
public:
static const bool PacketAccess = true;
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
const NormalRandomGenerator& other) {
m_state = other.m_state;
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
uint64_t local_state = m_state + i;
T result = RandomToTypeNormal<T>(&local_state);
m_state = local_state;
return result;
}
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeNormal<T>(&local_state);
}
m_state = local_state;
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
};
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
enum {
// On average, we need to generate about 3 random numbers
// 15 mul, 8 add, 1.5 logs
Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
};
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
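A minimal usage sketch, not part of the commit: user code typically reaches these generators through TensorBase::setRandom, which accepts a generator type as a template argument (assuming the unsupported Tensor module's templated setRandom overload).

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();  // uniform draws by default
  t.setRandom<Eigen::internal::NormalRandomGenerator<float> >();  // normal draws
  return 0;
}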

View File

@ -20,37 +20,60 @@ public:
AutoDiffJacobian(const Functor& f) : Functor(f) {}
// forward constructors
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... T>
AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
#else
template<typename T0>
AutoDiffJacobian(const T0& a0) : Functor(a0) {}
template<typename T0, typename T1>
AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
template<typename T0, typename T1, typename T2>
AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
#endif
typedef typename Functor::InputType InputType;
typedef typename Functor::ValueType ValueType;
typedef typename ValueType::Scalar Scalar;
enum {
InputsAtCompileTime = InputType::RowsAtCompileTime,
ValuesAtCompileTime = ValueType::RowsAtCompileTime
};
typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
typedef typename JacobianType::Index Index;
typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
typedef AutoDiffScalar<DerivativeType> ActiveScalar;
typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
#if EIGEN_HAS_VARIADIC_TEMPLATES
// Some compilers don't accept variadic parameters after a default parameter,
// i.e., we can't just write _jac=0 but we need to overload operator():
EIGEN_STRONG_INLINE
void operator() (const InputType& x, ValueType* v) const
{
this->operator()(x, v, 0);
}
template<typename... ParamsType>
void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
const ParamsType&... Params) const
#else
void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
#endif
{
eigen_assert(v!=0);
if (!_jac)
{
#if EIGEN_HAS_VARIADIC_TEMPLATES
Functor::operator()(x, v, Params...);
#else
Functor::operator()(x, v);
#endif
return;
}
@ -61,12 +84,16 @@ public:
if(InputsAtCompileTime==Dynamic)
for (Index j=0; j<jac.rows(); j++)
av[j].derivatives().resize(x.rows());
for (Index i=0; i<jac.cols(); i++)
ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
#if EIGEN_HAS_VARIADIC_TEMPLATES
Functor::operator()(ax, &av, Params...);
#else
Functor::operator()(ax, &av);
#endif
for (Index i=0; i<jac.rows(); i++)
{
@ -74,8 +101,6 @@ public:
jac.row(i) = av[i].derivatives();
}
}
};
}

View File

@ -189,7 +189,12 @@ namespace Eigen
res[0] = atan2(mat(J,K), mat(K,K));
Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
res[1] = atan2(-mat(I,K), -c2);
}
else
@ -212,7 +217,12 @@ namespace Eigen
res[0] = atan2(mat(J,I), mat(K,I));
if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
{
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
res[1] = -atan2(s2, mat(I,I));
}

View File

@ -203,7 +203,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
endif()
if(EIGEN_TEST_CUDA_CLANG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(cxx11_tensor_complex_cuda)
ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
ei_add_test(cxx11_tensor_reduction_cuda)
ei_add_test(cxx11_tensor_argmax_cuda)
ei_add_test(cxx11_tensor_cast_float16_cuda)

View File

@ -105,6 +105,89 @@ struct TestFunc1
}
};
#if EIGEN_HAS_VARIADIC_TEMPLATES
/* Test functor for the C++11 features. */
template <typename Scalar>
struct integratorFunctor
{
typedef Matrix<Scalar, 2, 1> InputType;
typedef Matrix<Scalar, 2, 1> ValueType;
/*
* Implementation starts here.
*/
integratorFunctor(const Scalar gain) : _gain(gain) {}
integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
const Scalar _gain;
template <typename T1, typename T2>
void operator() (const T1 &input, T2 *output, const Scalar dt) const
{
T2 &o = *output;
/* Integrator to test the AD. */
o[0] = input[0] + input[1] * dt * _gain;
o[1] = input[1] * _gain;
}
/* Only needed for the test */
template <typename T1, typename T2, typename T3>
void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
{
T2 &o = *output;
/* Integrator to test the AD. */
o[0] = input[0] + input[1] * dt * _gain;
o[1] = input[1] * _gain;
if (jacobian)
{
T3 &j = *jacobian;
j(0, 0) = 1;
j(0, 1) = dt * _gain;
j(1, 0) = 0;
j(1, 1) = _gain;
}
}
};
template<typename Func> void forward_jacobian_cpp11(const Func& f)
{
typedef typename Func::ValueType::Scalar Scalar;
typedef typename Func::ValueType ValueType;
typedef typename Func::InputType InputType;
typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;
InputType x = InputType::Random(InputType::RowsAtCompileTime);
ValueType y, yref;
JacobianType j, jref;
const Scalar dt = internal::random<double>();
jref.setZero();
yref.setZero();
f(x, &yref, &jref, dt);
//std::cerr << "y, yref, jref: " << "\n";
//std::cerr << y.transpose() << "\n\n";
//std::cerr << yref << "\n\n";
//std::cerr << jref << "\n\n";
AutoDiffJacobian<Func> autoj(f);
autoj(x, &y, &j, dt);
//std::cerr << "y j (via autodiff): " << "\n";
//std::cerr << y.transpose() << "\n\n";
//std::cerr << j << "\n\n";
VERIFY_IS_APPROX(y, yref);
VERIFY_IS_APPROX(j, jref);
}
#endif
template<typename Func> void forward_jacobian(const Func& f)
{
typename Func::InputType x = Func::InputType::Random(f.inputs());
@ -128,7 +211,6 @@ template<typename Func> void forward_jacobian(const Func& f)
VERIFY_IS_APPROX(j, jref);
}
// TODO also check actual derivatives!
template <int>
void test_autodiff_scalar()
@ -141,6 +223,7 @@ void test_autodiff_scalar()
VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
}
// TODO also check actual derivatives!
template <int>
void test_autodiff_vector()
@ -164,6 +247,9 @@ void test_autodiff_jacobian()
CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
#if EIGEN_HAS_VARIADIC_TEMPLATES
CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
#endif
}

View File

@ -71,8 +71,45 @@ void test_cuda_nullary() {
}
static void test_cuda_sum_reductions() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
const int num_cols = internal::random<int>(1024, 5*1024);
Tensor<std::complex<float>, 2> in(num_rows, num_cols);
in.setRandom();
Tensor<std::complex<float>, 0> full_redux;
full_redux = in.sum();
std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
out_gpu.device(gpu_device) = in_gpu.sum();
Tensor<std::complex<float>, 0> full_redux_gpu;
gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
gpu_device.synchronize();
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
gpu_device.deallocate(gpu_in_ptr);
gpu_device.deallocate(gpu_out_ptr);
}
void test_cxx11_tensor_complex()
{
CALL_SUBTEST(test_cuda_nullary());
CALL_SUBTEST(test_cuda_sum_reductions());
}

View File

@ -0,0 +1,97 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<typename T>
void test_cuda_complex_cwise_ops() {
const int kNumItems = 2;
std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
std::complex<T>* d_in1;
std::complex<T>* d_in2;
std::complex<T>* d_out;
cudaMalloc((void**)(&d_in1), complex_bytes);
cudaMalloc((void**)(&d_in2), complex_bytes);
cudaMalloc((void**)(&d_out), complex_bytes);
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
d_in1, kNumItems);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
d_in2, kNumItems);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
d_out, kNumItems);
const std::complex<T> a(3.14f, 2.7f);
const std::complex<T> b(-10.6f, 1.4f);
gpu_in1.device(gpu_device) = gpu_in1.constant(a);
gpu_in2.device(gpu_device) = gpu_in2.constant(b);
enum CwiseOp {
Add = 0,
Sub,
Mul,
Div
};
Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
for (int op = Add; op <= Div; op++) {
std::complex<T> expected;
switch (static_cast<CwiseOp>(op)) {
case Add:
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
expected = a + b;
break;
case Sub:
gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
expected = a - b;
break;
case Mul:
gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
expected = a * b;
break;
case Div:
gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
expected = a / b;
break;
}
assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < kNumItems; ++i) {
VERIFY_IS_APPROX(actual(i), expected);
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
}
void test_cxx11_tensor_complex_cwise_ops()
{
CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
}