Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-07-21)

Commit ae1385c7e4: "Pull the latest updates from trunk"
@@ -411,6 +411,10 @@ using std::ptrdiff_t;
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"
 
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#include "src/Core/arch/CUDA/Complex.h"
+
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -87,6 +87,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
@@ -97,6 +98,7 @@ template<typename Derived> class ArrayBase
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -817,73 +817,79 @@ struct mapbase_evaluator : evaluator_base<Derived>
     ColsAtCompileTime = XprType::ColsAtCompileTime,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 
   EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
-    : m_data(const_cast<PointerType>(map.data())),
-      m_xpr(map)
+    : m_data(const_cast<PointerType>(map.data())),
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
   {
     EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::ploadt<PacketType, LoadMode>(ptr);
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
 
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index row, Index col, const PacketType& x)
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
 
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
 
 protected:
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
 
   PointerType m_data;
-  const XprType& m_xpr;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
 template<typename PlainObjectType, int MapOptions, typename StrideType>
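Note: the evaluator now caches the two strides in internal::variable_if_dynamic members instead of keeping a reference to the whole Map expression. A minimal sketch of that idea (simplified, not Eigen's actual implementation; the name variable_if_dynamic_sketch and the use of -1 for Dynamic are illustrative assumptions):

#include <cstddef>

// When the stride is a compile-time constant the wrapper stores nothing and
// value() folds to that constant; only a Dynamic stride occupies storage.
template <typename T, int Value>
struct variable_if_dynamic_sketch {
  explicit variable_if_dynamic_sketch(T) {}         // runtime value ignored
  static constexpr T value() { return T(Value); }   // compile-time constant
};

template <typename T>
struct variable_if_dynamic_sketch<T, -1> {          // -1 stands in for Eigen::Dynamic
  explicit variable_if_dynamic_sketch(T v) : m_value(v) {}
  T value() const { return m_value; }               // genuine runtime storage
  T m_value;
};

int main() {
  variable_if_dynamic_sketch<std::ptrdiff_t, 1>  fixed(1);  // stride known statically
  variable_if_dynamic_sketch<std::ptrdiff_t, -1> dyn(64);   // stride known at runtime
  return static_cast<int>(fixed.value() + dyn.value());     // 1 + 64
}

For a Map whose strides are known at compile time this makes coeff() plain pointer arithmetic, which is the point of replacing m_xpr.innerStride() with m_innerStride.value().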
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
   * and yields faster code than the random access version.
   *
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -389,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
   * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
@@ -558,12 +558,15 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC void reverseInPlace();
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #   include "../plugins/BlockMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     // disable the use of evalTo for dense objects with a nice compilation error
     template<typename Dest>
@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize     = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
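Note: the #else branch above enforces alignment by hand. A self-contained sketch of the same round-up trick (a hypothetical align_up helper with a fixed 16-byte alignment; Eigen uses EIGEN_MAX_ALIGN_BYTES):

#include <cassert>
#include <cstdint>

constexpr std::size_t kAlign = 16;  // assumed alignment, for illustration

inline float* align_up(float* raw) {
  std::uintptr_t p = reinterpret_cast<std::uintptr_t>(raw);
  // Clearing the low bits with & ~(kAlign-1) rounds *down*, so one full unit
  // is added first: the result is aligned, strictly above raw, and at most
  // kAlign bytes past it — which the padded buffer below accounts for.
  return reinterpret_cast<float*>((p & ~(std::uintptr_t(kAlign) - 1)) + kAlign);
}

int main() {
  float buffer[8 + kAlign / sizeof(float)];  // over-allocated, like m_data above
  float* aligned = align_up(buffer);
  assert(reinterpret_cast<std::uintptr_t>(aligned) % kAlign == 0);
  aligned[0] = 1.0f;  // safe: at least 8 floats remain past 'aligned'
  return 0;
}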
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
     ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
@@ -50,7 +50,7 @@ public:
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
 
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}
 
@@ -97,6 +97,19 @@ struct real_default_impl<Scalar,true>
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct real_retval
 {
@@ -132,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct imag_retval
 {
@@ -1049,12 +1075,12 @@ double abs(const double &x) { return ::fabs(x); }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const std::complex<float>& x) {
-  return ::hypotf(real(x), imag(x));
+  return ::hypotf(x.real(), x.imag());
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const std::complex<double>& x) {
-  return ::hypot(real(x), imag(x));
+  return ::hypot(x.real(), x.imag());
 }
 #endif
 
@@ -1312,11 +1338,12 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
@@ -29,8 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  const T x = pmax(minus_9, pmin(plus_9, a_x));
 
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  // step such that if a_x is nan, x will be either 9 or -9,
+  // and tanh will return 1 or -1 instead of nan.
+  // This is supposed to be fixed in gcc6.3,
+  // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
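Note: the clamp to [-9, 9] is precisely what the buggy gcc max/min reordering turns a NaN input into. The interval itself is chosen because single precision cannot distinguish tanh(x) from +/-1 beyond it (a small host-side check, not part of the commit):

#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
  // 1 - tanh(9) is ~3.0e-8, already well below FLT_EPSILON (~1.2e-7), so in
  // float the function is saturated outside [-9, 9] and the rational
  // approximation only needs to be accurate inside that interval.
  std::printf("1 - tanh(9) = %g\n", 1.0 - std::tanh(9.0));
  std::printf("FLT_EPSILON = %g\n", (double)FLT_EPSILON);
  return 0;
}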
@@ -98,7 +98,7 @@ template<typename Derived> class MatrixBase
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
     EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
 
     typedef typename Base::PlainObject PlainObject;
 
@@ -121,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -129,6 +130,7 @@ template<typename Derived> class MatrixBase
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -328,15 +330,11 @@ template<typename Derived> class MatrixBase
 
 /////////// LU module ///////////
 
-    EIGEN_DEVICE_FUNC
     inline const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> partialPivLu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> lu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
@@ -265,7 +265,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
   for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }
 
 // Row major result
@@ -278,7 +278,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
   for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }
 
 template<typename Lhs, typename Rhs>
@@ -437,6 +437,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes= " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes= " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment= " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment= " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs= " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs= " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner= " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor= " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment= " << Alignment << "\n";
+    std::cerr << "Flags= " << Flags << "\n";
+#endif
   }
 
   // Everything below here is taken from CoeffBasedProduct.h
@@ -503,8 +515,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
     RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
 
-    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
               : 0,
 
     /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
@@ -590,7 +602,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
   }
 };
 
@@ -600,7 +612,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res = pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
   }
 };
 
@@ -609,7 +621,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
   }
 };
 
@@ -618,7 +630,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
 
@@ -627,7 +639,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -636,7 +648,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -645,7 +657,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
   }
@@ -656,7 +668,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }
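Note: the last two pairs of hunks replace pset1<Packet>(0) by a zero constructed in the packet's own scalar type. A standalone analogue of why that matters (stand-in types and hypothetical set1/zero_packet helpers, not Eigen's code):

#include <cstdio>

struct half  { float v; };      // stand-in for Eigen::half (no ctor from int)
struct half2 { half lo, hi; };  // stand-in for the CUDA half2 packet

template <typename Packet> struct unpacket_traits;
template <> struct unpacket_traits<half2> { typedef half type; };

half2 set1(half x) { half2 p = {x, x}; return p; }  // plays the role of pset1

template <typename Packet>
Packet zero_packet() {
  typedef typename unpacket_traits<Packet>::type Scalar;
  // set1(0) would not compile here: there is no conversion from an int
  // literal to 'half'. Spelling out the scalar type fixes the overload.
  return set1(Scalar{0.0f});
}

int main() {
  half2 z = zero_packet<half2>();
  std::printf("%g %g\n", z.lo.v, z.hi.v);  // 0 0
  return 0;
}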
New file (103 lines): Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+
+// Sum
+template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Difference
+template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Product
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasMul
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Quotient
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_CUDA_H
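Note: the functors above spell complex arithmetic out componentwise so every operation is a plain device-compatible float/double op. A quick host-side sanity check that the manual formulas agree with std::complex (hypothetical test code, not part of the commit):

#include <cassert>
#include <cmath>
#include <complex>

int main() {
  std::complex<float> a(1.5f, -2.0f), b(0.25f, 4.0f);
  const float ar = a.real(), ai = a.imag(), br = b.real(), bi = b.imag();

  // Product, as in scalar_product_op: (ar*br - ai*bi) + i*(ar*bi + ai*br)
  std::complex<float> prod(ar*br - ai*bi, ar*bi + ai*br);
  assert(std::abs(prod - a*b) < 1e-5f);

  // Quotient, as in scalar_quotient_op: multiply by conj(b), scale by 1/|b|^2
  const float norm = 1.0f / (br*br + bi*bi);
  std::complex<float> quot((ar*br + ai*bi) * norm, (ai*br - ar*bi) * norm);
  assert(std::abs(quot - a/b) < 1e-5f);
  return 0;
}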
@@ -41,15 +41,15 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
   return *reinterpret_cast<const half2*>(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[1]);
 }
 
@@ -57,17 +57,17 @@ template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[0]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __ldg((const half2*)from);
 #else
@@ -76,7 +76,7 @@ template<>
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,27 +84,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Ei
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
   return __halves2half2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
   to[stride*0] = __low2half(from);
   to[stride*1] = __high2half(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
   return __low2half(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
   result.x = a.x & 0x7FFF7FFF;
   return result;
 }
 
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+__device__ EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
   __half a1 = __low2half(kernel.packet[0]);
   __half a2 = __high2half(kernel.packet[0]);
@@ -114,7 +114,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -123,7 +123,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
 #else
@@ -137,7 +137,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
 #else
@@ -151,7 +151,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hneg2(a);
 #else
@@ -161,9 +161,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
 #else
@@ -177,7 +177,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
@@ -193,7 +193,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -203,7 +203,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -213,7 +213,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -223,7 +223,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
 #else
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -245,7 +245,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -257,7 +257,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
 #else
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = log1pf(a1);
@@ -277,29 +277,29 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2
 
 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
   return h2log(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
   return h2exp(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
   return h2sqrt(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
   return h2rsqrt(a);
 }
 
 #else
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = logf(a1);
@@ -307,7 +307,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = expf(a1);
@@ -315,7 +315,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
@@ -323,7 +323,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = rsqrtf(a1);
@@ -434,15 +434,16 @@ public:
   template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
     // let gcc allocate the register in which to store the result of the pmul
     // (in the case where there is no FMA) gcc fails to figure out how to avoid
     // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
   }
 
@@ -457,9 +458,6 @@ public:
     r = pmadd(c,alpha,r);
   }
 
 protected:
-//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 
 template<typename RealScalar, bool _ConjLhs>
@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
 
@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
@@ -14,12 +14,13 @@
   // 4512 - assignment operator could not be generated
   // 4522 - 'class' : multiple assignment operators specified
   // 4700 - uninitialized local variable 'xyz' used
+  // 4714 - function marked as __forceinline not inlined
   // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
   // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -67,6 +68,8 @@
     #pragma diag_suppress 2669
     #pragma diag_suppress 2670
     #pragma diag_suppress 2671
+    #pragma diag_suppress 2735
+    #pragma diag_suppress 2737
   #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED
@@ -13,7 +13,7 @@
 
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 93
+#define EIGEN_MINOR_VERSION 94
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -954,8 +954,8 @@ namespace Eigen {
 #  define EIGEN_CATCH(X) catch (X)
 #else
 #  ifdef __CUDA_ARCH__
-#    define EIGEN_THROW_X(X) asm("trap;") return {}
-#    define EIGEN_THROW asm("trap;"); return {}
+#    define EIGEN_THROW_X(X) asm("trap;")
+#    define EIGEN_THROW asm("trap;")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
@@ -275,6 +275,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
     destruct_elements_of_array(ptr, i);
     EIGEN_THROW;
   }
+  return NULL;
 }
 
 /*****************************************************************************
@@ -305,6 +306,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
     aligned_free(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
@@ -320,6 +322,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 /** \internal Deletes objects constructed with aligned_new
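Note: all three hunks add an explicit return after EIGEN_THROW. That pairs with the Macros.h change above, where the __CUDA_ARCH__ definition of EIGEN_THROW dropped its trailing "return {}". A minimal host-side illustration (hypothetical THROW_LIKE stand-in, not Eigen code):

#include <cstddef>
#include <cstdlib>

#define THROW_LIKE() std::abort()  // stand-in for EIGEN_THROW

// Once the error macro is a bare statement, a compiler that cannot prove it
// never returns sees a fall-through path and warns "control reaches end of
// non-void function" — which is what the added returns silence.
int* checked_alloc(std::size_t n) {
  int* result = static_cast<int*>(std::malloc(n * sizeof(int)));
  if (!result) {
    THROW_LIKE();
  }
  return result;  // counterpart of the 'return result;' lines added above
}

int main() {
  std::free(checked_alloc(4));
  return 0;
}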
@@ -671,6 +671,14 @@ struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
 
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
 
 template<bool Vectorized>
 struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
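Note: the operation count in the new complex specialization matches evaluating the quotient by the conjugate trick,

\[
\frac{a}{b} \;=\; \frac{(a_r b_r + a_i b_i) \;+\; i\,(a_i b_r - a_r b_i)}{b_r^2 + b_i^2},
\]

which costs two real divisions (hence 2*scalar_div_cost<T>::value), six multiplications (four in the numerators plus two for |b|^2), and three additions (one per numerator, one in the denominator) — exactly the 6*MulCost + 3*AddCost terms in the enum.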
@@ -158,7 +158,8 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The resulting axis is normalized.
+  *
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   *
   * This function implicitly normalizes the quaternion \a q.
   */
@@ -167,12 +168,16 @@ template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
   using std::atan2;
+  using std::abs;
   Scalar n = q.vec().norm();
-  if (n > Scalar(0))
+  if(n<NumTraits<Scalar>::epsilon())
+    n = q.vec().stableNorm();
+
+  if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, q.w());
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
+    if(q.w() < 0)
+      n = -n;
     m_axis  = q.vec() / n;
   }
   else
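Note: writing a unit quaternion as q = cos(theta/2) + sin(theta/2)(u_x i + u_y j + u_z k) with unit axis u, the assignment above recovers

\[
\theta = 2\,\operatorname{atan2}\!\big(\lVert \mathbf{v}\rVert,\ \lvert w\rvert\big) \in [0,\pi],
\qquad
\mathbf{u} = \mathbf{v}/\lVert \mathbf{v}\rVert,
\]

where v = q.vec() and w = q.w(). Using |w| is what pins the angle into [0,pi]; since q and -q represent the same rotation, flipping the sign of n when w < 0 compensates without changing the represented rotation. The stableNorm() fallback guards against underflow when the vector part is tiny.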
@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,i), coeff(k,i));
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
       {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
         res[1] = -atan2(s2, coeff(i,i));
       }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,k), coeff(k,k));
       Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         res[1] = atan2(-coeff(i,k), -c2);
       }
       else
@@ -402,6 +402,18 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, Homog
   }
 };
 
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
+
 template<typename Lhs,typename Rhs>
 struct homogeneous_left_product_refactoring_helper
 {
@@ -464,7 +464,7 @@ public:
   operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
 
@@ -578,7 +578,7 @@ public:
     return res;
   }
 
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
 
   template<typename Derived>
   inline Transform& operator=(const RotationBase<Derived,Dim>& r);
@@ -853,7 +853,7 @@ Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &oth
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }
 
@@ -130,8 +130,10 @@ public:
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template<typename Derived>
+  inline typename internal::enable_if<Derived::IsVectorAtCompileTime,VectorType>::type
+  operator* (const MatrixBase<Derived>& vec) const
+  { return m_coeffs + vec.derived(); }
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
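Note: both routines apply the Householder reflection

\[
P = I - \tau\, v v^{*}, \qquad v = \begin{pmatrix} 1 \\ \text{essential} \end{pmatrix},
\]

as a rank-one update: M <- M - tau * v (v* M) on the left, M <- M - tau * (M v) v* on the right. For tau = 0 the reflection P is exactly the identity, so the new else if(tau!=Scalar(0)) guard skips the update entirely without changing the result.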
@@ -156,7 +156,7 @@ template<typename _MatrixType> class FullPivLU
   *
   * \sa permutationQ()
   */
-  inline const PermutationPType& permutationP() const
+  EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
   {
     eigen_assert(m_isInitialized && "LU is not initialized.");
     return m_p;
@@ -406,8 +406,8 @@ template<typename _MatrixType> class FullPivLU
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
@@ -665,10 +665,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  // FIXME What about considerering any denormal numbers as zero, using:
-  // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().maxCoeff();
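Note: the new threshold swaps "twice the smallest denormal" for the smallest *normalized* value, so denormals are treated as zero too; their slow, reduced-precision arithmetic is what could keep the Jacobi sweeps from converging (bug 286). For float the two bounds differ by about seven orders of magnitude (a small host-side comparison, not part of the commit):

#include <cstdio>
#include <limits>

int main() {
  // Old bound: ~2.8e-45 for float; new bound: ~1.18e-38.
  std::printf("2*denorm_min = %g\n", 2.0f * std::numeric_limits<float>::denorm_min());
  std::printf("min          = %g\n", (std::numeric_limits<float>::min)());
  return 0;
}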
@@ -141,6 +141,15 @@ template<typename Derived> class SparseMatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -149,8 +158,10 @@ template<typename Derived> class SparseMatrixBase
 #   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #     include EIGEN_SPARSEMATRIXBASE_PLUGIN
 #   endif
-#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }
[Diff for one file suppressed because it is too large.]
@ -36,8 +36,10 @@ typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> Negati
|
||||
|
||||
#endif // not EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
/** \returns an expression of the opposite of \c *this
|
||||
*/
|
||||
/// \returns an expression of the opposite of \c *this
|
||||
///
|
||||
EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const NegativeReturnType
|
||||
operator-() const { return NegativeReturnType(derived()); }
|
||||
@ -45,13 +47,15 @@ operator-() const { return NegativeReturnType(derived()); }
|
||||
|
||||
template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
|
||||
|
||||
/** \returns an expression of *this with the \a Scalar type casted to
|
||||
* \a NewScalar.
|
||||
*
|
||||
* The template parameter \a NewScalar is the type we are casting the scalars to.
|
||||
*
|
||||
* \sa class CwiseUnaryOp
|
||||
*/
|
||||
/// \returns an expression of \c *this with the \a Scalar type casted to
|
||||
/// \a NewScalar.
|
||||
///
|
||||
/// The template parameter \a NewScalar is the type we are casting the scalars to.
|
||||
///
|
||||
EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
|
||||
///
|
||||
/// \sa class CwiseUnaryOp
|
||||
///
|
||||
template<typename NewType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename CastXpr<NewType>::Type
|
||||
@ -60,9 +64,11 @@ cast() const
|
||||
return typename CastXpr<NewType>::Type(derived());
|
||||
}
|
||||
|
||||
/** \returns an expression of the complex conjugate of \c *this.
|
||||
*
|
||||
* \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint() */
|
||||
/// \returns an expression of the complex conjugate of \c *this.
|
||||
///
|
||||
EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
|
||||
///
|
||||
/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ConjugateReturnType
|
||||
conjugate() const
|
||||
@ -70,39 +76,45 @@ conjugate() const
|
||||
return ConjugateReturnType(derived());
|
||||
}
|
||||
|
||||
/** \returns a read-only expression of the real part of \c *this.
  *
  * \sa imag() */
/// \returns a read-only expression of the real part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(real,real part function)
///
/// \sa imag()
EIGEN_DEVICE_FUNC
inline RealReturnType
real() const { return RealReturnType(derived()); }

/** \returns a read-only expression of the imaginary part of \c *this.
  *
  * \sa real() */
/// \returns a read-only expression of the imaginary part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
///
/// \sa real()
EIGEN_DEVICE_FUNC
inline const ImagReturnType
imag() const { return ImagReturnType(derived()); }
/** \brief Apply a unary operator coefficient-wise
  * \param[in]  func  Functor implementing the unary operator
  * \tparam  CustomUnaryOp Type of \a func
  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
  *
  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
  *
  * Example:
  * \include class_CwiseUnaryOp_ptrfun.cpp
  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
  *
  * Genuine functors allow for more possibilities, for instance they may contain state.
  *
  * Example:
  * \include class_CwiseUnaryOp.cpp
  * Output: \verbinclude class_CwiseUnaryOp.out
  *
  * \sa class CwiseUnaryOp, class CwiseBinaryOp
  */
/// \brief Apply a unary operator coefficient-wise
/// \param[in]  func  Functor implementing the unary operator
/// \tparam  CustomUnaryOp Type of \a func
/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
///
/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
///
/// Example:
/// \include class_CwiseUnaryOp_ptrfun.cpp
/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
///
/// Genuine functors allow for more possibilities, for instance they may contain state.
///
/// Example:
/// \include class_CwiseUnaryOp.cpp
/// Output: \verbinclude class_CwiseUnaryOp.out
///
EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
///
/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
///
template<typename CustomUnaryOp>
EIGEN_DEVICE_FUNC
inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
@@ -111,17 +123,19 @@ unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
}
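Editor's illustration (a hedged sketch, not part of this patch): unaryExpr with a stateful functor, as the documentation above describes; CwiseClamp is a name invented for this example.

#include <Eigen/Core>
#include <iostream>

// A functor carrying state (the clamping bounds).
struct CwiseClamp {
  double lo, hi;
  CwiseClamp(double l, double h) : lo(l), hi(h) {}
  double operator()(double x) const { return x < lo ? lo : (x > hi ? hi : x); }
};

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3,3);   // entries in [-1,1]
  std::cout << m.unaryExpr(CwiseClamp(-0.5, 0.5)) << std::endl;
}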
/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
  *
  * The template parameter \a CustomUnaryOp is the type of the functor
  * of the custom unary operator.
  *
  * Example:
  * \include class_CwiseUnaryOp.cpp
  * Output: \verbinclude class_CwiseUnaryOp.out
  *
  * \sa class CwiseUnaryOp, class CwiseBinaryOp
  */
/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
///
/// The template parameter \a CustomUnaryOp is the type of the functor
/// of the custom unary operator.
///
/// Example:
/// \include class_CwiseUnaryOp.cpp
/// Output: \verbinclude class_CwiseUnaryOp.out
///
EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
///
/// \sa unaryExpr, binaryExpr, class CwiseUnaryOp
///
template<typename CustomViewOp>
EIGEN_DEVICE_FUNC
inline const CwiseUnaryView<CustomViewOp, const Derived>
@@ -130,16 +144,20 @@ unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
}
/** \returns a non-const expression of the real part of \c *this.
  *
  * \sa imag() */
/// \returns a non-const expression of the real part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(real,real part function)
///
/// \sa imag()
EIGEN_DEVICE_FUNC
inline NonConstRealReturnType
real() { return NonConstRealReturnType(derived()); }

/** \returns a non-const expression of the imaginary part of \c *this.
  *
  * \sa real() */
/// \returns a non-const expression of the imaginary part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
///
/// \sa real()
EIGEN_DEVICE_FUNC
inline NonConstImagReturnType
imag() { return NonConstImagReturnType(derived()); }
@@ -11,63 +11,75 @@

// This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
// This includes MatrixBase and SparseMatrixBase.


typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
/** \returns an expression of the coefficient-wise absolute value of \c *this
  *
  * Example: \include MatrixBase_cwiseAbs.cpp
  * Output: \verbinclude MatrixBase_cwiseAbs.out
  *
  * \sa cwiseAbs2()
  */
/// \returns an expression of the coefficient-wise absolute value of \c *this
///
/// Example: \include MatrixBase_cwiseAbs.cpp
/// Output: \verbinclude MatrixBase_cwiseAbs.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
///
/// \sa cwiseAbs2()
///
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const CwiseAbsReturnType
cwiseAbs() const { return CwiseAbsReturnType(derived()); }

/** \returns an expression of the coefficient-wise squared absolute value of \c *this
  *
  * Example: \include MatrixBase_cwiseAbs2.cpp
  * Output: \verbinclude MatrixBase_cwiseAbs2.out
  *
  * \sa cwiseAbs()
  */
/// \returns an expression of the coefficient-wise squared absolute value of \c *this
///
/// Example: \include MatrixBase_cwiseAbs2.cpp
/// Output: \verbinclude MatrixBase_cwiseAbs2.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
///
/// \sa cwiseAbs()
///
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }

/** \returns an expression of the coefficient-wise square root of *this.
  *
  * Example: \include MatrixBase_cwiseSqrt.cpp
  * Output: \verbinclude MatrixBase_cwiseSqrt.out
  *
  * \sa cwisePow(), cwiseSquare()
  */
/// \returns an expression of the coefficient-wise square root of *this.
///
/// Example: \include MatrixBase_cwiseSqrt.cpp
/// Output: \verbinclude MatrixBase_cwiseSqrt.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
///
/// \sa cwisePow(), cwiseSquare()
///
EIGEN_DEVICE_FUNC
inline const CwiseSqrtReturnType
cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }

/** \returns an expression of the coefficient-wise signum of *this.
  *
  * Example: \include MatrixBase_cwiseSign.cpp
  * Output: \verbinclude MatrixBase_cwiseSign.out
  *
  */
/// \returns an expression of the coefficient-wise signum of *this.
///
/// Example: \include MatrixBase_cwiseSign.cpp
/// Output: \verbinclude MatrixBase_cwiseSign.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
///
EIGEN_DEVICE_FUNC
inline const CwiseSignReturnType
cwiseSign() const { return CwiseSignReturnType(derived()); }


/** \returns an expression of the coefficient-wise inverse of *this.
  *
  * Example: \include MatrixBase_cwiseInverse.cpp
  * Output: \verbinclude MatrixBase_cwiseInverse.out
  *
  * \sa cwiseProduct()
  */
/// \returns an expression of the coefficient-wise inverse of *this.
///
/// Example: \include MatrixBase_cwiseInverse.cpp
/// Output: \verbinclude MatrixBase_cwiseInverse.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
///
/// \sa cwiseProduct()
///
EIGEN_DEVICE_FUNC
inline const CwiseInverseReturnType
cwiseInverse() const { return CwiseInverseReturnType(derived()); }
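Editor's illustration (a hedged sketch, not part of this patch): the cwise* members above compose lazily into a single coefficient-wise pass.

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::Matrix2d m;
  m << -1.0,  4.0,
        9.0, -16.0;
  std::cout << m.cwiseAbs().cwiseSqrt() << "\n\n";  // 1 2 / 3 4
  std::cout << m.cwiseSign()            << "\n\n";  // -1 1 / 1 -1
  std::cout << m.cwiseInverse()         << std::endl;
}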
@@ -1,10 +1,13 @@

find_package(BLAZE)
find_package(Boost)
find_package(Boost COMPONENTS system)
if (BLAZE_FOUND AND Boost_FOUND)
  include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
  btl_add_bench(btl_blaze main.cpp)
  # Note: The newest blaze version requires C++14.
  # Ideally, we should set this depending on the version of Blaze we found.
  set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
  if(BUILD_btl_blaze)
    target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
    target_link_libraries(btl_blaze ${Boost_LIBRARIES})
  endif()
endif ()
@@ -53,6 +53,33 @@ showing that the program works as expected:

This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.


\section NullaryExpr_Indexing Example 2: indexing rows and columns

The goal here is to mimic MATLAB's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked, respectively, like this:

\snippet nullary_indexing.out main1

To this end, let us first write a nullary functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):

\snippet nullary_indexing.cpp functor

Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:

\snippet nullary_indexing.cpp function

Finally, here is an example of how this function can be used:

\snippet nullary_indexing.cpp main1

This straightforward implementation is already quite powerful, as the row or column index arrays can themselves be expressions performing offsetting, modulo, striding, reversal, etc.:

\snippet nullary_indexing.cpp main2

and the output is:

\snippet nullary_indexing.out main2

*/

}
@@ -1612,7 +1612,10 @@ EXPAND_AS_DEFINED = EIGEN_MAKE_TYPEDEFS \
                         EIGEN_EMPTY \
                         EIGEN_EULER_ANGLES_TYPEDEFS \
                         EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
                         EIGEN_EULER_SYSTEM_TYPEDEF
                         EIGEN_EULER_SYSTEM_TYPEDEF \
                         EIGEN_DOC_UNARY_ADDONS \
                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF

# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
# doxygen's preprocessor will remove all references to function-like macros
@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
  )
  add_dependencies(all_examples ${example})
endforeach(example_src)

check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
if(EIGEN_COMPILER_SUPPORT_CPP11)
  ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
endif()
doc/examples/make_circulant2.cpp (new file, 52 lines)
@@ -0,0 +1,52 @@
#include <Eigen/Core>
#include <iostream>

using namespace Eigen;

// [circulant_func]
template<class ArgType>
class circulant_functor {
  const ArgType &m_vec;
public:
  circulant_functor(const ArgType& arg) : m_vec(arg) {}

  const typename ArgType::Scalar& operator() (Index row, Index col) const {
    Index index = row - col;
    if (index < 0) index += m_vec.size();
    return m_vec(index);
  }
};
// [circulant_func]

// [square]
template<class ArgType>
struct circulant_helper {
  typedef Matrix<typename ArgType::Scalar,
                 ArgType::SizeAtCompileTime,
                 ArgType::SizeAtCompileTime,
                 ColMajor,
                 ArgType::MaxSizeAtCompileTime,
                 ArgType::MaxSizeAtCompileTime> MatrixType;
};
// [square]

// [makeCirculant]
template <class ArgType>
CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
{
  typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
  return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));
}
// [makeCirculant]

// [main]
int main()
{
  Eigen::VectorXd vec(4);
  vec << 1, 2, 4, 8;
  Eigen::MatrixXd mat;
  mat = makeCirculant(vec);
  std::cout << mat << std::endl;
}
// [main]
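Editor's addition (a hedged check, not part of this patch): the circulant structure the file above produces, written out directly; entry (i,j) is vec((i - j) mod n), so each column is the previous one rotated down by one.

#include <Eigen/Core>
#include <cassert>

int main()
{
  Eigen::VectorXd vec(4);
  vec << 1, 2, 4, 8;
  Eigen::MatrixXd mat(4,4);
  for (int j = 0; j < 4; ++j)
    for (int i = 0; i < 4; ++i)
      mat(i,j) = vec(((i - j) % 4 + 4) % 4);   // wrap negative indices
  // First row is 1 8 4 2, matching what makeCirculant(vec) prints.
  assert(mat(0,0) == 1 && mat(0,1) == 8 && mat(0,2) == 4 && mat(0,3) == 2);
  return 0;
}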
doc/examples/nullary_indexing.cpp (new file, 66 lines)
@@ -0,0 +1,66 @@
#include <Eigen/Core>
#include <iostream>

using namespace Eigen;

// [functor]
template<class ArgType, class RowIndexType, class ColIndexType>
class indexing_functor {
  const ArgType &m_arg;
  const RowIndexType &m_rowIndices;
  const ColIndexType &m_colIndices;
public:
  typedef Matrix<typename ArgType::Scalar,
                 RowIndexType::SizeAtCompileTime,
                 ColIndexType::SizeAtCompileTime,
                 ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
                 RowIndexType::MaxSizeAtCompileTime,
                 ColIndexType::MaxSizeAtCompileTime> MatrixType;

  indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
    : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
  {}

  const typename ArgType::Scalar& operator() (Index row, Index col) const {
    return m_arg(m_rowIndices[row], m_colIndices[col]);
  }
};
// [functor]

// [function]
template <class ArgType, class RowIndexType, class ColIndexType>
CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
{
  typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
  typedef typename Func::MatrixType MatrixType;
  return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
}
// [function]


int main()
{
  std::cout << "[main1]\n";
  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
  Array3i ri(1,2,1);
  ArrayXi ci(6); ci << 3,2,1,0,0,2;
  Eigen::MatrixXi B = indexing(A, ri, ci);
  std::cout << "A =" << std::endl;
  std::cout << A << std::endl << std::endl;
  std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
  std::cout << B << std::endl;
  std::cout << "[main1]\n";

  std::cout << "[main2]\n";
  B = indexing(A, ri+1, ci);
  std::cout << "A(ri+1,ci) =" << std::endl;
  std::cout << B << std::endl << std::endl;
#if __cplusplus >= 201103L
  B = indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
  std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
  std::cout << B << std::endl << std::endl;
#endif
  std::cout << "[main2]\n";
}
@@ -355,7 +355,7 @@ if(CUDA_FOUND)

  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
  endif()
  if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
@@ -417,6 +417,7 @@ void cholesky_faillure_cases()
    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
    VERIFY(ldlt.info()==NumericalIssue);
  }
#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
  {
    mat.resize(3,3);
    mat << -1, -3, 3,
@@ -426,6 +427,7 @@ void cholesky_faillure_cases()
    VERIFY(ldlt.info()==NumericalIssue);
    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
  }
#endif
  {
    mat.resize(3,3);
    mat << 1, 2, 3,
@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
    VERIFY( !m.allFinite() );
    VERIFY( m.hasNaN() );
  }
  m(4) /= T(0.0);
  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
  m(4) /= hidden_zero;
  if(dryrun)
  {
    std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
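Editor's note on the hunk above (hedged): squaring the smallest normal number underflows to +0.0 at run time (assuming IEEE-754 semantics without flush-to-zero), so the division really executes instead of letting the compiler fold a literal division by zero.

#include <cassert>
#include <limits>

int main()
{
  double hidden_zero = (std::numeric_limits<double>::min)()
                     * (std::numeric_limits<double>::min)();   // underflows to +0.0
  assert(hidden_zero == 0.0);
  double inf = 1.0 / hidden_zero;    // genuine run-time division by zero
  assert(inf > (std::numeric_limits<double>::max)());
  return 0;
}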
@@ -111,6 +111,8 @@ template<typename Scalar,int Size> void homogeneous(void)

  VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
  VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );

  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
}

void test_geo_homogeneous()
@ -334,6 +334,9 @@ template<typename Scalar, int Mode, int Options> void transformations()
|
||||
t0.scale(v0);
|
||||
t1 *= AlignedScaling3(v0);
|
||||
VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
|
||||
t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
|
||||
t1 = t1 * v0.asDiagonal();
|
||||
VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
|
||||
// transformation * translation
|
||||
t0.translate(v0);
|
||||
t1 = t1 * Translation3(v0);
|
||||
@ -482,6 +485,79 @@ template<typename Scalar, int Mode, int Options> void transformations()
|
||||
Rotation2D<Scalar> r2(r1); // copy ctor
|
||||
VERIFY_IS_APPROX(r2.angle(),s0);
|
||||
}
|
||||
|
||||
{
|
||||
Transform3 t32(Matrix4::Random()), t33, t34;
|
||||
t34 = t33 = t32;
|
||||
t32.scale(v0);
|
||||
t33*=AlignedScaling3(v0);
|
||||
VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
|
||||
t33 = t34 * AlignedScaling3(v0);
|
||||
VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
|
||||
void transform_associativity_left(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
|
||||
{
|
||||
VERIFY_IS_APPROX( q*(a1*v), (q*a1)*v );
|
||||
VERIFY_IS_APPROX( q*(a2*v), (q*a2)*v );
|
||||
VERIFY_IS_APPROX( q*(p*h).hnormalized(), ((q*p)*h).hnormalized() );
|
||||
}
|
||||
|
||||
template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
|
||||
void transform_associativity2(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
|
||||
{
|
||||
VERIFY_IS_APPROX( a1*(q*v), (a1*q)*v );
|
||||
VERIFY_IS_APPROX( a2*(q*v), (a2*q)*v );
|
||||
VERIFY_IS_APPROX( p *(q*v).homogeneous(), (p *q)*v.homogeneous() );
|
||||
|
||||
transform_associativity_left(a1, a2,p, q, v, h);
|
||||
}
|
||||
|
||||
template<typename Scalar, int Dim, int Options,typename RotationType>
|
||||
void transform_associativity(const RotationType& R)
|
||||
{
|
||||
typedef Matrix<Scalar,Dim,1> VectorType;
|
||||
typedef Matrix<Scalar,Dim+1,1> HVectorType;
|
||||
typedef Matrix<Scalar,Dim,Dim> LinearType;
|
||||
typedef Matrix<Scalar,Dim+1,Dim+1> MatrixType;
|
||||
typedef Transform<Scalar,Dim,AffineCompact,Options> AffineCompactType;
|
||||
typedef Transform<Scalar,Dim,Affine,Options> AffineType;
|
||||
typedef Transform<Scalar,Dim,Projective,Options> ProjectiveType;
|
||||
typedef DiagonalMatrix<Scalar,Dim> ScalingType;
|
||||
typedef Translation<Scalar,Dim> TranslationType;
|
||||
|
||||
AffineCompactType A1c; A1c.matrix().setRandom();
|
||||
AffineCompactType A2c; A2c.matrix().setRandom();
|
||||
AffineType A1(A1c);
|
||||
AffineType A2(A2c);
|
||||
ProjectiveType P1; P1.matrix().setRandom();
|
||||
VectorType v1 = VectorType::Random();
|
||||
VectorType v2 = VectorType::Random();
|
||||
HVectorType h1 = HVectorType::Random();
|
||||
Scalar s1 = internal::random<Scalar>();
|
||||
LinearType L = LinearType::Random();
|
||||
MatrixType M = MatrixType::Random();
|
||||
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2, v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2c, v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, v1.asDiagonal(), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, ScalingType(v1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(v1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(s1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, TranslationType(v1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity_left(A1c, A1, P1, L, v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, R, v2, h1) );
|
||||
|
||||
VERIFY_IS_APPROX( A1*(M*h1), (A1*M)*h1 );
|
||||
VERIFY_IS_APPROX( A1c*(M*h1), (A1c*M)*h1 );
|
||||
VERIFY_IS_APPROX( P1*(M*h1), (P1*M)*h1 );
|
||||
|
||||
VERIFY_IS_APPROX( M*(A1*h1), (M*A1)*h1 );
|
||||
VERIFY_IS_APPROX( M*(A1c*h1), (M*A1c)*h1 );
|
||||
VERIFY_IS_APPROX( M*(P1*h1), ((M*P1)*h1) );
|
||||
}
|
||||
|
||||
template<typename Scalar> void transform_alignment()
|
||||
@ -562,5 +638,8 @@ void test_geo_transformations()
|
||||
|
||||
CALL_SUBTEST_7(( transform_products<double,3,RowMajor|AutoAlign>() ));
|
||||
CALL_SUBTEST_7(( transform_products<float,2,AutoAlign>() ));
|
||||
|
||||
CALL_SUBTEST_8(( transform_associativity<double,2,ColMajor>(Rotation2D<double>(internal::random<double>()*double(EIGEN_PI))) ));
|
||||
CALL_SUBTEST_8(( transform_associativity<double,3,ColMajor>(Quaterniond::UnitRandom()) ));
|
||||
}
|
||||
}
|
||||
|
@@ -365,6 +365,7 @@ template<typename Scalar> void packetmath_real()
  }

  if (PacketTraits::HasTanh) {
    // NOTE this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
    packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
    h.store(data2, internal::ptanh(h.load(data1)));
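Editor's sketch (not part of this patch): the scalar property the packet test above exercises, namely that tanh must propagate NaN.

#include <cassert>
#include <cmath>
#include <limits>

int main()
{
  float x = std::numeric_limits<float>::quiet_NaN();
  assert(std::isnan(std::tanh(x)));   // the vectorized ptanh is expected to match this
  return 0;
}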
@ -256,6 +256,51 @@ Index compute_block_size()
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<int>
|
||||
void bug_1308()
|
||||
{
|
||||
int n = 10;
|
||||
MatrixXd r(n,n);
|
||||
VectorXd v = VectorXd::Random(n);
|
||||
r = v * RowVectorXd::Ones(n);
|
||||
VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
|
||||
r = VectorXd::Ones(n) * v.transpose();
|
||||
VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
|
||||
|
||||
Matrix4d ones44 = Matrix4d::Ones();
|
||||
Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
|
||||
VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
|
||||
|
||||
typedef Matrix<double,4,4,RowMajor> RMatrix4d;
|
||||
RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
|
||||
VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
|
||||
|
||||
// RowVector4d r4;
|
||||
m44.setOnes();
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
|
||||
}
|
||||
|
||||
void test_product_extra()
|
||||
{
|
||||
for(int i = 0; i < g_repeat; i++) {
|
||||
@ -268,8 +313,10 @@ void test_product_extra()
|
||||
}
|
||||
CALL_SUBTEST_5( bug_127<0>() );
|
||||
CALL_SUBTEST_5( bug_817<0>() );
|
||||
CALL_SUBTEST_5( bug_1308<0>() );
|
||||
CALL_SUBTEST_6( unaligned_objects<0>() );
|
||||
CALL_SUBTEST_7( compute_block_size<float>() );
|
||||
CALL_SUBTEST_7( compute_block_size<double>() );
|
||||
CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
|
||||
|
||||
}
|
||||
|
@@ -12,6 +12,7 @@
#include <Eigen/LU>

// regression test for bug 447
template<int>
void product1x1()
{
  Matrix<float,1,3> matAstatic;
@@ -209,15 +210,34 @@ void test_linear_but_not_vectorizable()
  }
}

template<int Rows>
void bug_1311()
{
  Matrix< double, Rows, 2 > A;  A.setRandom();
  Vector2d b = Vector2d::Random();
  Matrix<double,Rows,1> res;
  res.noalias() = 1. * (A * b);
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = 1.*A * b;
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = (1.*A).lazyProduct(b);
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = (1.*A).lazyProduct(1.*b);
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = (A).lazyProduct(1.*b);
  VERIFY_IS_APPROX(res, A*b);
}

void test_product_small()
{
  for(int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
    CALL_SUBTEST_3( product(Matrix3d()) );
    CALL_SUBTEST_4( product(Matrix4d()) );
    CALL_SUBTEST_5( product(Matrix4f()) );
    CALL_SUBTEST_6( product1x1() );
    CALL_SUBTEST_6( product1x1<0>() );

    CALL_SUBTEST_11( test_lazy_l1<float>() );
    CALL_SUBTEST_12( test_lazy_l2<float>() );
@@ -238,6 +258,9 @@ void test_product_small()
    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));

    CALL_SUBTEST_6( bug_1311<3>() );
    CALL_SUBTEST_6( bug_1311<5>() );
  }

#ifdef EIGEN_TEST_PART_6
@@ -7,6 +7,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

template<typename T>
Array<T,4,1> four_denorms();

template<>
Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
template<>
Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
template<typename T>
Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }

template<typename MatrixType>
void svd_fill_random(MatrixType &m, int Option = 0)
{
@@ -55,7 +65,8 @@ void svd_fill_random(MatrixType &m, int Option = 0)
  }

  Matrix<Scalar,Dynamic,1> samples(9);
  samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
  samples << 0, four_denorms<RealScalar>(),
             -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);

  if(Option==Symmetric)
  {
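Editor's sketch (not part of this patch): the constants packaged by four_denorms() above are subnormal values; 4.94e-324 parses to the smallest positive denormal double.

#include <cassert>
#include <cmath>
#include <limits>

int main()
{
  double d = 4.94e-324;
  assert(d > 0.0);
  assert(d < (std::numeric_limits<double>::min)());   // below the smallest normal
  assert(std::fpclassify(d) == FP_SUBNORMAL);
  return 0;
}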
@@ -61,8 +61,9 @@ typedef unsigned __int64 uint64_t;
#ifdef EIGEN_USE_GPU
#include <iostream>
#include <cuda_runtime.h>
#if defined(__CUDACC__)
#include <curand_kernel.h>
#if __cplusplus >= 201103L
#include <atomic>
#include <unistd.h>
#endif
#endif

@@ -81,6 +82,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorDimensions.h"
#include "src/Tensor/TensorInitializer.h"
#include "src/Tensor/TensorTraits.h"
#include "src/Tensor/TensorRandom.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
#include "src/Tensor/TensorGlobalFunctions.h"
@@ -51,12 +51,15 @@ class TensorOpCost {
                  internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }

  EIGEN_DEVICE_FUNC
  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
@@ -42,7 +42,21 @@ static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
  if (!m_devicePropInitialized) {
    // Attempts to ensure proper behavior in the case of multiple threads
    // calling this function simultaneously. This would be trivial to
    // implement if we could use std::mutex, but unfortunately mutexes don't
    // compile with nvcc, so we resort to atomics and thread fences instead.
    // Note that if the caller uses a compiler that doesn't support C++11 we
    // can't ensure that the initialization is thread safe.
#if __cplusplus >= 201103L
    static std::atomic<bool> first(true);
    if (first.exchange(false)) {
#else
    static bool first = true;
    if (first) {
      first = false;
#endif
      // We're the first thread to reach this point.
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
@@ -63,7 +77,19 @@ static void initializeDeviceProp() {
        assert(status == cudaSuccess);
      }
    }

#if __cplusplus >= 201103L
      std::atomic_thread_fence(std::memory_order_release);
#endif
      m_devicePropInitialized = true;
    } else {
      // Wait for the other thread to initialize the properties.
      while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L
        std::atomic_thread_fence(std::memory_order_acquire);
#endif
        sleep(1);
      }
    }
  }
}
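Editor's contrast (a hedged sketch, not part of this patch): in host-only C++11 code the same one-shot initialization is usually written with std::call_once; the hand-rolled atomics above exist because, per the comment in the hunk, std::mutex did not compile under nvcc at the time.

#include <mutex>

static std::once_flag device_prop_once;   // hypothetical host-only variant

void initializeDevicePropHostOnly() {
  std::call_once(device_prop_once, [] {
    // query and cache the device properties here, exactly once
  });
}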
@ -168,39 +194,20 @@ struct GpuDevice {
|
||||
return stream_->stream();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
return stream_->allocate(num_bytes);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
stream_->deallocate(buffer);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void* scratchpad() const {
|
||||
return stream_->scratchpad();
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE unsigned int* semaphore() const {
|
||||
return stream_->semaphore();
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
@ -210,30 +217,22 @@ struct GpuDevice {
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
@ -242,21 +241,21 @@ struct GpuDevice {
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
// FIXME
|
||||
return 32;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
// FIXME
|
||||
return 48*1024;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
// We won't try to take advantage of the L2 cache for the time being, and
// there is no L3 cache on CUDA devices.
|
||||
return firstLevelCacheSize();
|
||||
@ -276,56 +275,26 @@ struct GpuDevice {
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
|
||||
return stream_->deviceProperties().multiProcessorCount;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
|
||||
return stream_->deviceProperties().maxThreadsPerBlock;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
|
||||
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
return stream_->deviceProperties().sharedMemPerBlock;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
return stream_->deviceProperties().major;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int minorDeviceVersion() const {
|
||||
return stream_->deviceProperties().minor;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
return max_blocks_;
|
||||
}
|
||||
|
||||
|
@@ -234,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator memcopied_eval, Index size) {
EigenMetaKernel(Evaluator eval, Index size) {

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}

@@ -99,7 +99,8 @@ template <typename T> struct SumReducer
  static const bool IsStateful = false;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
    (*accum) += t;
    internal::scalar_sum_op<T> sum_op;
    *accum = sum_op(*accum, t);
  }
  template <typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -145,7 +146,8 @@ template <typename T> struct MeanReducer
  MeanReducer() : scalarCount_(0), packetCount_(0) { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
    (*accum) += t;
    internal::scalar_sum_op<T> sum_op;
    *accum = sum_op(*accum, t);
    scalarCount_++;
  }
  template <typename Packet>
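Editor's note on the hunk above (hedged): routing the accumulation through scalar_sum_op instead of operator+= gives a single customization point; a scalar type whose += is unavailable or wrong on the device only needs a functor specialization. A generic sketch of the pattern, with invented names (sum_functor and reduce_step are illustrative, not Eigen's definitions):

template <typename T>
struct sum_functor {
  T operator()(const T& a, const T& b) const { return a + b; }
};

template <typename T, typename Op>
void reduce_step(const T& t, T* accum, Op op) {
  *accum = op(*accum, t);   // specializing sum_functor<T> changes every caller at once
}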
@ -190,25 +192,25 @@ struct reducer_traits<MeanReducer<T>, Device> {
|
||||
|
||||
template <typename T, bool IsMax = true, bool IsInteger = true>
|
||||
struct MinMaxBottomValue {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::lowest();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, true, false> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return -Eigen::NumTraits<T>::infinity();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, false, true> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::highest();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, false, false> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::infinity();
|
||||
}
|
||||
};
|
||||
@ -439,448 +441,6 @@ struct reducer_traits<ArgMinTupleReducer<T>, Device> {
|
||||
};
|
||||
|
||||
|
||||
// Random number generation
|
||||
namespace {
|
||||
#ifdef __CUDA_ARCH__
|
||||
__device__ int get_random_seed() {
|
||||
return clock();
|
||||
}
|
||||
#else
|
||||
static inline int get_random_seed() {
|
||||
#ifdef _WIN32
|
||||
SYSTEMTIME st;
|
||||
GetSystemTime(&st);
|
||||
return st.wSecond + 1000 * st.wMilliseconds;
|
||||
#elif defined __APPLE__
|
||||
return static_cast<int>(mach_absolute_time());
|
||||
#else
|
||||
timespec ts;
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
return static_cast<int>(ts.tv_nsec);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
|
||||
// We're not compiling a cuda kernel
|
||||
template <typename T> class UniformRandomGenerator {
|
||||
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
if (!deterministic) {
|
||||
srand(get_random_seed());
|
||||
}
|
||||
}
|
||||
UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
}
|
||||
|
||||
T operator()() const {
|
||||
return random<T>();
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX T values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
values[i] = random<T>();
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
};
|
||||
|
||||
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
|
||||
template <> class UniformRandomGenerator<float> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
|
||||
if (!deterministic) {
|
||||
m_generator->seed(get_random_seed());
|
||||
}
|
||||
}
|
||||
UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
|
||||
m_generator = new std::mt19937();
|
||||
m_generator->seed(other() * UINT_MAX);
|
||||
m_deterministic = other.m_deterministic;
|
||||
}
|
||||
~UniformRandomGenerator() {
|
||||
delete m_generator;
|
||||
}
|
||||
|
||||
float operator()() const {
|
||||
return m_distribution(*m_generator);
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX float values[packetSize];
|
||||
for (int k = 0; k < packetSize; ++k) {
|
||||
values[k] = this->operator()();
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
|
||||
// Make sure m_deterministic comes first to match the layout of the cpu
|
||||
// version of the code.
|
||||
bool m_deterministic;
|
||||
std::mt19937* m_generator;
|
||||
mutable std::uniform_real_distribution<float> m_distribution;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<double> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
|
||||
if (!deterministic) {
|
||||
m_generator->seed(get_random_seed());
|
||||
}
|
||||
}
|
||||
UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
|
||||
m_generator = new std::mt19937();
|
||||
m_generator->seed(other() * UINT_MAX);
|
||||
m_deterministic = other.m_deterministic;
|
||||
}
|
||||
~UniformRandomGenerator() {
|
||||
delete m_generator;
|
||||
}
|
||||
|
||||
double operator()() const {
|
||||
return m_distribution(*m_generator);
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX double values[packetSize];
|
||||
for (int k = 0; k < packetSize; ++k) {
|
||||
values[k] = this->operator()();
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
|
||||
// Make sure m_deterministic comes first to match the layout of the cpu
|
||||
// version of the code.
|
||||
bool m_deterministic;
|
||||
std::mt19937* m_generator;
|
||||
mutable std::uniform_real_distribution<double> m_distribution;
|
||||
};
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
// We're compiling a cuda kernel
|
||||
template <typename T> class UniformRandomGenerator;
|
||||
|
||||
template <> class UniformRandomGenerator<float> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
|
||||
__device__ float operator()() const {
|
||||
return curand_uniform(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ float4 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_uniform4(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<double> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ double operator()() const {
|
||||
return curand_uniform_double(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ double2 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_uniform2_double(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<std::complex<float> > {
|
||||
public:
|
||||
static const bool PacketAccess = false;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ std::complex<float> operator()() const {
|
||||
float4 vals = curand_uniform4(&m_state);
|
||||
return std::complex<float>(vals.x, vals.y);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<std::complex<double> > {
|
||||
public:
|
||||
static const bool PacketAccess = false;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ std::complex<double> operator()() const {
|
||||
double2 vals = curand_uniform2_double(&m_state);
|
||||
return std::complex<double>(vals.x, vals.y);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
template <typename Scalar>
|
||||
struct functor_traits<UniformRandomGenerator<Scalar> > {
|
||||
enum {
|
||||
// Rough estimate.
|
||||
Cost = 100 * NumTraits<Scalar>::MulCost,
|
||||
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
|
||||
// We're not compiling a cuda kernel
|
||||
template <typename T> class NormalRandomGenerator {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
|
||||
if (!deterministic) {
|
||||
m_generator->seed(get_random_seed());
|
||||
}
|
||||
}
|
||||
NormalRandomGenerator(const NormalRandomGenerator& other)
|
||||
: m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
|
||||
m_generator->seed(other() * UINT_MAX);
|
||||
}
|
||||
~NormalRandomGenerator() {
|
||||
delete m_generator;
|
||||
}
|
||||
T operator()() const {
|
||||
return m_distribution(*m_generator);
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX T values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
values[i] = m_distribution(*m_generator);
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
// No assignment
|
||||
NormalRandomGenerator& operator = (const NormalRandomGenerator&);
|
||||
|
||||
bool m_deterministic;
|
||||
mutable std::normal_distribution<T> m_distribution;
|
||||
std::mt19937* m_generator;
|
||||
};
|
||||
|
||||
#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
|
||||
|
||||
// We're compiling a cuda kernel
|
||||
template <typename T> class NormalRandomGenerator;
|
||||
|
||||
template <> class NormalRandomGenerator<float> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ float operator()() const {
|
||||
return curand_normal(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ float4 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_normal4(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class NormalRandomGenerator<double> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ double operator()() const {
|
||||
return curand_normal_double(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ double2 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_normal2_double(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class NormalRandomGenerator<std::complex<float> > {
 public:
  static const bool PacketAccess = false;

  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
    m_deterministic = other.m_deterministic;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = m_deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ std::complex<float> operator()() const {
    float4 vals = curand_normal4(&m_state);
    return std::complex<float>(vals.x, vals.y);
  }

 private:
  bool m_deterministic;
  mutable curandStatePhilox4_32_10_t m_state;
};

template <> class NormalRandomGenerator<std::complex<double> > {
 public:
  static const bool PacketAccess = false;

  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
    m_deterministic = other.m_deterministic;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = m_deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ std::complex<double> operator()() const {
    double2 vals = curand_normal2_double(&m_state);
    return std::complex<double>(vals.x, vals.y);
  }

 private:
  bool m_deterministic;
  mutable curandStatePhilox4_32_10_t m_state;
};

#else

template <typename T> class NormalRandomGenerator {
 public:
  static const bool PacketAccess = false;
  NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}

 private:
  bool m_deterministic;
};

#endif

template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // Rough estimate.
    Cost = 100 * NumTraits<Scalar>::MulCost,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};

template <typename T, typename Index, size_t NumDims>
class GaussianGenerator {
 public:
@@ -895,7 +455,7 @@ class GaussianGenerator {
    }
  }

  T operator()(const array<Index, NumDims>& coordinates) const {
  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
    T tmp = T(0);
    for (size_t i = 0; i < NumDims; ++i) {
      T offset = coordinates[i] - m_means[i];
unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h (new file, 276 lines)
@@ -0,0 +1,276 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

namespace Eigen {
namespace internal {

namespace {

EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#ifdef __CUDA_ARCH__
  // We don't support 3d kernels since we currently only use 1 and
  // 2d kernels.
  assert(threadIdx.z == 0);
  return clock64() +
      blockIdx.x * blockDim.x + threadIdx.x +
      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);

#elif defined _WIN32
  // Use the current time as a baseline.
  SYSTEMTIME st;
  GetSystemTime(&st);
  int time = st.wSecond + 1000 * st.wMilliseconds;
  // Mix in a random number to make sure that we get different seeds if
  // we try to generate seeds faster than the clock resolution.
  // We need 2 random values since the generator only generates 16 bits at
  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
  uint rnd1 = ::rand();
  uint rnd2 = ::rand();
  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
  return rnd;

#elif defined __APPLE__
  // Same approach as for win32, except that the random number generator
  // is better (https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
  uint64_t rnd = ::random() ^ mach_absolute_time();
  return rnd;

#else
  // Augment the current time with pseudo random number generation
  // to ensure that we get different seeds if we try to generate seeds
  // faster than the clock resolution.
  timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  uint64_t rnd = ::random() ^ ts.tv_nsec;
  return rnd;
#endif
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
  // TODO: Unify with the implementation in the non blocking thread pool.
  uint64_t current = *state;
  // Update the internal state
  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
  // Generate the random output (using the PCG-XSH-RS scheme)
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
  seed = seed ? seed : get_random_seed();
  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}
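
The two helpers above are the whole generator: `PCG_XSH_RS_state` maps a seed onto a 64-bit linear congruential state, and `PCG_XSH_RS_generator` advances that state while permuting the previous state into a 32-bit output (an xorshift of the high bits followed by a shift whose size is picked from the top three bits of the state). A minimal standalone sketch of that step, with hypothetical names and not part of this patch:

```cpp
// Hypothetical demo of the PCG-XSH-RS step used above (not Eigen code).
#include <cstdint>
#include <cstdio>

static uint64_t lcg_advance(uint64_t state) {
  // Same multiplier/increment as PCG_XSH_RS_generator.
  return state * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}

static unsigned pcg_xsh_rs_output(uint64_t current) {
  // XSH: xor the high bits down; RS: random shift taken from the top 3 bits.
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

int main() {
  uint64_t state = 42;  // any fixed seed gives a reproducible stream
  for (int i = 0; i < 4; ++i) {
    uint64_t current = state;   // output is derived from the *previous* state
    state = lcg_advance(state);
    std::printf("%u\n", pcg_xsh_rs_output(current));
  }
  return 0;
}
```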

} // namespace

template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeUniform(uint64_t* state) {
  unsigned rnd = PCG_XSH_RS_generator(state);
  return static_cast<T>(rnd);
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
  Eigen::half result;
  // Generate 10 random bits for the mantissa
  unsigned rnd = PCG_XSH_RS_generator(state);
  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
  // Set the exponent
  result.x |= (static_cast<uint16_t>(15) << 10);
  // Return the final result
  return result - Eigen::half(1.0f);
}
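
The bit trick used here (and in the float and double overloads below) exploits the IEEE layout: forcing the exponent field to the bias while randomizing the mantissa yields a value in [1, 2), so subtracting one gives a uniform draw in [0, 1). For half precision (5 exponent bits, 10 mantissa bits, bias 15), in LaTeX notation:

```latex
x = \left(1 + \frac{m}{2^{10}}\right) 2^{\,e-15},
\qquad e = 15 \;\Rightarrow\; x = 1 + \frac{m}{1024} \in [1, 2),
\qquad x - 1 \in [0, 1).
```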

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float RandomToTypeUniform<float>(uint64_t* state) {
  typedef union {
    uint32_t raw;
    float fp;
  } internal;
  internal result;
  // Generate 23 random bits for the mantissa
  const unsigned rnd = PCG_XSH_RS_generator(state);
  result.raw = rnd & 0x7fffffu;
  // Set the exponent
  result.raw |= (static_cast<uint32_t>(127) << 23);
  // Return the final result
  return result.fp - 1.0f;
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double RandomToTypeUniform<double>(uint64_t* state) {
  typedef union {
    uint64_t raw;
    double dp;
  } internal;
  internal result;
  result.raw = 0;
  // Generate 52 random bits for the mantissa
  // First generate the upper 20 bits
  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
  // Then generate the lower 32 bits
  unsigned rnd2 = PCG_XSH_RS_generator(state);
  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
  // Set the exponent
  result.raw |= (static_cast<uint64_t>(1023) << 52);
  // Return the final result
  return result.dp - 1.0;
}
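
For doubles the trick needs two 32-bit draws to fill the 52-bit mantissa. A hypothetical host-side check of the mapping (not part of the patch):

```cpp
// Standalone check: forcing the exponent field to the bias (1023) puts the
// assembled double in [1, 2), so the maximal mantissa maps just below 1.0.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint64_t mantissa = 0xfffffffffffffULL;  // all 52 mantissa bits set
  uint64_t raw = mantissa | (static_cast<uint64_t>(1023) << 52);
  double d;
  std::memcpy(&d, &raw, sizeof(d));  // well-defined type pun
  std::printf("%.17g\n", d - 1.0);   // prints a value just below 1.0
  return 0;
}
```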

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
  return std::complex<float>(RandomToTypeUniform<float>(state),
                             RandomToTypeUniform<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
  return std::complex<double>(RandomToTypeUniform<double>(state),
                              RandomToTypeUniform<double>(state));
}

template <typename T> class UniformRandomGenerator {
 public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      const UniformRandomGenerator& other) {
    m_state = other.m_state;
  }

  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    uint64_t local_state = m_state + i;
    T result = RandomToTypeUniform<T>(&local_state);
    m_state = local_state;
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
    uint64_t local_state = m_state + i;
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeUniform<T>(&local_state);
    }
    m_state = local_state;
    return internal::pload<Packet>(values);
  }

 private:
  mutable uint64_t m_state;
};
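
The generator is an ordinary nullary functor, so it plugs straight into the tensor expression machinery. A usage sketch, assuming the Tensor module's templated `setRandom` hook (the `NormalRandomGenerator` defined further down works the same way):

```cpp
// Sketch only: fill a tensor through UniformRandomGenerator::operator().
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(64, 64);
  // Each coefficient (or packet, via packetOp) is drawn from the generator.
  t.setRandom<Eigen::internal::UniformRandomGenerator<float> >();
  return 0;
}
```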

template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
  enum {
    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
    Cost = 12 * NumTraits<Scalar>::AddCost *
           ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
  };
};

template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeNormal(uint64_t* state) {
  // Use the ratio-of-uniforms method to generate numbers following a normal
  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
  // details.
  T u, v, q;
  do {
    u = RandomToTypeUniform<T>(state);
    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
    const T x = u - T(0.449871);
    const T y = numext::abs(v) + T(0.386595);
    q = x*x + y * (T(0.196)*y - T(0.25472)*x);
  } while (q > T(0.27597) &&
           (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));

  return v/u;
}
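
The loop is the ratio-of-uniforms method with Leva's quadratic quick accept/reject bounds (Numerical Recipes 7.3.9): draw (u, v) with u uniform on [0, 1) and v uniform on roughly [-0.858, 0.858], accept v/u when the point falls inside the acceptance region, and only evaluate the exact log test near the boundary. A hypothetical standalone sanity check (not part of the patch) that the accepted samples have mean near 0 and variance near 1:

```cpp
// Sanity-check sketch of the ratio-of-uniforms loop, using std::mt19937_64
// in place of the PCG helpers; same constants as RandomToTypeNormal.
#include <cmath>
#include <cstdio>
#include <random>

int main() {
  std::mt19937_64 rng(0);
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  double sum = 0.0, sumsq = 0.0;
  const int n = 1000000;
  for (int i = 0; i < n; ++i) {
    double u, v, q;
    do {
      u = unif(rng);
      v = 1.7156 * (unif(rng) - 0.5);
      const double x = u - 0.449871;
      const double y = std::abs(v) + 0.386595;
      q = x * x + y * (0.196 * y - 0.25472 * x);
    } while (q > 0.27597 &&
             (q > 0.27846 || v * v > -4.0 * std::log(u) * u * u));
    const double s = v / u;  // accepted ratio is a standard normal sample
    sum += s;
    sumsq += s * s;
  }
  const double mean = sum / n;
  std::printf("mean=%g var=%g\n", mean, sumsq / n - mean * mean);
  return 0;
}
```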

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
  return std::complex<float>(RandomToTypeNormal<float>(state),
                             RandomToTypeNormal<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
  return std::complex<double>(RandomToTypeNormal<double>(state),
                              RandomToTypeNormal<double>(state));
}

template <typename T> class NormalRandomGenerator {
 public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
      const NormalRandomGenerator& other) {
    m_state = other.m_state;
  }

  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    uint64_t local_state = m_state + i;
    T result = RandomToTypeNormal<T>(&local_state);
    m_state = local_state;
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
    uint64_t local_state = m_state + i;
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeNormal<T>(&local_state);
    }
    m_state = local_state;
    return internal::pload<Packet>(values);
  }

 private:
  mutable uint64_t m_state;
};

template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // On average, we need to generate about 3 random numbers
    // 15 muls, 8 adds, 1.5 logs
    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
           15 * NumTraits<Scalar>::MulCost + 8 * NumTraits<Scalar>::AddCost +
           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

@@ -20,37 +20,60 @@ public:
  AutoDiffJacobian(const Functor& f) : Functor(f) {}

  // forward constructors
#if EIGEN_HAS_VARIADIC_TEMPLATES
  template<typename... T>
  AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
#else
  template<typename T0>
  AutoDiffJacobian(const T0& a0) : Functor(a0) {}
  template<typename T0, typename T1>
  AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
  template<typename T0, typename T1, typename T2>
  AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}

  enum {
    InputsAtCompileTime = Functor::InputsAtCompileTime,
    ValuesAtCompileTime = Functor::ValuesAtCompileTime
  };
#endif

  typedef typename Functor::InputType InputType;
  typedef typename Functor::ValueType ValueType;
  typedef typename Functor::JacobianType JacobianType;
  typedef typename JacobianType::Scalar Scalar;
  typedef typename ValueType::Scalar Scalar;

  enum {
    InputsAtCompileTime = InputType::RowsAtCompileTime,
    ValuesAtCompileTime = ValueType::RowsAtCompileTime
  };

  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
  typedef typename JacobianType::Index Index;

  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
  typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
  typedef AutoDiffScalar<DerivativeType> ActiveScalar;

  typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
  typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;

#if EIGEN_HAS_VARIADIC_TEMPLATES
  // Some compilers don't accept variadic parameters after a default parameter,
  // i.e., we can't just write _jac=0 but we need to overload operator():
  EIGEN_STRONG_INLINE
  void operator() (const InputType& x, ValueType* v) const
  {
    this->operator()(x, v, 0);
  }
  template<typename... ParamsType>
  void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
                   const ParamsType&... Params) const
#else
  void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
#endif
  {
    eigen_assert(v!=0);

    if (!_jac)
    {
#if EIGEN_HAS_VARIADIC_TEMPLATES
      Functor::operator()(x, v, Params...);
#else
      Functor::operator()(x, v);
#endif
      return;
    }

@@ -61,12 +84,16 @@ public:

    if(InputsAtCompileTime==Dynamic)
      for (Index j=0; j<jac.rows(); j++)
        av[j].derivatives().resize(this->inputs());
        av[j].derivatives().resize(x.rows());

    for (Index i=0; i<jac.cols(); i++)
      ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
      ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);

#if EIGEN_HAS_VARIADIC_TEMPLATES
    Functor::operator()(ax, &av, Params...);
#else
    Functor::operator()(ax, &av);
#endif

    for (Index i=0; i<jac.rows(); i++)
    {
@@ -74,8 +101,6 @@ public:
      jac.row(i) = av[i].derivatives();
    }
  }
protected:

};

}
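
The net effect of this hunk: AutoDiffJacobian now derives Scalar, the compile-time sizes, and JacobianType from the functor's InputType/ValueType instead of requiring the functor to provide them, sizes dynamic inputs from x.rows() rather than a this->inputs() method, and forwards extra runtime parameters when variadic templates are available. A sketch of the resulting interface with a hypothetical two-output functor (not from the patch; the committed integratorFunctor test below exercises the same API):

```cpp
// Sketch of the reworked AutoDiffJacobian interface (hypothetical functor).
#include <unsupported/Eigen/AutoDiff>
#include <iostream>

struct Squares {
  // Only InputType/ValueType are declared; JacobianType is now derived.
  typedef Eigen::Matrix<double, 2, 1> InputType;
  typedef Eigen::Matrix<double, 2, 1> ValueType;

  template <typename T1, typename T2>
  void operator()(const T1& x, T2* out) const {
    (*out)[0] = x[0] * x[0];  // y0 = x0^2
    (*out)[1] = x[0] * x[1];  // y1 = x0 * x1
  }
};

int main() {
  Squares f;
  Eigen::AutoDiffJacobian<Squares> adjac(f);
  Squares::InputType x(3.0, 4.0);
  Squares::ValueType y;
  Eigen::AutoDiffJacobian<Squares>::JacobianType J;  // 2x2, deduced
  adjac(x, &y, &J);
  std::cout << J << std::endl;  // rows: [2*x0, 0] = [6, 0]; [x1, x0] = [4, 3]
  return 0;
}
```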
@@ -189,7 +189,12 @@ namespace Eigen
      res[0] = atan2(mat(J,K), mat(K,K));
      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
        if(res[0] > Scalar(0)) {
          res[0] -= Scalar(EIGEN_PI);
        }
        else {
          res[0] += Scalar(EIGEN_PI);
        }
        res[1] = atan2(-mat(I,K), -c2);
      }
      else
@@ -212,7 +217,12 @@ namespace Eigen
      res[0] = atan2(mat(J,I), mat(K,I));
      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
      {
        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
        if(res[0] > Scalar(0)) {
          res[0] -= Scalar(EIGEN_PI);
        }
        else {
          res[0] += Scalar(EIGEN_PI);
        }
        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
        res[1] = -atan2(s2, mat(I,I));
      }
@@ -203,7 +203,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})

  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
  endif()
  if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
@@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")

  ei_add_test(cxx11_tensor_complex_cuda)
  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
  ei_add_test(cxx11_tensor_reduction_cuda)
  ei_add_test(cxx11_tensor_argmax_cuda)
  ei_add_test(cxx11_tensor_cast_float16_cuda)
@@ -105,6 +105,89 @@ struct TestFunc1
  }
};


#if EIGEN_HAS_VARIADIC_TEMPLATES
/* Test functor for the C++11 features. */
template <typename Scalar>
struct integratorFunctor
{
  typedef Matrix<Scalar, 2, 1> InputType;
  typedef Matrix<Scalar, 2, 1> ValueType;

  /*
   * Implementation starts here.
   */
  integratorFunctor(const Scalar gain) : _gain(gain) {}
  integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
  const Scalar _gain;

  template <typename T1, typename T2>
  void operator() (const T1 &input, T2 *output, const Scalar dt) const
  {
    T2 &o = *output;

    /* Integrator to test the AD. */
    o[0] = input[0] + input[1] * dt * _gain;
    o[1] = input[1] * _gain;
  }

  /* Only needed for the test */
  template <typename T1, typename T2, typename T3>
  void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
  {
    T2 &o = *output;

    /* Integrator to test the AD. */
    o[0] = input[0] + input[1] * dt * _gain;
    o[1] = input[1] * _gain;

    if (jacobian)
    {
      T3 &j = *jacobian;

      j(0, 0) = 1;
      j(0, 1) = dt * _gain;
      j(1, 0) = 0;
      j(1, 1) = _gain;
    }
  }
};

template<typename Func> void forward_jacobian_cpp11(const Func& f)
{
  typedef typename Func::ValueType::Scalar Scalar;
  typedef typename Func::ValueType ValueType;
  typedef typename Func::InputType InputType;
  typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;

  InputType x = InputType::Random(InputType::RowsAtCompileTime);
  ValueType y, yref;
  JacobianType j, jref;

  const Scalar dt = internal::random<double>();

  jref.setZero();
  yref.setZero();
  f(x, &yref, &jref, dt);

  //std::cerr << "y, yref, jref: " << "\n";
  //std::cerr << y.transpose() << "\n\n";
  //std::cerr << yref << "\n\n";
  //std::cerr << jref << "\n\n";

  AutoDiffJacobian<Func> autoj(f);
  autoj(x, &y, &j, dt);

  //std::cerr << "y j (via autodiff): " << "\n";
  //std::cerr << y.transpose() << "\n\n";
  //std::cerr << j << "\n\n";

  VERIFY_IS_APPROX(y, yref);
  VERIFY_IS_APPROX(j, jref);
}
#endif

template<typename Func> void forward_jacobian(const Func& f)
{
  typename Func::InputType x = Func::InputType::Random(f.inputs());
@@ -128,7 +211,6 @@ template<typename Func> void forward_jacobian(const Func& f)
  VERIFY_IS_APPROX(j, jref);
}


// TODO also check actual derivatives!
template <int>
void test_autodiff_scalar()
@@ -141,6 +223,7 @@ void test_autodiff_scalar()
  VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
}


// TODO also check actual derivatives!
template <int>
void test_autodiff_vector()
@@ -151,7 +234,7 @@ void test_autodiff_vector()
  VectorAD ap = p.cast<AD>();
  ap.x().derivatives() = Vector2f::UnitX();
  ap.y().derivatives() = Vector2f::UnitY();


  AD res = foo<VectorAD>(ap);
  VERIFY_IS_APPROX(res.value(), foo(p));
}
@@ -164,6 +247,9 @@ void test_autodiff_jacobian()
  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
  CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
#if EIGEN_HAS_VARIADIC_TEMPLATES
  CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
#endif
}

@@ -71,8 +71,45 @@ void test_cuda_nullary() {
}


static void test_cuda_sum_reductions() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  Tensor<std::complex<float>, 0> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}


void test_cxx11_tensor_complex()
{
  CALL_SUBTEST(test_cuda_nullary());
  CALL_SUBTEST(test_cuda_sum_reductions());
}

unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu (new file, 97 lines)
@@ -0,0 +1,97 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

template<typename T>
void test_cuda_complex_cwise_ops() {
  const int kNumItems = 2;
  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);

  std::complex<T>* d_in1;
  std::complex<T>* d_in2;
  std::complex<T>* d_out;
  cudaMalloc((void**)(&d_in1), complex_bytes);
  cudaMalloc((void**)(&d_in2), complex_bytes);
  cudaMalloc((void**)(&d_out), complex_bytes);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, kNumItems);
  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, kNumItems);
  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
      d_out, kNumItems);

  const std::complex<T> a(3.14f, 2.7f);
  const std::complex<T> b(-10.6f, 1.4f);

  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
  gpu_in2.device(gpu_device) = gpu_in2.constant(b);

  enum CwiseOp {
    Add = 0,
    Sub,
    Mul,
    Div
  };

  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
  for (int op = Add; op <= Div; op++) {
    std::complex<T> expected;
    switch (static_cast<CwiseOp>(op)) {
      case Add:
        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
        expected = a + b;
        break;
      case Sub:
        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
        expected = a - b;
        break;
      case Mul:
        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
        expected = a * b;
        break;
      case Div:
        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
        expected = a / b;
        break;
    }
    assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
                           gpu_device.stream()) == cudaSuccess);
    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

    for (int i = 0; i < kNumItems; ++i) {
      VERIFY_IS_APPROX(actual(i), expected);
    }
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
}


void test_cxx11_tensor_complex_cwise_ops()
{
  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
}