Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-09-12 09:23:12 +08:00)

Commit df7644aec3: Merged eigen/eigen into default
@@ -22,6 +22,7 @@
 #define EIGEN_CUDA_ARCH __CUDA_ARCH__
 #endif
 
+// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
 #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
 #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
 #elif defined(__CUDACC_VER__)
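For context, the EIGEN_CUDACC_VER value defined above packs the nvcc version as major*10000 + minor*100, mirroring the retired __CUDACC_VER__ encoding. A minimal standalone sketch of that arithmetic (the helper name cudacc_ver is illustrative only, not part of Eigen):

    // Illustration of the composite version encoding used by the macro above.
    constexpr int cudacc_ver(int major, int minor) { return major * 10000 + minor * 100; }
    static_assert(cudacc_ver(9, 1) == 90100, "nvcc 9.1 maps to 90100, so it passes >= 90000 checks");
    static_assert(cudacc_ver(8, 0) == 80000, "nvcc 8.0 maps to 80000");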
@@ -27,7 +27,7 @@ void qFree(void *ptr)
 void *qRealloc(void *ptr, std::size_t size)
 {
   void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
   Eigen::internal::aligned_free(ptr);
   return newPtr;
 }
@@ -396,6 +396,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
  */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 inline const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
@@ -114,7 +114,7 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
     inline Index outerStride() const
     {
       return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? internal::traits<Map>::OuterStrideAtCompileTime
+           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
            : IsVectorAtCompileTime ? (this->size() * innerStride())
            : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
            : (this->rows() * innerStride());
@@ -99,7 +99,7 @@ class NoAlias
   * \sa class NoAlias
   */
 template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
 {
   return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
@@ -50,38 +50,45 @@ struct half;
 namespace half_impl {
 
 #if !defined(EIGEN_HAS_CUDA_FP16)
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
-  EIGEN_DEVICE_FUNC __half() : x(0) {}
-  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
   unsigned short x;
 };
+#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
 #endif
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
 
-struct half_base : public __half {
+struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC half_base() {}
-  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
 };
 
 } // namespace half_impl
 
 // Class definition.
 struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16)
-    typedef half_impl::__half __half;
+  #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
    typedef half_impl::__half_raw __half_raw;
   #endif
 
   EIGEN_DEVICE_FUNC half() {}
 
-  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
 
   explicit EIGEN_DEVICE_FUNC half(bool b)
       : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
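To make the renaming above concrete, the tests later in this commit build Eigen::half values from raw bit patterns through __half_raw. A hedged host-side sketch of that usage, assuming a non-CUDA build where EIGEN_HAS_CUDA_FP16 is not defined so Eigen's own __half_raw is used:

    #include <Eigen/Core>

    int main() {
      Eigen::half_impl::__half_raw raw(0x3c00);   // 0x3c00 is the binary16 encoding of 1.0f
      Eigen::half h(raw);                         // uses the half(const __half_raw&) constructor above
      return (float(h) == 1.0f) ? 0 : 1;          // conversion goes through half_to_float()
    }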
@@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
-  __half h;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+  __half_raw h;
   h.x = x;
   return h;
 }
@@ -280,12 +287,13 @@ union FP32 {
   float f;
 };
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
-  return __float2half(ff);
+  __half tmp_ff = __float2half(ff);
+  return *(__half_raw*)&tmp_ff;
 
 #elif defined(EIGEN_HAS_FP16_C)
-  __half h;
+  __half_raw h;
   h.x = _cvtss_sh(ff, 0);
   return h;
 
@@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
   const FP32 f16max = { (127 + 16) << 23 };
   const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
   unsigned int sign_mask = 0x80000000u;
-  __half o;
+  __half_raw o;
   o.x = static_cast<unsigned short>(0x0u);
 
   unsigned int sign = f.u & sign_mask;
@@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
 #endif
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
   return __half2float(h);
 
@@ -512,8 +520,8 @@ struct numeric_limits<Eigen::half> {
   static const bool is_bounded = false;
   static const bool is_modulo = false;
   static const int digits = 11;
-  static const int digits10 = 2;
-  //static const int max_digits10 = ;
+  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
   static const int radix = 2;
   static const int min_exponent = -13;
   static const int min_exponent10 = -4;
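The corrected constants describe decimal round-tripping for the 11-bit significand: 3 significant decimal digits always survive a trip through half, and 5 digits are enough to serialize any finite half exactly. A hedged sketch of how max_digits10 might be used when printing:

    #include <iostream>
    #include <iomanip>
    #include <limits>
    #include <Eigen/Core>

    int main() {
      Eigen::half x(0.1f);
      // Print with enough precision to reconstruct the exact binary16 value.
      std::cout << std::setprecision(std::numeric_limits<Eigen::half>::max_digits10)
                << float(x) << "\n";
    }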
@@ -612,11 +620,15 @@ struct hash<Eigen::half> {
 // Add the missing shfl_xor intrinsic
 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  #if EIGEN_CUDACC_VER < 90000
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+  #else
+  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+  #endif
 }
 #endif
 
-// ldg() has an overload for __half, but we also need one for Eigen::half.
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(
@@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&
 
 template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
-  result.x = a.x & 0x7FFF7FFF;
+  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
   return result;
 }
 
@@ -410,6 +410,16 @@
 #endif
 #endif
 
+// Does the compiler support type_traits?
+#ifndef EIGEN_HAS_TYPE_TRAITS
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700)
+#define EIGEN_HAS_TYPE_TRAITS 1
+#define EIGEN_INCLUDE_TYPE_TRAITS
+#else
+#define EIGEN_HAS_TYPE_TRAITS 0
+#endif
+#endif
+
 // Does the compiler support variadic templates?
 #ifndef EIGEN_HAS_VARIADIC_TEMPLATES
 #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
@@ -493,7 +493,7 @@ template<typename T> struct smart_copy_helper<T,true> {
     IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
-    memcpy(target, start, size);
+    std::memcpy(target, start, size);
   }
 };
 
@@ -696,7 +696,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 /** \class aligned_allocator
   * \ingroup Core_Module
   *
-  * \brief STL compatible allocator to use with with 16 byte aligned types
+  * \brief STL compatible allocator to use with types requiring a non standard alignment.
+  *
+  * The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+  * By default, it will thus provide at least 16 bytes alignment and more in following cases:
+  * - 32 bytes alignment if AVX is enabled.
+  * - 64 bytes alignment if AVX512 is enabled.
+  *
+  * This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+  * \link TopicPreprocessorDirectivesPerformance there \endlink.
   *
   * Example:
   * \code
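The classical use case for this allocator, in line with the Example block referenced above, is an STL container of fixed-size vectorizable Eigen types; a brief sketch:

    #include <vector>
    #include <Eigen/Core>

    // std::vector's default allocator ignores Eigen's alignment requirements, so
    // over-aligned fixed-size types are stored through aligned_allocator instead:
    std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > points(8);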
@@ -34,6 +34,18 @@ inline IndexDest convert_index(const IndexSrc& idx) {
   return IndexDest(idx);
 }
 
+// true if T can be considered as an integral index (i.e., an integral type or enum)
+template<typename T> struct is_valid_index_type
+{
+  enum { value =
+#if EIGEN_HAS_TYPE_TRAITS
+    internal::is_integral<T>::value || std::is_enum<T>::value
+#else
+    // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
+    internal::is_convertible<T,Index>::value
+#endif
+  };
+};
+
 // promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
 // expression * scalar
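A hedged illustration of what the new trait accepts, assuming the C++11 path where EIGEN_HAS_TYPE_TRAITS is 1 and that the trait is reachable as Eigen::internal::is_valid_index_type as in this hunk (the enum name Axis is arbitrary):

    #include <Eigen/Core>

    enum Axis { X = 0, Y = 1 };
    static_assert(Eigen::internal::is_valid_index_type<int>::value,  "integral types are valid indices");
    static_assert(Eigen::internal::is_valid_index_type<Axis>::value, "enums are valid indices too");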
@@ -309,35 +309,40 @@ inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiR
 }
 
 namespace internal {
-template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
+template<typename Scalar, typename OtherScalar,
+         int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
+struct apply_rotation_in_the_plane_selector
+{
+  static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
+  {
+    for(Index i=0; i<size; ++i)
+    {
+      Scalar xi = *x;
+      Scalar yi = *y;
+      *x = c * xi + numext::conj(s) * yi;
+      *y = -s * xi + numext::conj(c) * yi;
+      x += incrx;
+      y += incry;
+    }
+  }
+};
+
+template<typename Scalar, typename OtherScalar,
+         int SizeAtCompileTime, int MinAlignment>
+struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,true /* vectorizable */>
+{
+  static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
 {
-  typedef typename VectorX::Scalar Scalar;
   enum {
     PacketSize = packet_traits<Scalar>::size,
     OtherPacketSize = packet_traits<OtherScalar>::size
   };
   typedef typename packet_traits<Scalar>::type Packet;
   typedef typename packet_traits<OtherScalar>::type OtherPacket;
-  eigen_assert(xpr_x.size() == xpr_y.size());
-  Index size = xpr_x.size();
-  Index incrx = xpr_x.derived().innerStride();
-  Index incry = xpr_y.derived().innerStride();
 
-  Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
-  Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
 
-  OtherScalar c = j.c();
-  OtherScalar s = j.s();
-  if (c==OtherScalar(1) && s==OtherScalar(0))
-    return;
 
   /*** dynamic-size vectorized paths ***/
-  if(VectorX::SizeAtCompileTime == Dynamic &&
-     (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-     (PacketSize == OtherPacketSize) &&
-     ((incrx==1 && incry==1) || PacketSize == 1))
+  if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1))
   {
     // both vectors are sequentially stored in memory => vectorization
     enum { Peeling = 2 };
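The scalar kernel in run() applies the same 2x2 plane rotation as before the refactor: per coefficient pair it computes x_i <- c*x_i + conj(s)*y_i and y_i <- -s*x_i + conj(c)*y_i. A hedged sketch of the public entry point that ends up in this internal code path:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
      Eigen::JacobiRotation<double> G;
      G.makeGivens(A(0, 0), A(1, 0));        // choose c,s so the rotation zeroes A(1,0)
      A.applyOnTheLeft(0, 1, G.adjoint());   // rotates rows 0 and 1; calls apply_rotation_in_the_plane
      return 0;
    }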
@@ -408,10 +413,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
   }
 
   /*** fixed-size vectorized path ***/
-  else if(VectorX::SizeAtCompileTime != Dynamic &&
-          (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-          (PacketSize == OtherPacketSize) &&
-          (EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment)>0)) // FIXME should be compared to the required alignment
+  else if(SizeAtCompileTime != Dynamic && MinAlignment>0) // FIXME should be compared to the required alignment
   {
     const OtherPacket pc = pset1<OtherPacket>(c);
     const OtherPacket ps = pset1<OtherPacket>(s);
|
|||||||
/*** non-vectorized path ***/
|
/*** non-vectorized path ***/
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
for(Index i=0; i<size; ++i)
|
apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,false>::run(x,incrx,y,incry,size,c,s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename VectorX, typename VectorY, typename OtherScalar>
|
||||||
|
void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
|
||||||
{
|
{
|
||||||
Scalar xi = *x;
|
typedef typename VectorX::Scalar Scalar;
|
||||||
Scalar yi = *y;
|
const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit)
|
||||||
*x = c * xi + numext::conj(s) * yi;
|
&& (int(packet_traits<Scalar>::size) == int(packet_traits<OtherScalar>::size));
|
||||||
*y = -s * xi + numext::conj(c) * yi;
|
|
||||||
x += incrx;
|
eigen_assert(xpr_x.size() == xpr_y.size());
|
||||||
y += incry;
|
Index size = xpr_x.size();
|
||||||
}
|
Index incrx = xpr_x.derived().innerStride();
|
||||||
}
|
Index incry = xpr_y.derived().innerStride();
|
||||||
|
|
||||||
|
Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
|
||||||
|
Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
|
||||||
|
|
||||||
|
OtherScalar c = j.c();
|
||||||
|
OtherScalar s = j.s();
|
||||||
|
if (c==OtherScalar(1) && s==OtherScalar(0))
|
||||||
|
return;
|
||||||
|
|
||||||
|
apply_rotation_in_the_plane_selector<
|
||||||
|
Scalar,OtherScalar,
|
||||||
|
VectorX::SizeAtCompileTime,
|
||||||
|
EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment),
|
||||||
|
Vectorizable>::run(x,incrx,y,incry,size,c,s);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
@@ -94,7 +94,7 @@ class AmbiVector
       Index allocSize = m_allocatedElements * sizeof(ListEl);
       allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
       Scalar* newBuffer = new Scalar[allocSize];
-      memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
+      std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
      delete[] m_buffer;
       m_buffer = newBuffer;
     }
@@ -17,7 +17,9 @@ namespace internal {
 template<typename Lhs, typename Rhs, typename ResultType>
 static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false)
 {
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+  typedef typename remove_all<ResultType>::type::Scalar ResScalar;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
|
|||||||
eigen_assert(lhs.outerSize() == rhs.innerSize());
|
eigen_assert(lhs.outerSize() == rhs.innerSize());
|
||||||
|
|
||||||
ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0);
|
ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0);
|
||||||
ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0);
|
ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0);
|
||||||
ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
|
ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
|
||||||
|
|
||||||
std::memset(mask,0,sizeof(bool)*rows);
|
std::memset(mask,0,sizeof(bool)*rows);
|
||||||
@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
|
|||||||
Index nnz = 0;
|
Index nnz = 0;
|
||||||
for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
|
for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
|
||||||
{
|
{
|
||||||
Scalar y = rhsIt.value();
|
RhsScalar y = rhsIt.value();
|
||||||
Index k = rhsIt.index();
|
Index k = rhsIt.index();
|
||||||
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
|
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
|
||||||
{
|
{
|
||||||
Index i = lhsIt.index();
|
Index i = lhsIt.index();
|
||||||
Scalar x = lhsIt.value();
|
LhsScalar x = lhsIt.value();
|
||||||
if(!mask[i])
|
if(!mask[i])
|
||||||
{
|
{
|
||||||
mask[i] = true;
|
mask[i] = true;
|
||||||
@@ -166,10 +168,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
-    RowMajorMatrix rhsRow = rhs;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
+    typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRhs;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+    RowMajorRhs rhsRow = rhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<RowMajorRhs,Lhs,RowMajorRes>(rhsRow, lhs, resRow);
     res = resRow;
   }
 };
@@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
-    RowMajorMatrix lhsRow = lhs;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
+    typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorLhs;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+    RowMajorLhs lhsRow = lhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorLhs,RowMajorRes>(rhs, lhsRow, resRow);
     res = resRow;
   }
 };
@@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
-    ColMajorMatrix lhsCol = lhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+    ColMajorLhs lhsCol = lhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<ColMajorLhs,Rhs,ColMajorRes>(lhsCol, rhs, resCol);
     res = resCol;
   }
 };
@@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
-    ColMajorMatrix rhsCol = rhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
+    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+    ColMajorRhs rhsCol = rhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorRhs,ColMajorRes>(lhs, rhsCol, resCol);
     res = resCol;
   }
 };
@@ -263,7 +269,8 @@ namespace internal {
 template<typename Lhs, typename Rhs, typename ResultType>
 static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
 {
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
   Index cols = rhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
@@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs,
   {
     for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
-      Scalar y = rhsIt.value();
+      RhsScalar y = rhsIt.value();
       Index k = rhsIt.index();
       for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
       {
         Index i = lhsIt.index();
-        Scalar x = lhsIt.value();
+        LhsScalar x = lhsIt.value();
         res.coeffRef(i,j) += x * y;
       }
     }
@@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMa
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
-    ColMajorMatrix lhsCol(lhs);
-    internal::sparse_sparse_to_dense_product_impl<ColMajorMatrix,Rhs,ResultType>(lhsCol, rhs, res);
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+    ColMajorLhs lhsCol(lhs);
+    internal::sparse_sparse_to_dense_product_impl<ColMajorLhs,Rhs,ResultType>(lhsCol, rhs, res);
   }
 };
 
||||||
@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMa
|
|||||||
{
|
{
|
||||||
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
|
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
|
||||||
{
|
{
|
||||||
typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
|
typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
|
||||||
ColMajorMatrix rhsCol(rhs);
|
ColMajorRhs rhsCol(rhs);
|
||||||
internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorMatrix,ResultType>(lhs, rhsCol, res);
|
internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorRhs,ResultType>(lhs, rhsCol, res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -21,7 +21,8 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
 {
   // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
 
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+  typedef typename remove_all<ResultType>::type::Scalar ResScalar;
   typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
   // allocate a temporary buffer
-  AmbiVector<Scalar,StorageIndex> tempVector(rows);
+  AmbiVector<ResScalar,StorageIndex> tempVector(rows);
 
   // mimics a resizeByInnerOuter:
   if(ResultType::IsRowMajor)
@@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
     {
       // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
       tempVector.restart();
-      Scalar x = rhsIt.value();
+      RhsScalar x = rhsIt.value();
       for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt)
       {
         tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
       }
     }
     res.startVec(j);
-    for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
+    for (typename AmbiVector<ResScalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
       res.insertBackByOuterInner(j,it.index()) = it.value();
   }
   res.finalize();
@@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector;
 template<typename Lhs, typename Rhs, typename ResultType>
 struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
 {
-  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
   typedef typename ResultType::RealScalar RealScalar;
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
@@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
     ColMajorMatrixLhs colLhs(lhs);
     ColMajorMatrixRhs colRhs(rhs);
     internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
@@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
+    typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
     RowMajorMatrixLhs rowLhs(lhs);
     sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs,Rhs,ResultType,RowMajor,RowMajor>(rowLhs,rhs,res,tolerance);
   }
@@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
+    typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
     RowMajorMatrixRhs rowRhs(rhs);
     sparse_sparse_product_with_pruning_selector<Lhs,RowMajorMatrixRhs,ResultType,RowMajor,RowMajor,RowMajor>(lhs,rowRhs,res,tolerance);
   }
@@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+    typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
     ColMajorMatrixRhs colRhs(rhs);
     internal::sparse_sparse_product_with_pruning_impl<Lhs,ColMajorMatrixRhs,ResultType>(lhs, colRhs, res, tolerance);
   }
@@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
     ColMajorMatrixLhs colLhs(lhs);
     internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,Rhs,ResultType>(colLhs, rhs, res, tolerance);
   }
@@ -55,9 +55,7 @@ ivcSize(const Indices& indices) const {
 
 template<typename RowIndices, typename ColIndices>
 struct valid_indexed_view_overload {
-  // Here we use is_convertible to Index instead of is_integral in order to treat enums as Index.
-  // In c++11 we could use is_integral<T> && is_enum<T> if is_convertible appears to be too permissive.
-  enum { value = !(internal::is_convertible<RowIndices,Index>::value && internal::is_convertible<ColIndices,Index>::value) };
+  enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
 };
 
 public:
@@ -146,7 +144,7 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col
 
 template<typename Indices>
 typename internal::enable_if<
-  IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)),
+  IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
   IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >::type
 operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
 {
@@ -157,7 +155,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
 
 template<typename Indices>
 typename internal::enable_if<
-  (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)),
+  (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
   IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >::type
 operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
 {
@@ -168,7 +166,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
 
 template<typename Indices>
 typename internal::enable_if<
-  (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_integral<Indices>::value) && (!Symbolic::is_symbolic<Indices>::value),
+  (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!Symbolic::is_symbolic<Indices>::value),
   VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >::type
 operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
 {
@@ -250,6 +248,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
   *
   * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter.
   *
+  * See also this <a href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a> and its answer for an example of how to duplicate coefficients.
+  *
   * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
   */
 template<typename RowIndices, typename ColIndices>
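The StackOverflow reference added above concerns replicating coefficients through an indexed view. A hedged sketch of the trick, assuming a recent Eigen where Eigen::all and operator() with index lists are available as in this development branch:

    #include <vector>
    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 2);
      std::vector<int> cols = {0, 0, 1, 1};        // repeating an index duplicates that column
      Eigen::MatrixXd B = A(Eigen::all, cols);     // 3 x 4 copy with columns 0 and 1 doubled
      return 0;
    }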
@@ -20,9 +20,6 @@
 
 #include <math_constants.h>
 #include <cuda.h>
-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include "cuda_common.h"
 
@@ -20,7 +20,7 @@ using Eigen::half;
 
 void test_conversion()
 {
-  using Eigen::half_impl::__half;
+  using Eigen::half_impl::__half_raw;
 
   // Conversion from float.
   VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
@@ -37,9 +37,9 @@ void test_conversion()
   VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
 
   // Verify round-to-nearest-even behavior.
-  float val1 = float(half(__half(0x3c00)));
-  float val2 = float(half(__half(0x3c01)));
-  float val3 = float(half(__half(0x3c02)));
+  float val1 = float(half(__half_raw(0x3c00)));
+  float val2 = float(half(__half_raw(0x3c01)));
+  float val3 = float(half(__half_raw(0x3c02)));
   VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
   VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
 
@@ -55,21 +55,21 @@ void test_conversion()
   VERIFY_IS_EQUAL(half(true).x, 0x3c00);
 
   // Conversion to float.
-  VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f);
-  VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f);
+  VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f);
+  VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f);
 
   // Denormals.
-  VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f);
-  VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f);
-  VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f);
 
   // NaNs and infinities.
   VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number.
   VERIFY(!(numext::isnan)(float(half(0.0f))));
-  VERIFY((numext::isinf)(float(half(__half(0xfc00)))));
-  VERIFY((numext::isnan)(float(half(__half(0xfc01)))));
-  VERIFY((numext::isinf)(float(half(__half(0x7c00)))));
-  VERIFY((numext::isnan)(float(half(__half(0x7c01)))));
+  VERIFY((numext::isinf)(float(half(__half_raw(0xfc00)))));
+  VERIFY((numext::isnan)(float(half(__half_raw(0xfc01)))));
+  VERIFY((numext::isinf)(float(half(__half_raw(0x7c00)))));
+  VERIFY((numext::isnan)(float(half(__half_raw(0x7c01)))));
 
 #if !EIGEN_COMP_MSVC
   // Visual Studio errors out on divisions by 0
@@ -79,12 +79,12 @@ void test_conversion()
 #endif
 
   // Exactly same checks as above, just directly on the half representation.
-  VERIFY(!(numext::isinf)(half(__half(0x7bff))));
-  VERIFY(!(numext::isnan)(half(__half(0x0000))));
-  VERIFY((numext::isinf)(half(__half(0xfc00))));
-  VERIFY((numext::isnan)(half(__half(0xfc01))));
-  VERIFY((numext::isinf)(half(__half(0x7c00))));
-  VERIFY((numext::isnan)(half(__half(0x7c01))));
+  VERIFY(!(numext::isinf)(half(__half_raw(0x7bff))));
+  VERIFY(!(numext::isnan)(half(__half_raw(0x0000))));
+  VERIFY((numext::isinf)(half(__half_raw(0xfc00))));
+  VERIFY((numext::isnan)(half(__half_raw(0xfc01))));
+  VERIFY((numext::isinf)(half(__half_raw(0x7c00))));
+  VERIFY((numext::isnan)(half(__half_raw(0x7c01))));
 
 #if !EIGEN_COMP_MSVC
   // Visual Studio errors out on divisions by 0
@@ -366,6 +366,11 @@ void check_indexed_view()
     VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) );
   }
 
+  // Check compilation of enums as index type:
+  enum { X=0, Y=1 };
+  a(X) = 1;
+  A(X,Y) = 1;
+
 }
 
 void test_indexed_view()
test/main.h (+13 lines)
@@ -50,6 +50,19 @@
 #endif
 #endif
 
+// Same for cuda_fp16.h
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_TEST_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_TEST_CUDACC_VER 0
+#endif
+
+#if EIGEN_TEST_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 // To test that all calls from Eigen code to std::min() and std::max() are
 // protected by parenthesis against macro expansion, the min()/max() macros
 // are defined here and any not-parenthesized min/max call will cause a
@@ -371,6 +371,88 @@ void bug_942()
   VERIFY_IS_APPROX( ( d.asDiagonal()*cmA ).eval().coeff(0,0), res );
 }
 
+template<typename Real>
+void test_mixing_types()
+{
+  typedef std::complex<Real> Cplx;
+  typedef SparseMatrix<Real> SpMatReal;
+  typedef SparseMatrix<Cplx> SpMatCplx;
+  typedef SparseMatrix<Cplx,RowMajor> SpRowMatCplx;
+  typedef Matrix<Real,Dynamic,Dynamic> DenseMatReal;
+  typedef Matrix<Cplx,Dynamic,Dynamic> DenseMatCplx;
+
+  Index n = internal::random<Index>(1,100);
+  double density = (std::max)(8./(n*n), 0.2);
+
+  SpMatReal sR1(n,n);
+  SpMatCplx sC1(n,n), sC2(n,n), sC3(n,n);
+  SpRowMatCplx sCR(n,n);
+  DenseMatReal dR1(n,n);
+  DenseMatCplx dC1(n,n), dC2(n,n), dC3(n,n);
+
+  initSparse<Real>(density, dR1, sR1);
+  initSparse<Cplx>(density, dC1, sC1);
+  initSparse<Cplx>(density, dC2, sC2);
+
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1), dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1), dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1), dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()), dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1), dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1), dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1), dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()), dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1).pruned(), dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1).pruned(), dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()).pruned(), dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1).pruned(), dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1).pruned(), dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()).pruned(), dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+
+  VERIFY_IS_APPROX( dC2 = (sR1 * sC1), dC3 = dR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( dC2 = (sC1 * sR1), dC3 = dC1 * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1), dC3 = dR1.template cast<Cplx>().transpose() * dC1 );
+  VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = (sR1 * sC1.transpose()), dC3 = dR1.template cast<Cplx>() * dC1.transpose() );
+  VERIFY_IS_APPROX( dC2 = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast<Cplx>().transpose() );
+  VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast<Cplx>().transpose() * dC1.transpose() );
+  VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast<Cplx>().transpose() );
+
+
+  VERIFY_IS_APPROX( dC2 = dR1 * sC1, dC3 = dR1.template cast<Cplx>() * sC1 );
+  VERIFY_IS_APPROX( dC2 = sR1 * dC1, dC3 = sR1.template cast<Cplx>() * dC1 );
+  VERIFY_IS_APPROX( dC2 = dC1 * sR1, dC3 = dC1 * sR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = sC1 * dR1, dC3 = sC1 * dR1.template cast<Cplx>() );
+
+  VERIFY_IS_APPROX( dC2 = dR1.row(0) * sC1, dC3 = dR1.template cast<Cplx>().row(0) * sC1 );
+  VERIFY_IS_APPROX( dC2 = sR1 * dC1.col(0), dC3 = sR1.template cast<Cplx>() * dC1.col(0) );
+  VERIFY_IS_APPROX( dC2 = dC1.row(0) * sR1, dC3 = dC1.row(0) * sR1.template cast<Cplx>() );
+  VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast<Cplx>().col(0) );
+}
+
 void test_sparse_product()
 {
   for(int i = 0; i < g_repeat; i++) {
@ -381,5 +463,7 @@ void test_sparse_product()
|
|||||||
CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, RowMajor > >()) );
|
CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, RowMajor > >()) );
|
||||||
CALL_SUBTEST_3( (sparse_product<SparseMatrix<float,ColMajor,long int> >()) );
|
CALL_SUBTEST_3( (sparse_product<SparseMatrix<float,ColMajor,long int> >()) );
|
||||||
CALL_SUBTEST_4( (sparse_product_regression_test<SparseMatrix<double,RowMajor>, Matrix<double, Dynamic, Dynamic, RowMajor> >()) );
|
CALL_SUBTEST_4( (sparse_product_regression_test<SparseMatrix<double,RowMajor>, Matrix<double, Dynamic, Dynamic, RowMajor> >()) );
|
||||||
|
|
||||||
|
CALL_SUBTEST_5( (test_mixing_types<float>()) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
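The checks added above exercise sparse products whose operands have different scalar types: a real sparse factor (sR1) against a complex sparse or dense factor (sC1, dC1), with every result compared to the equivalent dense product after casting the real factor to complex. A minimal stand-alone sketch of that pattern follows; names and sizes are made up and the mixed-scalar product is exactly the feature these tests cover, so it may not compile against an older Eigen.

// Illustrative only: a real sparse matrix multiplied by a complex sparse
// matrix, checked against the dense reference computed after casting.
#include <complex>
#include <Eigen/Dense>
#include <Eigen/Sparse>

int main()
{
  typedef std::complex<double> Cplx;
  Eigen::MatrixXd  dR = Eigen::MatrixXd::Random(5, 5);    // real dense reference
  Eigen::MatrixXcd dC = Eigen::MatrixXcd::Random(5, 5);   // complex dense reference
  Eigen::SparseMatrix<double> sR = dR.sparseView();        // real sparse operand
  Eigen::SparseMatrix<Cplx>   sC = dC.sparseView();        // complex sparse operand

  // Mixed real*complex sparse product versus its dense equivalent.
  Eigen::SparseMatrix<Cplx> sC2 = sR * sC;
  Eigen::MatrixXcd dC3 = dR.cast<Cplx>() * dC;
  return (dC3 - Eigen::MatrixXcd(sC2)).norm() < 1e-12 ? 0 : 1;
}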
@@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
 // the sum across all big k blocks of the product of little k block of index (x, y)
 // with block of index (y, z). To compute the final output, we need to reduce
 // the 8 threads over y by summation.
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif

 #define reduceRow(i, mask) \
   shuffleInc(i, 0, mask); \
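CUDA 9 deprecates the implicit-warp-synchronous __shfl_xor in favour of __shfl_xor_sync, which takes an explicit lane mask, so the macro above is selected on EIGEN_CUDACC_VER. A stand-alone toy kernel showing the same guard outside of Eigen (illustrative only; the SHFL_XOR name and the kernel are made up, not part of this patch):

// Compile with nvcc; picks the shuffle intrinsic based on the toolkit version.
#include <cstdio>

#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define SHFL_XOR(v, m) __shfl_xor_sync(0xFFFFFFFF, (v), (m))  // CUDA 9+: explicit mask
#else
#define SHFL_XOR(v, m) __shfl_xor((v), (m))                   // pre-CUDA 9 intrinsic
#endif

__global__ void warpSumKernel(float* out) {
  float v = static_cast<float>(threadIdx.x);
  // Butterfly reduction across the 32 lanes of the warp.
  for (int mask = 16; mask > 0; mask >>= 1)
    v += SHFL_XOR(v, mask);
  if (threadIdx.x == 0) *out = v;   // every lane ends up holding the same sum
}

int main() {
  float *d_out, h_out = 0.f;
  cudaMalloc(&d_out, sizeof(float));
  warpSumKernel<<<1, 32>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("warp sum = %g\n", h_out);   // expected 496 = 0 + 1 + ... + 31
  cudaFree(d_out);
  return 0;
}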
@@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
       x1 = rhs_pf0.x;
       x2 = rhs_pf0.z;
     }
+    #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
     x1 = __shfl_xor(x1, 4);
     x2 = __shfl_xor(x2, 4);
+    #else
+    x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+    x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+    #endif
     if((threadIdx.x%8) < 4) {
       rhs_pf0.y = x1;
       rhs_pf0.w = x2;
@@ -174,8 +174,10 @@ class TensorCostModel {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
     double cost = totalCost(output_size, cost_per_coeff);
-    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
-    return numext::mini(max_threads, numext::maxi(1, threads));
+    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    // Make sure we don't invoke undefined behavior when we convert to an int.
+    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+    return numext::mini(max_threads, numext::maxi<int>(1, threads));
   }

   // taskSize assesses parallel task size.
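The point of the change above is that the estimated thread count can exceed what an int can hold when the cost estimate is huge, and converting an out-of-range double to int is undefined behaviour in C++; clamping while still in double makes the conversion safe. A small stand-alone sketch of the same idea, using <limits> instead of Eigen's GenericNumTraits (names and constants here are illustrative, not Eigen's):

#include <algorithm>
#include <cstdio>
#include <limits>

static int numThreadsFor(double cost, double startupCycles, double perThreadCycles,
                         int maxThreads) {
  double threads = (cost - startupCycles) / perThreadCycles + 0.9;
  // Clamp while still in double: converting an out-of-range double to int
  // is undefined behavior, so cap at the largest representable int first.
  threads = std::min(threads, static_cast<double>(std::numeric_limits<int>::max()));
  return std::min(maxThreads, std::max(1, static_cast<int>(threads)));
}

int main() {
  std::printf("%d\n", numThreadsFor(1e30, 1e5, 1e5, 16));  // huge cost -> capped at 16
  std::printf("%d\n", numThreadsFor(0.0, 1e5, 1e5, 16));   // tiny cost  -> at least 1
  return 0;
}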
@@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
   else {
     assert(0 && "Wordsize not supported");
   }
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }

 // We extend atomicExch to support extra data types
@@ -104,9 +104,9 @@ template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
 #if EIGEN_CUDA_ARCH >= 300
   atomicAdd(output, accum);
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }


@@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num

 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
     reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+  #else
+    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+  #endif
   }

   if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
       // Let the last block reset the semaphore
       atomicInc(semaphore, gridDim.x + 1);
     }
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }


@@ -223,13 +227,15 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;

   // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
       if (num_coeffs % 2 != 0) {
         half last = input.m_impl.coeff(num_coeffs-1);
         *scratch = __halves2half2(last, reducer.initialize());
       } else {
         *scratch = reducer.template initializePacket<half2>();
       }
+    }
     __syncthreads();
   }

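The restructuring above appears to be about barrier placement: __syncthreads() must be reached by every thread of the block, so it is guarded only by the block-uniform gridDim.x == 1 test, while the initialization that only the first thread performs moves into its own nested branch. A toy, self-contained kernel illustrating that rule (not Eigen code; names are made up):

#include <cstdio>

__global__ void initThenUse(float* scratch, float* out, int n) {
  if (gridDim.x == 1) {          // uniform across the block: safe to sync inside
    if (threadIdx.x == 0) {      // divergent: only thread 0 initializes
      *scratch = 1.f;
    }
    __syncthreads();             // reached by every thread of the block
  }
  for (int i = threadIdx.x; i < n; i += blockDim.x)
    out[i] = *scratch + i;
}

int main() {
  const int n = 8;
  float *d_scratch, *d_out, h_out[n];
  cudaMalloc(&d_scratch, sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  initThenUse<<<1, 32>>>(d_scratch, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) std::printf("%g ", h_out[i]);  // prints 1 2 ... 8
  std::printf("\n");
  cudaFree(d_scratch); cudaFree(d_out);
  return 0;
}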
@@ -244,21 +250,27 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,

 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
     reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+  #else
+    int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
+    reducer.reducePacket(*(half2*)(&temp), &accum);
+  #endif
   }

   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(scratch, accum, reducer);
   }

+  if (gridDim.x == 1) {
     __syncthreads();
-
-  if (gridDim.x == 1 && first_index == 0) {
+    if (first_index == 0) {
       half tmp = __low2half(*scratch);
       reducer.reduce(__high2half(*scratch), &tmp);
       *output = tmp;
     }
   }
+  }

 template <typename Op>
 __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
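The #else branch above does not hand the half2 accumulator to __shfl_down_sync directly; instead it moves the raw 32 bits through an int and reinterprets them back before the packet reduction, presumably because the sync shuffle overloads in use do not cover half2. A rough stand-alone sketch of the same trick (illustrative only; assumes a CUDA 9+ toolchain and a GPU of compute capability 5.3 or newer for half2 arithmetic):

#include <cuda_fp16.h>
#include <cstdio>

__global__ void warpSumHalf2(const float* in, float* out) {
  // Pack two floats per lane into a half2 and reduce across the warp.
  __half2 accum = __floats2half2_rn(in[2 * threadIdx.x], in[2 * threadIdx.x + 1]);
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Shuffle the half2 by moving its raw 32 bits as an int.
    int bits = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
    accum = __hadd2(accum, *(__half2*)(&bits));   // add the shuffled-in packet
  }
  if (threadIdx.x == 0) {
    float2 f = __half22float2(accum);
    out[0] = f.x;   // sum of the even-indexed inputs
    out[1] = f.y;   // sum of the odd-indexed inputs
  }
}

int main() {
  const int lanes = 32;
  float h_in[2 * lanes], h_out[2] = {0.f, 0.f};
  for (int i = 0; i < 2 * lanes; ++i) h_in[i] = 1.f;   // every element is 1
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  warpSumHalf2<<<1, lanes>>>(d_in, d_out);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  std::printf("%g %g\n", h_out[0], h_out[1]);   // expected: 32 32
  cudaFree(d_in); cudaFree(d_out);
  return 0;
}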
@@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu

 #pragma unroll
       for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
         reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+      #else
+        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+      #endif
       }

       if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
       }
     }
   }
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }

 #ifdef EIGEN_HAS_CUDA_FP16
@@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,

 #pragma unroll
       for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
         reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
         reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+      #else
+        int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize);
+        int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize);
+        reducer.reducePacket(*(half2*)(&temp1), &reduced_val1);
+        reducer.reducePacket(*(half2*)(&temp2), &reduced_val2);
+      #endif
       }

       half val1 = __low2half(reduced_val1);
@@ -341,7 +341,7 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d)

   // set from a vector of Euler angles
   template<class System, class Other>
-  struct eulerangles_assign_impl<System,Other,4,1>
+  struct eulerangles_assign_impl<System,Other,3,1>
   {
     typedef typename Other::Scalar Scalar;
     static void run(EulerAngles<Scalar, System>& e, const Other& vec)
@@ -279,6 +279,9 @@ void test_EulerAngles()
   EulerAnglesXYZf onesEf = onesEd.cast<float>();
   VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());

+  // Simple Construction from Vector3 test
+  VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
+
   CALL_SUBTEST_1( eulerangles_manual<float>() );
   CALL_SUBTEST_2( eulerangles_manual<double>() );

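With the specialization above now matching 3x1 vectors instead of 4x1, an EulerAngles object can be built directly from a Vector3, which is what the new test line checks. A minimal usage sketch (assumes the unsupported EulerAngles module is available; the angle values are arbitrary):

#include <iostream>
#include <Eigen/Core>
#include <unsupported/Eigen/EulerAngles>

int main()
{
  Eigen::Vector3d angles(0.1, 0.2, 0.3);     // alpha, beta, gamma
  Eigen::EulerAnglesXYZd e(angles);          // construct directly from a 3-vector
  std::cout << e.alpha() << " " << e.beta() << " " << e.gamma() << std::endl;
  return 0;
}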
@@ -12,12 +12,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;

 template <int Layout>
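This hunk, and the near-identical ones that follow for the other CUDA test files, all make the same change: EIGEN_CUDACC_VER is only defined once an Eigen header has been pulled in, so testing it before including main.h and the Tensor header always saw it undefined and cuda_fp16.h was never included. A tiny stand-alone sketch of the ordering requirement (illustrative only; compile with nvcc for the toolkit version to be reported):

#include <unsupported/Eigen/CXX11/Tensor>

// Only meaningful after an Eigen header has been included.
#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>   // reached only on a CUDA 7.5+ toolchain
#endif

#include <cstdio>

int main() {
#if defined EIGEN_CUDACC_VER
  std::printf("EIGEN_CUDACC_VER = %d\n", (int)EIGEN_CUDACC_VER);
#else
  std::printf("EIGEN_CUDACC_VER not defined\n");
#endif
  return 0;
}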
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;

 void test_cuda_conversion() {
@@ -11,12 +11,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_complex
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;

 void test_cuda_nullary() {
@@ -11,12 +11,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;

 template<typename T>
@@ -14,12 +14,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;

@@ -12,12 +12,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;

 void test_cuda_nullary() {
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 using Eigen::RowMajor;

@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;

 template<typename>
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+

 void test_cuda_random_uniform()
 {
@@ -12,12 +12,15 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
 #define EIGEN_USE_GPU

-#if dEIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+

 template<typename Type, int DataLayout>
 static void test_full_reductions() {
@@ -13,12 +13,15 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU

-#if EIGEN_CUDACC_VER >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>

+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
