Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-14 12:46:00 +08:00)

merging updates from upstream

commit 38807a2575
26  Eigen/Core
@@ -56,9 +56,10 @@
#ifdef EIGEN_EXCEPTIONS
#undef EIGEN_EXCEPTIONS
#endif
#endif

// All functions callable from CUDA code must be qualified with __device__
#ifdef EIGEN_CUDACC
// All functions callable from CUDA code must be qualified with __device__
#ifdef EIGEN_CUDACC
// Do not try to vectorize on CUDA and SYCL!
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
@@ -69,7 +70,19 @@
// works properly on the device side
#include <cuda_runtime.h>

#elif defined(EIGEN_HIPCC)
#if EIGEN_HAS_CONSTEXPR
// While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
#if defined(__NVCC__)
// nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
#ifdef __CUDACC_RELAXED_CONSTEXPR__
#define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
#endif
#elif defined(__clang__) && defined(__CUDA__)
// clang++ always considers constexpr functions as implicitly __host__ __device__
#define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
#endif

#elif defined(EIGEN_HIPCC)
// Do not try to vectorize on HIP
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
@@ -88,15 +101,14 @@
// for __HIP_DEVICE_COMPILE__
#endif

#else
#define EIGEN_DEVICE_FUNC
#endif
#else
#define EIGEN_DEVICE_FUNC
#endif

#ifdef __NVCC__
#define EIGEN_DONT_VECTORIZE
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#endif
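The three hunks above configure when EIGEN_DEVICE_FUNC expands to __host__ __device__ and when constexpr functions may be treated as device functions (EIGEN_CONSTEXPR_ARE_DEVICE_FUNC, under nvcc's --expt-relaxed-constexpr or clang's CUDA mode). As a rough illustration of what that annotation buys (editorial sketch, not part of this commit; MY_DEVICE_FUNC, clamp01 and clamp_kernel are made-up names):

    // Sketch: a function usable from both host code and CUDA kernels.
    #if defined(__CUDACC__) || (defined(__clang__) && defined(__CUDA__))
      #define MY_DEVICE_FUNC __host__ __device__
    #else
      #define MY_DEVICE_FUNC
    #endif

    MY_DEVICE_FUNC inline float clamp01(float x) {  // callable on host and device
      return x < 0.f ? 0.f : (x > 1.f ? 1.f : x);
    }

    #if defined(__CUDACC__)
    __global__ void clamp_kernel(float* data, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] = clamp01(data[i]);        // device-side call
    }
    #endif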
0  Eigen/PardisoSupport  (Executable file → Normal file)
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const { dst = m_expression; }

const typename internal::remove_all<NestedExpressionType>::type&
EIGEN_DEVICE_FUNC
const typename internal::remove_all<NestedExpressionType>::type&
nestedExpression() const
{
return m_expression;

@@ -1080,7 +1080,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
: m_argImpl(block.nestedExpression()),
m_startRow(block.startRow()),
m_startCol(block.startCol()),
m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
m_linear_offset((InnerPanel|| XprType::IsVectorAtCompileTime)?(XprType::IsRowMajor ? block.startRow()*block.cols() + block.startCol() : block.startCol()*block.rows() + block.startRow()):0)
{ }

typedef typename XprType::Scalar Scalar;
@@ -1088,7 +1088,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa

enum {
RowsAtCompileTime = XprType::RowsAtCompileTime,
ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
ForwardLinearAccess = (InnerPanel || XprType::IsVectorAtCompileTime) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
};

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1162,7 +1162,7 @@ protected:
evaluator<ArgType> m_argImpl;
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
const variable_if_dynamic<Index, (InnerPanel || XprType::IsVectorAtCompileTime) ? Dynamic : 0> m_linear_offset;
};

// TODO: This evaluator does not actually use the child evaluator;
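The evaluator changes above extend forwarded linear (1-D) access from inner-panel blocks to any block that is a vector at compile time, and m_linear_offset now points at the block's (0,0) coefficient rather than only at the start of its row or column. A comment-only sketch of the idea (editorial; the exact index arithmetic is the one shown in the constructor above):

    // When ForwardLinearAccess is true, the block evaluator reads directly from
    // the parent evaluator with a constant shift:
    //   block.coeff(i)  ->  parent.coeff(m_linear_offset + i)
    // e.g. for a row-major parent spanned row-wise, the offset is
    // startRow*cols + startCol, the flat position of the block's top-left coefficient.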
@@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data, other.m_data);
}
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
{
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
};
@@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
using std::swap;
swap(m_data, other.m_data);
swap(m_rows, other.m_rows);
swap(m_cols, other.m_cols);
numext::swap(m_data, other.m_data);
numext::swap(m_rows, other.m_rows);
numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
{
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index size, Index rows, Index cols)
@@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
using std::swap;
swap(m_data, other.m_data);
swap(m_cols, other.m_cols);
numext::swap(m_data, other.m_data);
numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
using std::swap;
swap(m_data, other.m_data);
swap(m_rows, other.m_rows);
numext::swap(m_data, other.m_data);
numext::swap(m_rows, other.m_rows);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
void conservativeResize(Index size, Index rows, Index)
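Every DenseStorage::swap above trades std::swap for numext::swap so that swapping storage also works inside device code, where std::swap is a host-only call. A minimal sketch of such a device-compatible swap (my_swap is an illustrative name; Eigen's numext::swap may be implemented differently):

    #include <Eigen/Core>   // for EIGEN_DEVICE_FUNC

    template <typename T>
    EIGEN_DEVICE_FUNC inline void my_swap(T& a, T& b) {
      T tmp = a;   // plain copy-based swap: no std:: call on the device path
      a = b;
      b = tmp;
    }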
@@ -163,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
template<typename Scalar,int Size,int MaxSize>
struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
{
EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
};

template<typename Scalar,int Size>
struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
{
EIGEN_STRONG_INLINE Scalar* data() { return 0; }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
};

template<typename Scalar,int Size,int MaxSize>
@@ -869,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);

namespace numext {

#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -886,19 +886,16 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
return max EIGEN_NOT_A_MACRO (x,y);
}

#elif defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
{
return y < x ? y : x;
}

template<typename T>
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
{
return x < y ? y : x;
}

@@ -942,7 +939,6 @@ EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned lo
return cl::sycl::max(x,y);
}

EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
return cl::sycl::fmin(x,y);
@@ -976,6 +972,19 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
return fminf(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
{
return fmin(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
{
return fminl(x, y);
}

template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -988,6 +997,18 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
{
return fmaxf(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
{
return fmax(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
{
return fmaxl(x, y);
}
#endif

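The hunks above add device-side specializations of numext::mini/maxi for double and long double (dispatching to fmin/fmax and fminl/fmaxl) alongside the existing float versions, so that min/max work in GPU code where std::min/std::max are unavailable. Callers are unaffected; a hedged usage sketch (clamp is an illustrative helper, not an Eigen function):

    #include <Eigen/Core>

    // Which overload mini/maxi resolve to (std::min, cl::sycl::fmin, fminf, fmin,
    // fminl, ...) is decided by the preprocessor logic shown above.
    EIGEN_DEVICE_FUNC inline double clamp(double x, double lo, double hi) {
      return Eigen::numext::mini(Eigen::numext::maxi(x, lo), hi);
    }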
@@ -67,7 +67,7 @@ T generic_fast_tanh_float(const T& a_x)
}

template<typename RealScalar>
EIGEN_STRONG_INLINE
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
{
EIGEN_USING_STD_MATH(sqrt);
@@ -82,7 +82,8 @@ template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
static EIGEN_DEVICE_FUNC
inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
return positive_real_hypot<RealScalar>(abs(x), abs(y));
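positive_real_hypot, now usable on the device as well, computes sqrt(x^2 + y^2) while avoiding overflow and underflow by factoring out the larger magnitude. A standalone sketch of that idea (safe_hypot is an illustrative name; Eigen's exact handling of corner cases may differ):

    #include <algorithm>
    #include <cmath>

    // sqrt(x^2 + y^2) == a * sqrt(1 + (b/a)^2) with a = max(|x|,|y|), b = min(|x|,|y|),
    // so the squared term never exceeds 1 and cannot overflow.
    double safe_hypot(double x, double y) {
      const double ax = std::abs(x), ay = std::abs(y);
      const double a = std::max(ax, ay), b = std::min(ax, ay);
      return a == 0.0 ? 0.0 : a * std::sqrt(1.0 + (b / a) * (b / a));
    }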
@@ -328,6 +328,7 @@ template<typename Derived> class MatrixBase

inline const PartialPivLU<PlainObject> lu() const;

EIGEN_DEVICE_FUNC
inline const Inverse<Derived> inverse() const;

template<typename ResultType>
@@ -337,12 +338,15 @@ template<typename Derived> class MatrixBase
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;

template<typename ResultType>
inline void computeInverseWithCheck(
ResultType& inverse,
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;

EIGEN_DEVICE_FUNC
Scalar determinant() const;

/////////// Cholesky module ///////////
@@ -414,15 +418,19 @@ template<typename Derived> class MatrixBase

////////// Householder module ///////////

EIGEN_DEVICE_FUNC
void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void makeHouseholder(EssentialPart& essential,
Scalar& tau, RealScalar& beta) const;
template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void applyHouseholderOnTheLeft(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void applyHouseholderOnTheRight(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
@@ -21,12 +21,14 @@ template< typename T,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits10_impl
{
EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits10; }
};

template<typename T>
struct default_digits10_impl<T,false,false> // Floating point
{
EIGEN_DEVICE_FUNC
static int run() {
using std::log10;
using std::ceil;
@@ -38,6 +40,7 @@ struct default_digits10_impl<T,false,false> // Floating point
template<typename T>
struct default_digits10_impl<T,false,true> // Integer
{
EIGEN_DEVICE_FUNC
static int run() { return 0; }
};

@@ -49,12 +52,14 @@ template< typename T,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits_impl
{
EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits; }
};

template<typename T>
struct default_digits_impl<T,false,false> // Floating point
{
EIGEN_DEVICE_FUNC
static int run() {
using std::log;
using std::ceil;
@@ -66,6 +71,7 @@ struct default_digits_impl<T,false,false> // Floating point
template<typename T>
struct default_digits_impl<T,false,true> // Integer
{
EIGEN_DEVICE_FUNC
static int run() { return 0; }
};

@@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
#endif

/** \returns the number of rows */
inline Index rows() const { return Index(indices().size()); }
inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }

/** \returns the number of columns */
inline Index cols() const { return Index(indices().size()); }
inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }

/** \returns the size of a side of the respective square matrix, i.e., the number of indices */
inline Index size() const { return Index(indices().size()); }
inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }

#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
@@ -127,7 +127,7 @@ public:
using Base::derived;
typedef typename Base::Scalar Scalar;

EIGEN_STRONG_INLINE operator const Scalar() const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
{
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
}
@@ -272,7 +272,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
evaluator<Rhs> rhsEval(rhs);
typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols();
@@ -285,7 +285,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{
evaluator<Lhs> lhsEval(lhs);
typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows();
@@ -411,6 +411,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}

// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
//   dst {,+,-}= s * (A.lazyProduct(B))
// This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
// For them, this strategy is also faster than simply by-passing the heap allocation through
// stack allocation.
// For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
// and the behavior also depends a lot on the compiler... so let's be conservative and enable this for dynamic sizes only,
// that is, when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
}

// Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
// overload more specialized.
template<typename Dst, typename LhsT, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
}

// template<typename Dst>
//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
//   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
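The new eval_dynamic overload implements the rewrite described in the comment: when the left factor is itself a scalar multiple s*A, the scalar is hoisted out and the product evaluated lazily. From the user's side nothing changes; a sketch of what is evaluated internally (illustrate is a made-up function name):

    #include <Eigen/Dense>

    void illustrate(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B,
                    Eigen::MatrixXd& dst, double s) {
      dst.noalias() += (s * A) * B;
      // With the overload above this is assigned as if it were
      //   dst.noalias() += s * (A.lazyProduct(B));
      // avoiding the temporary for (s * A) and its heap allocation.
    }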
@@ -741,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;

template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
static EIGEN_DEVICE_FUNC
void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
}
@@ -785,7 +812,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
Alignment = evaluator<MatrixType>::Alignment
Alignment = evaluator<MatrixType>::Alignment,

AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
};

diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -797,6 +828,9 @@ public:

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
if(AsScalarProduct)
return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
else
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
}
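The new AsScalarProduct flag covers the degenerate diagonal products where the diagonal factor has exactly one coefficient (a fixed 1x1 diagonal, or a dynamic-size diagonal applied to a single row or column); coeff(idx) must then always read diagonal coefficient 0 instead of indexing it with idx. A small illustration of the case it guards (editorial example):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd d(1);                      // dynamic-size diagonal, one entry
      d << 3.0;
      Eigen::RowVectorXd r(4);
      r << 1, 2, 3, 4;
      // 1x1 diagonal times a 1x4 row: linear indexing walks 4 result coefficients,
      // but the diagonal only has coefficient 0 -- the AsScalarProduct branch.
      Eigen::RowVectorXd scaled = d.asDiagonal() * r;
      std::cout << scaled << "\n";               // 3 6 9 12
      return 0;
    }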
@@ -23,22 +23,22 @@ namespace internal {
* Part 1 : the logic deciding a strategy for vectorization and unrolling
***************************************************************************/

template<typename Func, typename Derived>
template<typename Func, typename Evaluator>
struct redux_traits
{
public:
typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
enum {
PacketSize = unpacket_traits<PacketType>::size,
InnerMaxSize = int(Derived::IsRowMajor)
? Derived::MaxColsAtCompileTime
: Derived::MaxRowsAtCompileTime
InnerMaxSize = int(Evaluator::IsRowMajor)
? Evaluator::MaxColsAtCompileTime
: Evaluator::MaxRowsAtCompileTime
};

enum {
MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
&& (functor_traits<Func>::PacketAccess),
MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
};

@@ -51,8 +51,8 @@ public:

public:
enum {
Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
: Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
: Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
};

@@ -64,9 +64,9 @@ public:
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
std::cerr.setf(std::ios::hex, std::ios::basefield);
EIGEN_DEBUG_VAR(Derived::Flags)
EIGEN_DEBUG_VAR(Evaluator::Flags)
std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
@@ -87,88 +87,88 @@ public:

/*** no vectorization ***/

template<typename Func, typename Derived, int Start, int Length>
template<typename Func, typename Evaluator, int Start, int Length>
struct redux_novec_unroller
{
enum {
HalfLength = Length/2
};

typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;

EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
{
return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
}
};

template<typename Func, typename Derived, int Start>
struct redux_novec_unroller<Func, Derived, Start, 1>
template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Evaluator, Start, 1>
{
enum {
outer = Start / Derived::InnerSizeAtCompileTime,
inner = Start % Derived::InnerSizeAtCompileTime
outer = Start / Evaluator::InnerSizeAtCompileTime,
inner = Start % Evaluator::InnerSizeAtCompileTime
};

typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;

EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
{
return mat.coeffByOuterInner(outer, inner);
return eval.coeffByOuterInner(outer, inner);
}
};

// This is actually dead code and will never be called. It is required
// to prevent false warnings regarding failed inlining though
// for 0 length run() will never be called at all.
template<typename Func, typename Derived, int Start>
struct redux_novec_unroller<Func, Derived, Start, 0>
template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Evaluator, Start, 0>
{
typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
};

/*** vectorization ***/

template<typename Func, typename Derived, int Start, int Length>
template<typename Func, typename Evaluator, int Start, int Length>
struct redux_vec_unroller
{
enum {
PacketSize = redux_traits<Func, Derived>::PacketSize,
PacketSize = redux_traits<Func, Evaluator>::PacketSize,
HalfLength = Length/2
};

typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
{
return func.packetOp(
redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
}
};

template<typename Func, typename Derived, int Start>
struct redux_vec_unroller<Func, Derived, Start, 1>
template<typename Func, typename Evaluator, int Start>
struct redux_vec_unroller<Func, Evaluator, Start, 1>
{
enum {
index = Start * redux_traits<Func, Derived>::PacketSize,
outer = index / int(Derived::InnerSizeAtCompileTime),
inner = index % int(Derived::InnerSizeAtCompileTime),
alignment = Derived::Alignment
index = Start * redux_traits<Func, Evaluator>::PacketSize,
outer = index / int(Evaluator::InnerSizeAtCompileTime),
inner = index % int(Evaluator::InnerSizeAtCompileTime),
alignment = Evaluator::Alignment
};

typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
{
return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
}
};

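Both unrollers (now parameterized on the evaluator instead of the expression type) unroll a compile-time-length reduction by recursive halving: run<Start,Length> combines run<Start,Length/2> with run<Start+Length/2,Length-Length/2>, bottoming out at Length==1. Reducing four coefficients with a sum functor therefore expands to func(func(c0,c1), func(c2,c3)). A standalone sketch of the same pattern, independent of Eigen's types (unroll_sum is a made-up name):

    #include <array>
    #include <cstddef>

    template <std::size_t Start, std::size_t Length>
    struct unroll_sum {
      template <typename T, std::size_t N>
      static T run(const std::array<T, N>& a) {
        constexpr std::size_t Half = Length / 2;
        return unroll_sum<Start, Half>::run(a)
             + unroll_sum<Start + Half, Length - Half>::run(a);
      }
    };

    template <std::size_t Start>
    struct unroll_sum<Start, 1> {   // base case: a single coefficient
      template <typename T, std::size_t N>
      static T run(const std::array<T, N>& a) { return a[Start]; }
    };

    // unroll_sum<0, 4>::run(values) expands at compile time to (v0 + v1) + (v2 + v3).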
@@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
* Part 3 : implementation of all cases
***************************************************************************/

template<typename Func, typename Derived,
int Traversal = redux_traits<Func, Derived>::Traversal,
int Unrolling = redux_traits<Func, Derived>::Unrolling
template<typename Func, typename Evaluator,
int Traversal = redux_traits<Func, Evaluator>::Traversal,
int Unrolling = redux_traits<Func, Evaluator>::Unrolling
>
struct redux_impl;

template<typename Func, typename Derived>
struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
{
typedef typename Derived::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
typedef typename Evaluator::Scalar Scalar;

template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
Scalar res;
res = mat.coeffByOuterInner(0, 0);
for(Index i = 1; i < mat.innerSize(); ++i)
res = func(res, mat.coeffByOuterInner(0, i));
for(Index i = 1; i < mat.outerSize(); ++i)
for(Index j = 0; j < mat.innerSize(); ++j)
res = func(res, mat.coeffByOuterInner(i, j));
res = eval.coeffByOuterInner(0, 0);
for(Index i = 1; i < xpr.innerSize(); ++i)
res = func(res, eval.coeffByOuterInner(0, i));
for(Index i = 1; i < xpr.outerSize(); ++i)
for(Index j = 0; j < xpr.innerSize(); ++j)
res = func(res, eval.coeffByOuterInner(i, j));
return res;
}
};

template<typename Func, typename Derived>
struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
: public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
{};

template<typename Func, typename Derived>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
template<typename Func, typename Evaluator>
struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
: redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
{
typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;

static Scalar run(const Derived &mat, const Func& func)
typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
typedef typename Evaluator::Scalar Scalar;
template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
{
const Index size = mat.size();
return Base::run(eval,func);
}
};

const Index packetSize = redux_traits<Func, Derived>::PacketSize;
template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
{
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

template<typename XprType>
static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
const Index size = xpr.size();

const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
enum {
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
};
const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
const Index alignedStart = internal::first_default_aligned(xpr);
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
Scalar res;
if(alignedSize)
{
PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
{
PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
{
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
}

packet_res0 = func.packetOp(packet_res0,packet_res1);
if(alignedEnd>alignedEnd2)
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
}
res = func.predux(packet_res0);

for(Index index = 0; index < alignedStart; ++index)
res = func(res,mat.coeff(index));
res = func(res,eval.coeff(index));

for(Index index = alignedEnd; index < size; ++index)
res = func(res,mat.coeff(index));
res = func(res,eval.coeff(index));
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
{
res = mat.coeff(0);
res = eval.coeff(0);
for(Index index = 1; index < size; ++index)
res = func(res,mat.coeff(index));
res = func(res,eval.coeff(index));
}

return res;
@@ -265,130 +277,105 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
};

// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
template<typename Func, typename Derived, int Unrolling>
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
template<typename Func, typename Evaluator, int Unrolling>
struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
{
typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketType;
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;

EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
template<typename XprType>
EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
const Index innerSize = mat.innerSize();
const Index outerSize = mat.outerSize();
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
const Index innerSize = xpr.innerSize();
const Index outerSize = xpr.outerSize();
enum {
packetSize = redux_traits<Func, Derived>::PacketSize
packetSize = redux_traits<Func, Evaluator>::PacketSize
};
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
Scalar res;
if(packetedInnerSize)
{
PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
for(Index j=0; j<outerSize; ++j)
for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));

res = func.predux(packet_res);
for(Index j=0; j<outerSize; ++j)
for(Index i=packetedInnerSize; i<innerSize; ++i)
res = func(res, mat.coeffByOuterInner(j,i));
res = func(res, eval.coeffByOuterInner(j,i));
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
{
res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
}

return res;
}
};

template<typename Func, typename Derived>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
{
typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;

typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
enum {
PacketSize = redux_traits<Func, Derived>::PacketSize,
Size = Derived::SizeAtCompileTime,
PacketSize = redux_traits<Func, Evaluator>::PacketSize,
Size = Evaluator::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)

template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
if (VectorizedSize > 0) {
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
if (VectorizedSize != Size)
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
return res;
}
else {
return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
}
}
};

// evaluator adaptor
template<typename _XprType>
class redux_evaluator
class redux_evaluator : public internal::evaluator<_XprType>
{
typedef internal::evaluator<_XprType> Base;
public:
typedef _XprType XprType;
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}

typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
typedef typename XprType::PacketReturnType PacketReturnType;

enum {
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
Flags = Base::Flags & ~DirectAccessBit,
IsRowMajor = XprType::IsRowMajor,
SizeAtCompileTime = XprType::SizeAtCompileTime,
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
CoeffReadCost = evaluator<XprType>::CoeffReadCost,
Alignment = evaluator<XprType>::Alignment
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
};

EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }

EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index row, Index col) const
{ return m_evaluator.coeff(row, col); }

EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index index) const
{ return m_evaluator.coeff(index); }

template<int LoadMode, typename PacketType>
PacketType packet(Index row, Index col) const
{ return m_evaluator.template packet<LoadMode,PacketType>(row, col); }

template<int LoadMode, typename PacketType>
PacketType packet(Index index) const
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }

EIGEN_DEVICE_FUNC
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
{ return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

template<int LoadMode, typename PacketType>
PacketType packetByOuterInner(Index outer, Index inner) const
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
{ return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

const XprType & nestedExpression() const { return m_xpr; }

protected:
internal::evaluator<XprType> m_evaluator;
const XprType &m_xpr;
};

} // end namespace internal
@@ -415,7 +402,9 @@ DenseBase<Derived>::redux(const Func& func) const
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());

return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
// The initial expression is passed to the reducer as an additional argument instead of
// passing it as a member of redux_evaluator to help
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
}

/** \returns the minimum of all coefficients of \c *this.
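After this refactoring redux_impl::run takes three arguments: the evaluator (coefficient and packet access), the functor, and the original expression, which is only consulted for its sizes now that redux_evaluator no longer stores a reference to it. Nothing changes at the API level; every coefficient-wise reduction still funnels through DenseBase::redux, e.g.:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix3f m = Eigen::Matrix3f::Constant(2.0f);
      // sum(), prod(), minCoeff(), maxCoeff() all call redux() internally with the
      // matching functor; the evaluator/expression split above is purely internal.
      std::cout << m.sum() << "\n";       // 18
      std::cout << m.maxCoeff() << "\n";  // 2
      return 0;
    }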
@@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
nestedExpression() { return m_matrix; }

/** \internal */
EIGEN_DEVICE_FUNC
void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows);
}
@@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
inline Index innerStride() const { return derived().innerStride(); }

// dummy resize function
EIGEN_DEVICE_FUNC
void resize(Index rows, Index cols)
{
EIGEN_UNUSED_VARIABLE(rows);
@@ -716,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
{
typedef TriangularView<MatrixType,Mode> XprType;
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
EIGEN_DEVICE_FUNC
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
};

@@ -1054,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {

template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
return vec_sel(elsePacket, thenPacket, mask);
}
#endif // __VSX__
@@ -497,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 0,
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -549,6 +549,21 @@ template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet1
_mm256_storeu_si256((__m256i*)to, from.x);
}

template<> EIGEN_STRONG_INLINE Packet16h
ploaddup<Packet16h>(const Eigen::half* from) {
Packet16h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
unsigned short e = from[4].x;
unsigned short f = from[5].x;
unsigned short g = from[6].x;
unsigned short h = from[7].x;
result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
return result;
}

template<> EIGEN_STRONG_INLINE Packet16h
ploadquad(const Eigen::half* from) {
Packet16h result;
@@ -621,6 +636,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}

template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
// FIXME we could do that with bit manipulation
Packet16f af = half2float(a);
Packet16f rf = pnegate(af);
return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
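The new half-precision packet ops here and below (pnegate, padd, psub, pmul, predux_mul, preduxp) all follow the same pattern: widen the halves to float packets, apply the float operation, convert back. The FIXME notes that negation could instead flip the sign bit of each 16-bit half directly; a scalar sketch of that alternative (negate_half and half_bits are illustrative names, and this is not what the committed code does):

    #include <cstdint>

    struct half_bits { std::uint16_t x; };    // stand-in for Eigen::half's raw storage

    inline half_bits negate_half(half_bits a) {
      a.x ^= std::uint16_t(0x8000);           // bit 15 is the IEEE-754 half sign bit
      return a;
    }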
@ -628,6 +650,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
||||
Packet16f af = half2float(a);
|
||||
Packet16f bf = half2float(b);
|
||||
Packet16f rf = psub(af, bf);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
||||
Packet16f af = half2float(a);
|
||||
Packet16f bf = half2float(b);
|
||||
@ -640,6 +669,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
|
||||
return half(predux(from_float));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
|
||||
Packet16f from_float = half2float(from);
|
||||
return half(predux_mul(from_float));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
|
||||
Packet16f pf[16];
|
||||
pf[0] = half2float(p[0]);
|
||||
pf[1] = half2float(p[1]);
|
||||
pf[2] = half2float(p[2]);
|
||||
pf[3] = half2float(p[3]);
|
||||
pf[4] = half2float(p[4]);
|
||||
pf[5] = half2float(p[5]);
|
||||
pf[6] = half2float(p[6]);
|
||||
pf[7] = half2float(p[7]);
|
||||
pf[8] = half2float(p[8]);
|
||||
pf[9] = half2float(p[9]);
|
||||
pf[10] = half2float(p[10]);
|
||||
pf[11] = half2float(p[11]);
|
||||
pf[12] = half2float(p[12]);
|
||||
pf[13] = half2float(p[13]);
|
||||
pf[14] = half2float(p[14]);
|
||||
pf[15] = half2float(p[15]);
|
||||
Packet16f reduced = preduxp<Packet16f>(pf);
|
||||
return float2half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
|
||||
{
|
||||
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
|
||||
Packet16h res;
|
||||
res.x = _mm256_insertf128_si256(
|
||||
_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
|
||||
_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
|
||||
{
|
||||
Packet16h res;
|
||||
res.x = _mm256_insert_epi16(a.x,b.x,0);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
|
||||
{
|
||||
Packet16h res;
|
||||
res.x = _mm256_insert_epi16(a.x,b.x,15);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
|
||||
{
|
||||
Packet16h result;
|
||||
@ -747,20 +827,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
|
||||
|
||||
// NOTE: no unpacklo/hi instr in this case, so using permute instr.
|
||||
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
|
||||
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
|
||||
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
|
||||
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
|
||||
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
|
||||
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
|
||||
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
|
||||
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
|
||||
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
|
||||
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
|
||||
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
|
||||
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
|
||||
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
|
||||
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
|
||||
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
|
||||
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
|
||||
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
|
||||
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
|
||||
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
|
||||
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
|
||||
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
|
||||
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
|
||||
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
|
||||
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
|
||||
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
|
||||
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
|
||||
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
|
||||
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
|
||||
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
|
||||
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
|
||||
|
||||
kernel.packet[0].x = a_p_0;
|
||||
@ -865,10 +945,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
|
||||
AlignedOnScalar = 1,
|
||||
size = 8,
|
||||
HasHalfPacket = 0,
|
||||
HasAdd = 0,
|
||||
HasSub = 0,
|
||||
HasMul = 0,
|
||||
HasNegate = 0,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
@@ -917,6 +997,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
}

template<> EIGEN_STRONG_INLINE Packet8h
ploaddup<Packet8h>(const Eigen::half* from) {
Packet8h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
return result;
}
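// Editor's sketch (illustration, not part of the upstream patch): what ploaddup
// yields for half precision. It assumes a build where Packet8h is the active half
// packet type (e.g. SSE/AVX with F16C) and, as in the code above, that its raw
// __m128i member is named x.
inline void ploaddup_half_demo() {
  const Eigen::half in[4] = {Eigen::half(1.f), Eigen::half(2.f), Eigen::half(3.f), Eigen::half(4.f)};
  Eigen::internal::Packet8h p = Eigen::internal::ploaddup<Eigen::internal::Packet8h>(in);
  Eigen::half out[8];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), p.x);
  // out now holds {1,1,2,2,3,3,4,4}: each of the four source halves is duplicated once.
}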
|
||||
template<> EIGEN_STRONG_INLINE Packet8h
|
||||
ploadquad<Packet8h>(const Eigen::half* from) {
|
||||
Packet8h result;
|
||||
@ -970,6 +1061,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
|
||||
// FIXME we could do that with bit manipulation
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f rf = pnegate(af);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
@@ -977,6 +1075,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
Packet8f rf = psub(af, bf);
return float2half(rf);
}
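// Editor's note (illustration, not from the patch): the Packet8h arithmetic ops in
// this hunk all follow the same widen-compute-narrow pattern: convert to Packet8f
// with half2float, run the float op, convert back with float2half. A hypothetical
// element-wise op would take the same shape, e.g.:
//
//   template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
//     Packet8f af = half2float(a);
//     Packet8f bf = half2float(b);
//     Packet8f rf = pdiv(af, bf);
//     return float2half(rf);
//   }
//
// The cost is a pair of conversions per operand and result, which is acceptable when
// data is kept in half mainly for storage and bandwidth reasons.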
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
@ -1029,6 +1134,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
|
||||
return Eigen::half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
|
||||
Packet8f pf[8];
|
||||
pf[0] = half2float(p[0]);
|
||||
pf[1] = half2float(p[1]);
|
||||
pf[2] = half2float(p[2]);
|
||||
pf[3] = half2float(p[3]);
|
||||
pf[4] = half2float(p[4]);
|
||||
pf[5] = half2float(p[5]);
|
||||
pf[6] = half2float(p[6]);
|
||||
pf[7] = half2float(p[7]);
|
||||
Packet8f reduced = preduxp<Packet8f>(pf);
|
||||
return float2half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
|
||||
{
|
||||
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
|
||||
Packet8h res;
|
||||
res.x = _mm_shuffle_epi8(a.x,m);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
|
||||
{
|
||||
Packet8h res;
|
||||
res.x = _mm_insert_epi16(a.x,int(b.x),0);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
|
||||
{
|
||||
Packet8h res;
|
||||
res.x = _mm_insert_epi16(a.x,int(b.x),7);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8h>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
|
||||
{
|
||||
if (Offset!=0)
|
||||
first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<Packet8h,8>& kernel) {
|
||||
__m128i a = kernel.packet[0].x;
|
||||
|
@@ -1526,10 +1526,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// The following piece of code won't work for 512 bit registers
// Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type.
typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress <= 8) &&
(SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
(SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
{
SAccPacket C0, C1, C2, C3;
straits.initAcc(C0);
|
@@ -431,10 +431,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
// to determine the following heuristic.
// EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
// unless it has been specialized by the user or for a given architecture.
// Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
// Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
// I'm not sure it is still required.
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::evalTo(dst, lhs, rhs);
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
else
{
dst.setZero();
@ -446,7 +446,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||
lazyproduct::addTo(dst, lhs, rhs);
|
||||
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
|
||||
else
|
||||
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
|
||||
}
|
||||
@ -455,7 +455,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||
lazyproduct::subTo(dst, lhs, rhs);
|
||||
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
|
||||
else
|
||||
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
|
||||
}
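// Editor's note (illustration, not from the patch): EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
// is a compile-time knob. As the comment above says, it defaults to 20 in
// GeneralProduct.h unless the user or an architecture-specific header overrides it, e.g.
//
//   #define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 32   // must precede any Eigen include
//   #include <Eigen/Dense>
//
// Products for which rhs.rows()+dst.rows()+dst.cols() stays below the threshold then
// take the coefficient-based path via lazyproduct::eval_dynamic instead of the GEMM kernel.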
@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
|
||||
struct selfadjoint_matrix_vector_product
|
||||
|
||||
{
|
||||
static EIGEN_DONT_INLINE void run(
|
||||
static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
|
||||
void run(
|
||||
Index size,
|
||||
const Scalar* lhs, Index lhsStride,
|
||||
const Scalar* rhs,
|
||||
@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run(
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
|
||||
EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
|
||||
EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
|
||||
void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
|
||||
Index size,
|
||||
const Scalar* lhs, Index lhsStride,
|
||||
const Scalar* rhs,
|
||||
@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
|
||||
|
||||
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
|
||||
|
||||
|
||||
Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
|
||||
Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
|
||||
if (FirstTriangular)
|
||||
bound = size - bound;
|
||||
|
||||
@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
|
||||
enum { LhsUpLo = LhsMode&(Upper|Lower) };
|
||||
|
||||
template<typename Dest>
|
||||
static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
|
||||
static EIGEN_DEVICE_FUNC
|
||||
void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
|
||||
{
|
||||
typedef typename Dest::Scalar ResScalar;
|
||||
typedef typename Rhs::Scalar RhsScalar;
|
||||
|
@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
|
||||
template<typename Scalar, typename Index, typename UType, typename VType>
|
||||
struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
|
||||
{
|
||||
static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
|
||||
static EIGEN_DEVICE_FUNC
|
||||
void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
|
||||
{
|
||||
const Index size = u.size();
|
||||
for (Index i=0; i<size; ++i)
|
||||
|
@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
{
|
||||
// let's directly call the low level product function because:
|
||||
// 1 - it is faster to compile
|
||||
// 2 - it is slighlty faster at runtime
|
||||
// 2 - it is slightly faster at runtime
|
||||
Index startRow = IsLower ? pi : pi-actualPanelWidth;
|
||||
Index startCol = IsLower ? 0 : pi;
|
||||
|
||||
@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
if (k>0)
|
||||
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
|
||||
|
||||
if(!(Mode & UnitDiag))
|
||||
if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
|
||||
rhs[i] /= cjLhs(i,i);
|
||||
}
|
||||
}
|
||||
@ -114,6 +114,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
for(Index k=0; k<actualPanelWidth; ++k)
|
||||
{
|
||||
Index i = IsLower ? pi+k : pi-k-1;
|
||||
if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
|
||||
{
|
||||
if(!(Mode & UnitDiag))
|
||||
rhs[i] /= cjLhs.coeff(i,i);
|
||||
|
||||
@ -122,12 +124,13 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
if (r>0)
|
||||
Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
|
||||
}
|
||||
}
|
||||
Index r = IsLower ? size - endBlock : startBlock; // remaining size
|
||||
if (r > 0)
|
||||
{
|
||||
// let's directly call the low level product function because:
|
||||
// 1 - it is faster to compile
|
||||
// 2 - it is slighlty faster at runtime
|
||||
// 2 - it is slightly faster at runtime
|
||||
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
|
||||
r, actualPanelWidth,
|
||||
LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
|
||||
|
@ -289,8 +289,8 @@ template<typename XprType> struct blas_traits
|
||||
ExtractType,
|
||||
typename _ExtractType::PlainObject
|
||||
>::type DirectLinearAccessType;
|
||||
static inline ExtractType extract(const XprType& x) { return x; }
|
||||
static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
|
||||
static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
|
||||
static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
|
||||
};
|
||||
|
||||
// pop conjugate
|
||||
@ -318,8 +318,8 @@ struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp
|
||||
typedef blas_traits<NestedXpr> Base;
|
||||
typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
|
||||
typedef typename Base::ExtractType ExtractType;
|
||||
static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
|
||||
static inline Scalar extractScalarFactor(const XprType& x)
|
||||
static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
|
||||
static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
|
||||
{ return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
|
||||
};
|
||||
template<typename Scalar, typename NestedXpr, typename Plain>
|
||||
|
@@ -192,7 +192,7 @@ inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
// The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
// This way a code like fix<N> can only refer to the previous function.
template<int N,typename T>
inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(val); }
inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
#endif
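// Editor's note (illustration, not from the patch): fix<N>(val) carries a compile-time
// length together with its runtime value; the convert_index<int> call above merely
// narrows the runtime argument. Typical use with the symbolic-index API:
//
//   Eigen::MatrixXd A(10, 10);
//   auto topLeft = A(Eigen::seqN(0, Eigen::fix<3>), Eigen::seqN(0, Eigen::fix<4>(4)));
//   // fix<4>(4): the length is known at compile time and the matching runtime value
//   // is supplied as well, which is handy in generic code where N may be Dynamic.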
|
||||
#else // EIGEN_PARSED_BY_DOXYGEN
|
||||
|
@@ -580,7 +580,7 @@ template<typename T> struct smart_memmove_helper<T,false> {

// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
// to the appropriate stack allocation function
#ifndef EIGEN_ALLOCA
#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH
#if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
#define EIGEN_ALLOCA alloca
#elif EIGEN_COMP_MSVC
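// Editor's note (illustration, not from the patch): on a toolchain where alloca is
// spelled differently, the hook above can be redirected before Eigen is included, e.g.
//
//   #define EIGEN_ALLOCA __builtin_alloca   // assumption: GCC/Clang-style builtin
//   #include <Eigen/Core>
//
// If EIGEN_ALLOCA ends up undefined, the stack fast path is simply skipped and the
// ei_declare_* macros below fall back to aligned_malloc/aligned_free.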
@ -599,12 +599,14 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
|
||||
* In this case, the buffer elements will also be destructed when this handler will be destructed.
|
||||
* Finally, if \a dealloc is true, then the pointer \a ptr is freed.
|
||||
**/
|
||||
EIGEN_DEVICE_FUNC
|
||||
aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
|
||||
: m_ptr(ptr), m_size(size), m_deallocate(dealloc)
|
||||
{
|
||||
if(NumTraits<T>::RequireInitialization && m_ptr)
|
||||
Eigen::internal::construct_elements_of_array(m_ptr, size);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
~aligned_stack_memory_handler()
|
||||
{
|
||||
if(NumTraits<T>::RequireInitialization && m_ptr)
|
||||
@ -618,6 +620,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
|
||||
bool m_deallocate;
|
||||
};
|
||||
|
||||
#ifdef EIGEN_ALLOCA
|
||||
|
||||
template<typename Xpr, int NbEvaluations,
|
||||
bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
|
||||
>
|
||||
struct local_nested_eval_wrapper
|
||||
{
|
||||
static const bool NeedExternalBuffer = false;
|
||||
typedef typename Xpr::Scalar Scalar;
|
||||
typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
|
||||
ObjectType object;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
|
||||
{
|
||||
EIGEN_UNUSED_VARIABLE(ptr);
|
||||
eigen_internal_assert(ptr==0);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Xpr, int NbEvaluations>
|
||||
struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
|
||||
{
|
||||
static const bool NeedExternalBuffer = true;
|
||||
typedef typename Xpr::Scalar Scalar;
|
||||
typedef typename plain_object_eval<Xpr>::type PlainObject;
|
||||
typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
|
||||
ObjectType object;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
|
||||
: object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
|
||||
m_deallocate(ptr==0)
|
||||
{
|
||||
if(NumTraits<Scalar>::RequireInitialization && object.data())
|
||||
Eigen::internal::construct_elements_of_array(object.data(), object.size());
|
||||
object = xpr;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
~local_nested_eval_wrapper()
|
||||
{
|
||||
if(NumTraits<Scalar>::RequireInitialization && object.data())
|
||||
Eigen::internal::destruct_elements_of_array(object.data(), object.size());
|
||||
if(m_deallocate)
|
||||
Eigen::internal::aligned_free(object.data());
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deallocate;
|
||||
};
|
||||
|
||||
#endif // EIGEN_ALLOCA
|
||||
|
||||
template<typename T> class scoped_array : noncopyable
|
||||
{
|
||||
T* m_ptr;
|
||||
@@ -645,9 +701,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
} // end namespace internal

/** \internal
* Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
* if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
* (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
*
* The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
* and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
* if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
* (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
* The allocated buffer is automatically deleted when exiting the scope of this declaration.
* If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
* Here is an example:
@@ -658,6 +716,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
* }
* \endcode
* The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token.
*
* The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogous to
* \code
* typename internal::nested_eval<XPR_T,N>::type NAME(XPR);
* \endcode
* with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
* This is accomplished through alloca if the latter is supported and if the required number of bytes
* is below EIGEN_STACK_ALLOCATION_LIMIT.
*/
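// Editor's note (illustrative usage sketch, not from the patch; n, buf, XprType, xpr
// and xpr_eval are placeholder names):
//
//   void example(Eigen::Index n) {
//     // n floats, built on the stack when n*sizeof(float) is under
//     // EIGEN_STACK_ALLOCATION_LIMIT (and EIGEN_ALLOCA is available),
//     // otherwise heap-allocated; either way released at end of scope.
//     ei_declare_aligned_stack_constructed_variable(float, buf, n, 0);
//
//     // Evaluate an expression into a local, reusing the same stack fast path:
//     // ei_declare_local_nested_eval(XprType, xpr, 1, xpr_eval);
//   }
//
// Passing a non-null BUFFER instead of 0 makes NAME an alias for that buffer and
// skips allocation entirely.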
#ifdef EIGEN_ALLOCA
|
||||
|
||||
@ -677,6 +743,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
|
||||
: Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \
|
||||
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
|
||||
|
||||
|
||||
#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
|
||||
Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
|
||||
( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
|
||||
? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
|
||||
typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
|
||||
|
||||
#else
|
||||
|
||||
#define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
|
||||
@ -684,6 +757,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
|
||||
TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
|
||||
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
|
||||
|
||||
|
||||
#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
@@ -583,6 +583,7 @@ using std::numeric_limits;
// Integer division with rounding up.
// T is assumed to be an integer type with a>=0, and b>0
template<typename T>
EIGEN_DEVICE_FUNC
T div_ceil(const T &a, const T &b)
{
return (a+b-1) / b;
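// Editor's note (worked example, not from the patch):
//   div_ceil(7, 3) == (7+3-1)/3 == 3   // the smallest q with q*3 >= 7
//   div_ceil(6, 3) == (6+3-1)/3 == 2   // exact divisions are unchanged
// The a>=0, b>0 precondition stated above is what makes the (a+b-1) adjustment well
// defined here (no division by zero and no negative operands).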
@ -593,7 +594,7 @@ T div_ceil(const T &a, const T &b)
|
||||
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool equal_strict(const X& x,const Y& y) { return x == y; }
|
||||
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
|
||||
|
||||
@ -604,7 +605,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to<double
|
||||
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool not_equal_strict(const X& x,const Y& y) { return x != y; }
|
||||
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
|
||||
|
||||
|
@ -460,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
|
||||
{
|
||||
enum {
|
||||
ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
|
||||
CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory?
|
||||
CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary?
|
||||
// Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
|
||||
// This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
|
||||
// for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
|
||||
@@ -676,17 +676,32 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
{ enum { ret = true }; };


template<typename T> struct is_identity
{ enum { value = false }; };

template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
{ enum { value = true }; };


template<typename S1, typename S2> struct glue_shapes;
template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };

template<typename T1, typename T2>
bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
struct possibly_same_dense {
enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
};

template<typename T1, typename T2>
EIGEN_DEVICE_FUNC
bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
{
return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
}

template<typename T1, typename T2>
bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
EIGEN_DEVICE_FUNC
bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
{
return false;
}
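// Editor's note (illustration, not from the patch; the dedicated is_same_dense test
// further below exercises the same behaviour):
//
//   Eigen::MatrixXd m(10,10);
//   Eigen::Ref<Eigen::MatrixXd> r(m);
//   internal::is_same_dense(m, r);                  // true: same data pointer and strides
//   internal::is_same_dense(m, Eigen::MatrixXd(m)); // false: a copy owns different storage
//
// The new possibly_same_dense guard additionally rejects mixed scalar types at compile
// time, e.g. a double matrix compared against the real part of a complex one.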
|
@@ -271,7 +271,12 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
m_hess.compute(matrix.derived()/scale);

// Step 2. Reduce to real Schur form
computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
// Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
// to be able to pass our working-space buffer for the Householder to Dense evaluation.
m_workspaceVector.resize(matrix.cols());
if(computeU)
m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);

m_matT *= scale;
@ -285,8 +290,8 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
|
||||
|
||||
m_matT = matrixH;
|
||||
m_workspaceVector.resize(m_matT.cols());
|
||||
if(computeU)
|
||||
matrixQ.evalTo(m_matU, m_workspaceVector);
|
||||
if(computeU && !internal::is_same_dense(m_matU,matrixQ))
|
||||
m_matU = matrixQ;
|
||||
|
||||
Index maxIters = m_maxIters;
|
||||
if (maxIters == -1)
|
||||
|
@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver;
|
||||
|
||||
namespace internal {
|
||||
template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
|
||||
|
||||
template<typename MatrixType, typename DiagType, typename SubDiagType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
|
||||
}
|
||||
|
||||
@ -354,8 +356,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
|
||||
static const int m_maxIterations = 30;
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC
|
||||
static void check_template_parameters()
|
||||
static EIGEN_DEVICE_FUNC
|
||||
void check_template_parameters()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
|
||||
}
|
||||
@ -404,7 +406,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
|
||||
|
||||
const InputType &matrix(a_matrix.derived());
|
||||
|
||||
using std::abs;
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
eigen_assert(matrix.cols() == matrix.rows());
|
||||
eigen_assert((options&~(EigVecMask|GenEigMask))==0
|
||||
&& (options&EigVecMask)!=EigVecMask
|
||||
@ -480,9 +482,10 @@ namespace internal {
|
||||
* \returns \c Success or \c NoConvergence
|
||||
*/
|
||||
template<typename MatrixType, typename DiagType, typename SubDiagType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
|
||||
{
|
||||
using std::abs;
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
|
||||
ComputationInfo info;
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
@ -536,7 +539,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
|
||||
diag.segment(i,n-i).minCoeff(&k);
|
||||
if (k > 0)
|
||||
{
|
||||
std::swap(diag[i], diag[k+i]);
|
||||
numext::swap(diag[i], diag[k+i]);
|
||||
if(computeEigenvectors)
|
||||
eivec.col(i).swap(eivec.col(k+i));
|
||||
}
|
||||
@ -606,7 +609,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
|
||||
{
|
||||
using std::abs;
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
Index i0;
|
||||
// Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
|
||||
mat.diagonal().cwiseAbs().maxCoeff(&i0);
|
||||
@ -808,7 +811,7 @@ template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
|
||||
EIGEN_DEVICE_FUNC
|
||||
static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
|
||||
{
|
||||
using std::abs;
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
|
||||
RealScalar e = subdiag[end-1];
|
||||
// Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
|
||||
|
@ -25,6 +25,7 @@ struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
|
||||
};
|
||||
|
||||
template<typename MatrixType, typename CoeffVectorType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
|
||||
}
|
||||
|
||||
@ -344,6 +345,7 @@ namespace internal {
|
||||
* \sa Tridiagonalization::packedMatrix()
|
||||
*/
|
||||
template<typename MatrixType, typename CoeffVectorType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
|
||||
{
|
||||
using numext::conj;
|
||||
@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector;
|
||||
* \sa class Tridiagonalization
|
||||
*/
|
||||
template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
|
||||
{
|
||||
eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
|
||||
@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector
|
||||
typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
|
||||
typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
|
||||
template<typename DiagonalType, typename SubDiagonalType>
|
||||
static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
|
||||
static EIGEN_DEVICE_FUNC
|
||||
void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
|
||||
{
|
||||
CoeffVectorType hCoeffs(mat.cols()-1);
|
||||
tridiagonalization_inplace(mat,hCoeffs);
|
||||
@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
|
||||
template<typename DiagonalType, typename SubDiagonalType>
|
||||
static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
|
||||
static EIGEN_DEVICE_FUNC
|
||||
void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
|
||||
{
|
||||
diag(0,0) = numext::real(mat(0,0));
|
||||
if(extractQ)
|
||||
|
@ -26,17 +26,17 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
|
||||
static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
|
||||
{
|
||||
Quaternion<float> res;
|
||||
const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
|
||||
__m128 a = _a.coeffs().template packet<AAlignment>(0);
|
||||
__m128 b = _b.coeffs().template packet<BAlignment>(0);
|
||||
__m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
|
||||
__m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
|
||||
const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
|
||||
Packet4f a = _a.coeffs().template packet<AAlignment>(0);
|
||||
Packet4f b = _b.coeffs().template packet<BAlignment>(0);
|
||||
Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
|
||||
Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
|
||||
pstoret<float,Packet4f,ResAlignment>(
|
||||
&res.x(),
|
||||
_mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
|
||||
_mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
|
||||
padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
|
||||
pmul(vec4f_swizzle1(a,2,0,1,0),
|
||||
vec4f_swizzle1(b,1,2,0,0))),
|
||||
_mm_xor_ps(mask,_mm_add_ps(s1,s2))));
|
||||
pxor(mask,padd(s1,s2))));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
|
||||
triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
|
||||
* vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
|
||||
|
||||
// FIXME add .noalias() once the triangular product can work inplace
|
||||
triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
|
||||
// FIXME use the following line with .noalias() once the triangular product can work inplace
|
||||
// triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
|
||||
for(Index j=nbVecs-1; j>i; --j)
|
||||
{
|
||||
typename TriangularFactorType::Scalar z = triFactor(i,j);
|
||||
triFactor(i,j) = z * triFactor(j,j);
|
||||
if(nbVecs-j-1>0)
|
||||
triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
|
||||
}
|
||||
|
||||
}
|
||||
triFactor(i,i) = hCoeffs(i);
|
||||
|
@ -39,6 +39,7 @@ template<int n> struct decrement_size
|
||||
* MatrixBase::applyHouseholderOnTheRight()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
|
||||
{
|
||||
VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
|
||||
@ -62,6 +63,7 @@ void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void MatrixBase<Derived>::makeHouseholder(
|
||||
EssentialPart& essential,
|
||||
Scalar& tau,
|
||||
@ -110,6 +112,7 @@ void MatrixBase<Derived>::makeHouseholder(
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void MatrixBase<Derived>::applyHouseholderOnTheLeft(
|
||||
const EssentialPart& essential,
|
||||
const Scalar& tau,
|
||||
@ -147,6 +150,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void MatrixBase<Derived>::applyHouseholderOnTheRight(
|
||||
const EssentialPart& essential,
|
||||
const Scalar& tau,
|
||||
|
@ -87,7 +87,7 @@ struct hseq_side_dependent_impl
|
||||
{
|
||||
typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
|
||||
typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
|
||||
static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
|
||||
static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
|
||||
{
|
||||
Index start = k+1+h.m_shift;
|
||||
return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
|
||||
@ -173,6 +173,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
*
|
||||
* \sa setLength(), setShift()
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
HouseholderSequence(const VectorsType& v, const CoeffsType& h)
|
||||
: m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()),
|
||||
m_shift(0)
|
||||
@ -180,6 +181,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
}
|
||||
|
||||
/** \brief Copy constructor. */
|
||||
EIGEN_DEVICE_FUNC
|
||||
HouseholderSequence(const HouseholderSequence& other)
|
||||
: m_vectors(other.m_vectors),
|
||||
m_coeffs(other.m_coeffs),
|
||||
@ -193,12 +195,14 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
* \returns Number of rows
|
||||
* \details This equals the dimension of the space that the transformation acts on.
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
|
||||
|
||||
/** \brief Number of columns of transformation viewed as a matrix.
|
||||
* \returns Number of columns
|
||||
* \details This equals the dimension of the space that the transformation acts on.
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
Index cols() const { return rows(); }
|
||||
|
||||
/** \brief Essential part of a Householder vector.
|
||||
@ -215,6 +219,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
*
|
||||
* \sa setShift(), shift()
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
const EssentialVectorType essentialVector(Index k) const
|
||||
{
|
||||
eigen_assert(k >= 0 && k < m_length);
|
||||
@ -252,7 +257,9 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
AdjointReturnType inverse() const { return adjoint(); }
|
||||
|
||||
/** \internal */
|
||||
template<typename DestType> inline void evalTo(DestType& dst) const
|
||||
template<typename DestType>
|
||||
inline EIGEN_DEVICE_FUNC
|
||||
void evalTo(DestType& dst) const
|
||||
{
|
||||
Matrix<Scalar, DestType::RowsAtCompileTime, 1,
|
||||
AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
|
||||
@ -261,6 +268,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
|
||||
/** \internal */
|
||||
template<typename Dest, typename Workspace>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void evalTo(Dest& dst, Workspace& workspace) const
|
||||
{
|
||||
workspace.resize(rows());
|
||||
@ -394,6 +402,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
*
|
||||
* \sa length()
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
HouseholderSequence& setLength(Index length)
|
||||
{
|
||||
m_length = length;
|
||||
@ -411,13 +420,17 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
*
|
||||
* \sa shift()
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
HouseholderSequence& setShift(Index shift)
|
||||
{
|
||||
m_shift = shift;
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */
|
||||
|
||||
/* Necessary for .adjoint() and .conjugate() */
|
||||
|
@ -90,6 +90,7 @@ template<typename Scalar> class JacobiRotation
|
||||
* \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
|
||||
*/
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
|
||||
{
|
||||
using std::sqrt;
|
||||
@ -134,6 +135,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
|
||||
*/
|
||||
template<typename Scalar>
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
|
||||
{
|
||||
return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
|
||||
@ -156,6 +158,7 @@ inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Ind
|
||||
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
|
||||
*/
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* z)
|
||||
{
|
||||
makeGivens(p, q, z, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
|
||||
@ -164,6 +167,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
|
||||
|
||||
// specialization for complexes
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
|
||||
{
|
||||
using std::sqrt;
|
||||
@ -223,6 +227,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
|
||||
|
||||
// specialization for reals
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
|
||||
{
|
||||
using std::sqrt;
|
||||
@ -286,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>&
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
|
||||
{
|
||||
RowXpr x(this->row(p));
|
||||
@ -301,6 +307,7 @@ inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRo
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
|
||||
{
|
||||
ColXpr x(this->col(p));
|
||||
@ -314,7 +321,8 @@ template<typename Scalar, typename OtherScalar,
|
||||
int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
|
||||
struct apply_rotation_in_the_plane_selector
|
||||
{
|
||||
static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
|
||||
static EIGEN_DEVICE_FUNC
|
||||
inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
|
||||
{
|
||||
for(Index i=0; i<size; ++i)
|
||||
{
|
||||
@ -441,6 +449,7 @@ struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime
|
||||
};
|
||||
|
||||
template<typename VectorX, typename VectorY, typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
|
||||
{
|
||||
typedef typename VectorX::Scalar Scalar;
|
||||
|
@ -15,6 +15,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const typename Derived::Scalar bruteforce_det3_helper
|
||||
(const MatrixBase<Derived>& matrix, int a, int b, int c)
|
||||
{
|
||||
@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename Derived::Scalar bruteforce_det4_helper
|
||||
(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
|
||||
{
|
||||
@ -44,7 +46,8 @@ template<typename Derived,
|
||||
|
||||
template<typename Derived> struct determinant_impl<Derived, 1>
|
||||
{
|
||||
static inline typename traits<Derived>::Scalar run(const Derived& m)
|
||||
static inline EIGEN_DEVICE_FUNC
|
||||
typename traits<Derived>::Scalar run(const Derived& m)
|
||||
{
|
||||
return m.coeff(0,0);
|
||||
}
|
||||
@ -52,7 +55,8 @@ template<typename Derived> struct determinant_impl<Derived, 1>
|
||||
|
||||
template<typename Derived> struct determinant_impl<Derived, 2>
|
||||
{
|
||||
static inline typename traits<Derived>::Scalar run(const Derived& m)
|
||||
static inline EIGEN_DEVICE_FUNC
|
||||
typename traits<Derived>::Scalar run(const Derived& m)
|
||||
{
|
||||
return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
|
||||
}
|
||||
@ -60,7 +64,8 @@ template<typename Derived> struct determinant_impl<Derived, 2>
|
||||
|
||||
template<typename Derived> struct determinant_impl<Derived, 3>
|
||||
{
|
||||
static inline typename traits<Derived>::Scalar run(const Derived& m)
|
||||
static inline EIGEN_DEVICE_FUNC
|
||||
typename traits<Derived>::Scalar run(const Derived& m)
|
||||
{
|
||||
return bruteforce_det3_helper(m,0,1,2)
|
||||
- bruteforce_det3_helper(m,1,0,2)
|
||||
@ -70,7 +75,8 @@ template<typename Derived> struct determinant_impl<Derived, 3>
|
||||
|
||||
template<typename Derived> struct determinant_impl<Derived, 4>
|
||||
{
|
||||
static typename traits<Derived>::Scalar run(const Derived& m)
|
||||
static EIGEN_DEVICE_FUNC
|
||||
typename traits<Derived>::Scalar run(const Derived& m)
|
||||
{
|
||||
// trick by Martin Costabel to compute 4x4 det with only 30 muls
|
||||
return bruteforce_det4_helper(m,0,1,2,3)
|
||||
@ -89,6 +95,7 @@ template<typename Derived> struct determinant_impl<Derived, 4>
|
||||
* \returns the determinant of this matrix
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
|
||||
{
|
||||
eigen_assert(rows() == cols());
|
||||
|
@ -290,6 +290,7 @@ template<typename DstXprType, typename XprType>
|
||||
struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
|
||||
{
|
||||
typedef Inverse<XprType> SrcXprType;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
@ -332,6 +333,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
|
||||
* \sa computeInverseAndDetWithCheck()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
|
||||
|
@ -291,7 +291,7 @@ template<typename MatrixQR, typename HCoeffs,
|
||||
bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
|
||||
struct householder_qr_inplace_blocked
|
||||
{
|
||||
// This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
|
||||
// This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
|
||||
static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
|
||||
typename MatrixQR::Scalar* tempData = 0)
|
||||
{
|
||||
|
@ -640,7 +640,8 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
|
||||
// Compute res = Q * other column by column
|
||||
for(Index j = 0; j < res.cols(); j++)
|
||||
{
|
||||
for (Index k = diagSize-1; k >=0; k--)
|
||||
Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
|
||||
for (Index k = start_k; k >=0; k--)
|
||||
{
|
||||
Scalar tau = Scalar(0);
|
||||
tau = m_qr.m_Q.col(k).dot(res.col(j));
|
||||
|
@ -1061,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
|
||||
/// \sa block(Index,Index,NRowsType,NColsType), class Block
|
||||
///
|
||||
template<int NRows, int NCols>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
|
||||
Index blockRows, Index blockCols)
|
||||
{
|
||||
|
@ -162,9 +162,11 @@ template<typename MatrixType> void block(const MatrixType& m)
|
||||
// expressions without direct access
|
||||
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
|
||||
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
|
||||
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
|
||||
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
|
||||
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
|
||||
VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
|
||||
VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
|
||||
|
||||
VERIFY_IS_APPROX( (m1*1).topRows(r1), m1.topRows(r1) );
|
||||
VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) );
|
||||
|
@ -121,7 +121,7 @@ struct diagonal {
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct eigenvalues {
|
||||
struct eigenvalues_direct {
|
||||
EIGEN_DEVICE_FUNC
|
||||
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
|
||||
{
|
||||
@ -136,6 +136,34 @@ struct eigenvalues {
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct eigenvalues {
|
||||
EIGEN_DEVICE_FUNC
|
||||
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
|
||||
{
|
||||
using namespace Eigen;
|
||||
typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
|
||||
T M(in+i);
|
||||
Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
|
||||
T A = M*M.adjoint();
|
||||
SelfAdjointEigenSolver<T> eig;
|
||||
eig.compute(M);
|
||||
res = eig.eigenvalues();
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct matrix_inverse {
|
||||
EIGEN_DEVICE_FUNC
|
||||
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
|
||||
{
|
||||
using namespace Eigen;
|
||||
T M(in+i);
|
||||
Map<T> res(out+i*T::MaxSizeAtCompileTime);
|
||||
res = M.inverse();
|
||||
}
|
||||
};
|
||||
|
||||
void test_cuda_basic()
|
||||
{
|
||||
ei_test_init_cuda();
|
||||
@ -164,7 +192,12 @@ void test_cuda_basic()
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix2f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix2f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix4f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix4f>(), nthreads, in, out) );
|
||||
|
||||
}
|
||||
|
@ -30,6 +30,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
|
||||
v2 = VectorType::Random(rows);
|
||||
RowVectorType rv1 = RowVectorType::Random(cols),
|
||||
rv2 = RowVectorType::Random(cols);
|
||||
|
||||
LeftDiagonalMatrix ldm1(v1), ldm2(v2);
|
||||
RightDiagonalMatrix rdm1(rv1), rdm2(rv2);
|
||||
|
||||
@ -107,6 +108,32 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
|
||||
VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
|
||||
}
|
||||
|
||||
template<typename MatrixType> void as_scalar_product(const MatrixType& m)
|
||||
{
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
|
||||
typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
|
||||
typedef Matrix<Scalar, Dynamic, 1> DynVectorType;
|
||||
typedef Matrix<Scalar, 1, Dynamic> DynRowVectorType;
|
||||
|
||||
Index rows = m.rows();
|
||||
Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
|
||||
|
||||
VectorType v1 = VectorType::Random(rows);
|
||||
DynVectorType dv1 = DynVectorType::Random(depth);
|
||||
DynRowVectorType drv1 = DynRowVectorType::Random(depth);
|
||||
DynMatrixType dm1 = dv1;
|
||||
DynMatrixType drm1 = drv1;
|
||||
|
||||
Scalar s = v1(0);
|
||||
|
||||
VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
|
||||
VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
|
||||
|
||||
VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
|
||||
VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
|
||||
}
|
||||
|
||||
template<int>
|
||||
void bug987()
|
||||
{
|
||||
@ -122,14 +149,19 @@ void test_diagonalmatrices()
|
||||
{
|
||||
for(int i = 0; i < g_repeat; i++) {
|
||||
CALL_SUBTEST_1( diagonalmatrices(Matrix<float, 1, 1>()) );
|
||||
CALL_SUBTEST_1( as_scalar_product(Matrix<float, 1, 1>()) );
|
||||
|
||||
CALL_SUBTEST_2( diagonalmatrices(Matrix3f()) );
|
||||
CALL_SUBTEST_3( diagonalmatrices(Matrix<double,3,3,RowMajor>()) );
|
||||
CALL_SUBTEST_4( diagonalmatrices(Matrix4d()) );
|
||||
CALL_SUBTEST_5( diagonalmatrices(Matrix<float,4,4,RowMajor>()) );
|
||||
CALL_SUBTEST_6( diagonalmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
|
||||
CALL_SUBTEST_6( as_scalar_product(MatrixXcf(1,1)) );
|
||||
CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
|
||||
CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
|
||||
CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
|
||||
CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
|
||||
CALL_SUBTEST_9( as_scalar_product(MatrixXf(1,1)) );
|
||||
}
|
||||
CALL_SUBTEST_10( bug987<0>() );
|
||||
}
|
||||
|
@ -257,13 +257,31 @@ void test_array()
|
||||
ss << a1;
|
||||
}
|
||||
|
||||
void test_product()
|
||||
{
|
||||
typedef Matrix<half,Dynamic,Dynamic> MatrixXh;
|
||||
Index rows = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
|
||||
Index cols = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
|
||||
Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
|
||||
MatrixXh Ah = MatrixXh::Random(rows,depth);
|
||||
MatrixXh Bh = MatrixXh::Random(depth,cols);
|
||||
MatrixXh Ch = MatrixXh::Random(rows,cols);
|
||||
MatrixXf Af = Ah.cast<float>();
|
||||
MatrixXf Bf = Bh.cast<float>();
|
||||
MatrixXf Cf = Ch.cast<float>();
|
||||
VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast<half>());
|
||||
}
|
||||
|
||||
void test_half_float()
|
||||
{
|
||||
CALL_SUBTEST(test_conversion());
|
||||
CALL_SUBTEST(test_numtraits());
|
||||
for(int i = 0; i < g_repeat; i++) {
|
||||
CALL_SUBTEST(test_conversion());
|
||||
CALL_SUBTEST(test_arithmetic());
|
||||
CALL_SUBTEST(test_comparison());
|
||||
CALL_SUBTEST(test_basic_functions());
|
||||
CALL_SUBTEST(test_trigonometric_functions());
|
||||
CALL_SUBTEST(test_array());
|
||||
CALL_SUBTEST(test_product());
|
||||
}
|
||||
}
|
||||
|
@ -14,9 +14,13 @@ using internal::is_same_dense;
|
||||
void test_is_same_dense()
|
||||
{
|
||||
typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
|
||||
typedef Matrix<std::complex<double>,Dynamic,Dynamic,ColMajor> ColMatrixXcd;
|
||||
ColMatrixXd m1(10,10);
|
||||
ColMatrixXcd m2(10,10);
|
||||
Ref<ColMatrixXd> ref_m1(m1);
|
||||
Ref<ColMatrixXd,0, Stride<Dynamic,Dynamic> > ref_m2_real(m2.real());
|
||||
Ref<const ColMatrixXd> const_ref_m1(m1);
|
||||
|
||||
VERIFY(is_same_dense(m1,m1));
|
||||
VERIFY(is_same_dense(m1,ref_m1));
|
||||
VERIFY(is_same_dense(const_ref_m1,m1));
|
||||
@ -30,4 +34,8 @@ void test_is_same_dense()
|
||||
|
||||
Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1));
|
||||
VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
|
||||
|
||||
|
||||
VERIFY(!is_same_dense(m1, ref_m2_real));
|
||||
VERIFY(!is_same_dense(m2, ref_m2_real));
|
||||
}
|
||||
|
@ -123,7 +123,7 @@ template<typename Scalar> void packetmath()
|
||||
EIGEN_ALIGN_MAX Scalar data2[size];
|
||||
EIGEN_ALIGN_MAX Packet packets[PacketSize*2];
|
||||
EIGEN_ALIGN_MAX Scalar ref[size];
|
||||
RealScalar refvalue = 0;
|
||||
RealScalar refvalue = RealScalar(0);
|
||||
for (int i=0; i<size; ++i)
|
||||
{
|
||||
data1[i] = internal::random<Scalar>()/RealScalar(PacketSize);
|
||||
@ -171,14 +171,18 @@ template<typename Scalar> void packetmath()
|
||||
for (int i=0; i<PacketSize; ++i)
|
||||
ref[i] = data1[i+offset];
|
||||
|
||||
// palign is not used anymore, so let's just put a warning if it fails
|
||||
++g_test_level;
|
||||
VERIFY(areApprox(ref, data2, PacketSize) && "internal::palign");
|
||||
--g_test_level;
|
||||
}
|
||||
|
||||
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd);
|
||||
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub);
|
||||
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul);
|
||||
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate);
|
||||
VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv);
|
||||
// Disabled as it is not clear why it would be mandatory to support division.
|
||||
//VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv);
|
||||
|
||||
CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd);
|
||||
CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub);
|
||||
@ -242,29 +246,30 @@ template<typename Scalar> void packetmath()
}
}

ref[0] = 0;
ref[0] = Scalar(0);
for (int i=0; i<PacketSize; ++i)
ref[0] += data1[i];
VERIFY(isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux");

if(PacketSize==8 && internal::unpacket_traits<typename internal::unpacket_traits<Packet>::half>::size ==4) // so far, predux_half_dowto4 is only required in such a case
{
int HalfPacketSize = PacketSize>4 ? PacketSize/2 : PacketSize;
for (int i=0; i<HalfPacketSize; ++i)
ref[i] = 0;
ref[i] = Scalar(0);
for (int i=0; i<PacketSize; ++i)
ref[i%HalfPacketSize] += data1[i];
internal::pstore(data2, internal::predux_half_dowto4(internal::pload<Packet>(data1)));
VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4");
}

ref[0] = 1;
ref[0] = Scalar(1);
for (int i=0; i<PacketSize; ++i)
ref[0] *= data1[i];
VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");

for (int j=0; j<PacketSize; ++j)
{
ref[j] = 0;
ref[j] = Scalar(0);
for (int i=0; i<PacketSize; ++i)
ref[j] += data1[i+j*PacketSize];
packets[j] = internal::pload<Packet>(data1+j*PacketSize);
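The reference loop in the predux_half_dowto4 block above (ref[i%HalfPacketSize] += data1[i]) spells out what the test expects from the half-packet reduction: for an 8-lane packet reduced to 4 lanes, lane i of the result is the sum of lanes i and i+4 of the input. A small scalar model of that expectation (an illustration of the test's reference computation, not of Eigen's vectorized implementation):

    #include <cassert>

    int main() {
      const int PacketSize = 8, HalfPacketSize = 4;
      float data1[PacketSize] = {1, 2, 3, 4, 5, 6, 7, 8};
      float ref[HalfPacketSize] = {0, 0, 0, 0};

      // Same reference computation as in the test: fold the upper half onto the lower half.
      for (int i = 0; i < PacketSize; ++i)
        ref[i % HalfPacketSize] += data1[i];

      assert(ref[0] == 1 + 5 && ref[1] == 2 + 6 && ref[2] == 3 + 7 && ref[3] == 4 + 8);
      return 0;
    }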
@ -630,6 +635,7 @@ void test_packetmath()
CALL_SUBTEST_3( packetmath<int>() );
CALL_SUBTEST_4( packetmath<std::complex<float> >() );
CALL_SUBTEST_5( packetmath<std::complex<double> >() );
CALL_SUBTEST_6( packetmath<half>() );

CALL_SUBTEST_1( packetmath_notcomplex<float>() );
CALL_SUBTEST_2( packetmath_notcomplex<double>() );
@ -111,6 +111,17 @@ template<typename MatrixType> void product(const MatrixType& m)
vcres.noalias() -= m1.transpose() * v1;
VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1);

// test scaled products
res = square;
res.noalias() = s1 * m1 * m2.transpose();
VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose()));
res = square;
res.noalias() += s1 * m1 * m2.transpose();
VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose()));
res = square;
res.noalias() -= s1 * m1 * m2.transpose();
VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose()));

// test d ?= a+b*c rules
res.noalias() = square + m1 * m2.transpose();
VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
@ -35,6 +35,8 @@ void test_product_large()
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), internal::random<int>(1,10))) );

CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
@ -128,11 +128,19 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );

// Check outer products
#ifdef EIGEN_ALLOCA
bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT;
#else
bool temp_via_alloca = false;
#endif
m3 = cv1 * rv1;
VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
rm3 = cv1 * rv1;
VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );
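The adjusted expectations in this hunk hinge on whether the temporary produced by (cv1+cv1) and (rv1+rv1) can live on the stack: when EIGEN_ALLOCA is available and the buffer fits under EIGEN_STACK_ALLOCATION_LIMIT, no heap allocation is counted, hence the temp_via_alloca ? 0 : 1 expectation. A stand-alone sketch of the same guard (the function name and parameter are illustrative, not part of Eigen):

    #include <Eigen/Core>
    #include <cstddef>

    // Mirrors the condition used in the test: a temporary of `rows` scalars is
    // expected to bypass heap allocation only when alloca is available and the
    // buffer fits the stack allocation limit.
    template <typename Scalar>
    bool expects_heap_temporary(std::ptrdiff_t rows) {
    #ifdef EIGEN_ALLOCA
      return static_cast<std::size_t>(rows) * sizeof(Scalar) > EIGEN_STACK_ALLOCATION_LIMIT;
    #else
      return true;  // without alloca, a dynamically-sized temporary is heap allocated
    #endif
    }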
@ -161,6 +161,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
}

// Handle special format like NCHW, its input shape is '[1, N..., 1]' and
// broadcast shape is '[N, 1..., N]'
if (!oneByN && !nByOne) {
if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
nByOne = true;
oneByN = true;
for (int i = 1; i < NumDims-1; ++i) {
if (broadcast[i] != 1) {
nByOne = false;
oneByN = false;
break;
}
}
}
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
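The new branch above recognizes inputs shaped like [1, d1, ..., dk, 1] whose broadcast multiplies only the two outer dimensions, the NCHW-style pattern named in the comment. A small usage sketch of that pattern with the Tensor module, mirroring the tests added later in this patch (not taken from Eigen's sources):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // A [1, 7, 1] tensor broadcast only along the first and last dimensions,
      // i.e. the one-by-N-by-one case handled by packetOneByNByOne().
      Eigen::Tensor<float, 3> t(1, 7, 1);
      t.setRandom();

      Eigen::array<ptrdiff_t, 3> bcast;
      bcast[0] = 5;
      bcast[1] = 1;
      bcast[2] = 13;
      Eigen::Tensor<float, 3> out = t.broadcast(bcast);  // shape becomes [5, 7, 13]

      return (out.dimension(0) == 5 && out.dimension(1) == 7 && out.dimension(2) == 13) ? 0 : 1;
    }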
@ -256,24 +272,70 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}

if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
if (oneByN) {
if (oneByN && !nByOne) {
return packetNByOne<LoadMode>(index);
} else if (nByOne) {
} else if (!oneByN && nByOne) {
return packetOneByN<LoadMode>(index);
} else if (oneByN && nByOne) {
return packetOneByNByOne<LoadMode>(index);
} else {
return packetColMajor<LoadMode>(index);
}
} else {
if (oneByN) {
if (oneByN && !nByOne) {
return packetOneByN<LoadMode>(index);
} else if (nByOne) {
} else if (!oneByN && nByOne) {
return packetNByOne<LoadMode>(index);
} else if (oneByN && nByOne) {
return packetOneByNByOne<LoadMode>(index);
} else {
return packetRowMajor<LoadMode>(index);
}
}
}

template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
Index startDim, endDim;
Index inputIndex, outputOffset, batchedIndex;

if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
startDim = NumDims - 1;
endDim = 1;
} else {
startDim = 0;
endDim = NumDims - 2;
}

batchedIndex = index % m_outputStrides[startDim];
inputIndex = batchedIndex / m_outputStrides[endDim];
outputOffset = batchedIndex % m_outputStrides[endDim];

if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
values[0] = m_impl.coeff(inputIndex);
return internal::pload1<PacketReturnType>(values);
} else {
for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
if (outputOffset + cur < m_outputStrides[endDim]) {
values[i] = m_impl.coeff(inputIndex);
} else {
++inputIndex;
inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
values[i] = m_impl.coeff(inputIndex);
outputOffset = 0;
cur = 0;
}
}
return internal::pload<PacketReturnType>(values);
}
}

template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
{
@ -189,9 +189,11 @@ struct ThreadPoolDevice {
// of blocks to be evenly dividable across threads.

double block_size_f = 1.0 / CostModel::taskSize(1, cost);
Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
const Index max_block_size =
numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
const Index max_oversharding_factor = 4;
Index block_size = numext::mini(
n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
block_size_f));
const Index max_block_size = numext::mini(n, 2 * block_size);
if (block_align) {
Index new_block_size = block_align(block_size);
eigen_assert(new_block_size >= block_size);
@ -205,7 +207,8 @@ struct ThreadPoolDevice {
(divup<int>(block_count, numThreads()) * numThreads());
// Now try to increase block size up to max_block_size as long as it
// doesn't decrease parallel efficiency.
for (Index prev_block_count = block_count; prev_block_count > 1;) {
for (Index prev_block_count = block_count;
max_efficiency < 1.0 && prev_block_count > 1;) {
// This is the next block size that divides size into a smaller number
// of blocks than the current block_size.
Index coarser_block_size = divup(n, prev_block_count - 1);
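The new block-size computation above caps the number of blocks at max_oversharding_factor * numThreads(): the block size is at least divup(n, 4 * numThreads()) even when the cost model asks for finer granularity, and max_block_size is now derived from the chosen block size rather than from the cost model directly. A stand-alone sketch of that arithmetic (divup reimplemented locally; the concrete numbers are illustrative only):

    #include <algorithm>
    #include <cstdio>

    using Index = long;

    // Local helper with the usual ceil-division semantics of Eigen's divup.
    static Index divup(Index x, Index y) { return (x + y - 1) / y; }

    int main() {
      const Index n = 1000000;            // total number of work items
      const Index num_threads = 8;        // assumed pool size
      const double block_size_f = 250.0;  // stands in for 1.0 / CostModel::taskSize(1, cost)
      const Index max_oversharding_factor = 4;

      // Never create more than max_oversharding_factor * num_threads blocks,
      // even if the cost model suggests smaller blocks.
      Index block_size = std::min<Index>(
          n, std::max<Index>(divup(n, max_oversharding_factor * num_threads),
                             static_cast<Index>(block_size_f)));
      const Index max_block_size = std::min<Index>(n, 2 * block_size);

      std::printf("block_size=%ld max_block_size=%ld blocks=%ld\n",
                  block_size, max_block_size, divup(n, block_size));
      return 0;
    }

With these numbers the sketch yields 32 blocks, i.e. exactly 4 blocks per thread, instead of the 4000 blocks the raw cost-model estimate would produce.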
@ -316,8 +316,8 @@ struct kissfft_impl

// use optimized mode for even real
fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
Complex dc = dst[0].real() + dst[0].imag();
Complex nyquist = dst[0].real() - dst[0].imag();
Complex dc(dst[0].real() + dst[0].imag());
Complex nyquist(dst[0].real() - dst[0].imag());
int k;
for ( k=1;k <= ncfft2 ; ++k ) {
Complex fpk = dst[k];
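The dc and nyquist values above come from the standard trick of computing a length-N real FFT via a length-N/2 complex FFT of the packed sequence z_m = x_{2m} + i x_{2m+1}; the hunk itself only switches from copy-initialization to direct construction of the Complex values, presumably to avoid relying on an implicit real-to-Complex conversion. A short derivation, assuming real input x of even length N and Y = FFT(z):

    Y_0 = \sum_m z_m = \sum_m x_{2m} + i \sum_m x_{2m+1}
    X_0     = \sum_n x_n          = \operatorname{Re}(Y_0) + \operatorname{Im}(Y_0)   \quad\text{(the "dc" bin)}
    X_{N/2} = \sum_n (-1)^n x_n   = \operatorname{Re}(Y_0) - \operatorname{Im}(Y_0)   \quad\text{(the "nyquist" bin)}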
@ -124,6 +124,7 @@ ei_add_test(polynomialsolver)
ei_add_test(polynomialutils)
ei_add_test(splines)
ei_add_test(gmres)
ei_add_test(dgmres)
ei_add_test(minres)
ei_add_test(levenberg_marquardt)
ei_add_test(kronecker_product)
@ -238,6 +238,59 @@ static void test_simple_broadcasting_n_by_one()
}
}

template <int DataLayout>
static void test_simple_broadcasting_one_by_n_by_one_1d()
{
Tensor<float, 3, DataLayout> tensor(1,7,1);
tensor.setRandom();
array<ptrdiff_t, 3> broadcasts;
broadcasts[0] = 5;
broadcasts[1] = 1;
broadcasts[2] = 13;
Tensor<float, 3, DataLayout> broadcasted;
broadcasted = tensor.broadcast(broadcasts);

VERIFY_IS_EQUAL(broadcasted.dimension(0), 5);
VERIFY_IS_EQUAL(broadcasted.dimension(1), 7);
VERIFY_IS_EQUAL(broadcasted.dimension(2), 13);

for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 7; ++j) {
for (int k = 0; k < 13; ++k) {
VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k));
}
}
}
}

template <int DataLayout>
static void test_simple_broadcasting_one_by_n_by_one_2d()
{
Tensor<float, 4, DataLayout> tensor(1,7,13,1);
tensor.setRandom();
array<ptrdiff_t, 4> broadcasts;
broadcasts[0] = 5;
broadcasts[1] = 1;
broadcasts[2] = 1;
broadcasts[3] = 19;
Tensor<float, 4, DataLayout> broadcast;
broadcast = tensor.broadcast(broadcasts);

VERIFY_IS_EQUAL(broadcast.dimension(0), 5);
VERIFY_IS_EQUAL(broadcast.dimension(1), 7);
VERIFY_IS_EQUAL(broadcast.dimension(2), 13);
VERIFY_IS_EQUAL(broadcast.dimension(3), 19);

for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 7; ++j) {
for (int k = 0; k < 13; ++k) {
for (int l = 0; l < 19; ++l) {
VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l));
}
}
}
}
}

void test_cxx11_tensor_broadcasting()
{
@ -253,4 +306,8 @@ void test_cxx11_tensor_broadcasting()
CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>());
CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
}