Pulled latest updates from upstream
Commit: 07a247dcf4
@@ -221,10 +221,17 @@ if(NOT MSVC)
message(STATUS "Enabling FMA in tests/examples")
endif()

option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
if(EIGEN_TEST_AVX512)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512dq")
message(STATUS "Enabling AVX512 in tests/examples")
endif()

option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
if(EIGEN_TEST_F16C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
message(STATUS "Enabling F16C in tests/examples")
endif()

option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
Eigen/Core (17 lines changed)
@@ -33,13 +33,13 @@
#ifdef EIGEN_EXCEPTIONS
#undef EIGEN_EXCEPTIONS
#endif

// All functions callable from CUDA code must be qualified with __device__
#define EIGEN_DEVICE_FUNC __host__ __device__

#else
#define EIGEN_DEVICE_FUNC

#endif

// When compiling CUDA device code with NVCC, pull in math functions from the
@@ -212,7 +212,7 @@
#endif
#endif

-#if defined(__F16C__)
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
// We can use the optimized fp16 to float and float to fp16 conversion routines
#define EIGEN_HAS_FP16_C
#endif
@@ -222,10 +222,14 @@
#include <vector_types.h>
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#define EIGEN_HAS_CUDA_FP16
#include <cuda_fp16.h>
#endif
#endif

#if defined EIGEN_HAS_CUDA_FP16
#include <host_defines.h>
#include <cuda_fp16.h>
#endif

#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
#define EIGEN_HAS_OPENMP
#endif
@@ -306,7 +310,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
// ensure QNX/QCC support
using std::size_t;
// gcc 4.6.0 wants std:: for ptrdiff_t
using std::ptrdiff_t;

/** \defgroup Core_Module Core module
@@ -455,6 +459,7 @@ using std::ptrdiff_t;
#include "src/Core/products/TriangularSolverVector.h"
#include "src/Core/BandMatrix.h"
#include "src/Core/CoreIterators.h"
#include "src/Core/ConditionEstimator.h"

#include "src/Core/BooleanRedux.h"
#include "src/Core/Select.h"
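For illustration, a minimal sketch of what the EIGEN_DEVICE_FUNC macro defined in the hunk above enables (the helper function itself is hypothetical):

#include <Eigen/Core>

// Under NVCC this expands to "__host__ __device__" per the #define above, so the
// same inline helper can be called from host code and from CUDA kernels; in a
// plain CPU build the macro expands to nothing and the declaration is unchanged.
EIGEN_DEVICE_FUNC inline float squared_norm2(float x, float y) {
  return x * x + y * y;
}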
@@ -13,7 +13,7 @@
#ifndef EIGEN_LDLT_H
#define EIGEN_LDLT_H

namespace Eigen {

namespace internal {
template<typename MatrixType, int UpLo> struct LDLT_Traits;
@@ -73,11 +73,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
* The default constructor is useful in cases in which the user intends to
* perform decompositions via LDLT::compute(const MatrixType&).
*/
LDLT()
: m_matrix(),
m_transpositions(),
m_sign(internal::ZeroSign),
m_isInitialized(false)
{}

/** \brief Default Constructor with memory preallocation
@@ -168,7 +168,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
* \note_about_checking_solutions
*
* More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$
* by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$,
* \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
* \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
* least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
@@ -192,6 +192,15 @@ template<typename _MatrixType, int _UpLo> class LDLT
template<typename InputType>
LDLT& compute(const EigenBase<InputType>& matrix);

/** \returns an estimate of the reciprocal condition number of the matrix of
* which \c *this is the LDLT decomposition.
*/
RealScalar rcond() const
{
eigen_assert(m_isInitialized && "LDLT is not initialized.");
return internal::rcond_estimate_helper(m_l1_norm, *this);
}

template <typename Derived>
LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
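A minimal usage sketch of the rcond() accessor declared above (matrix and sizes are illustrative); the estimate adds only O(n^2) work on top of the O(n^3) factorization:

#include <Eigen/Dense>
#include <iostream>

int main() {
  // Build a symmetric positive-definite matrix so the factorization is well defined.
  const int n = 100;
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
  Eigen::MatrixXd S = A * A.transpose() + Eigen::MatrixXd::Identity(n, n);

  Eigen::LDLT<Eigen::MatrixXd> ldlt(S);
  // rcond() ~ 1 / (||S||_1 * ||inv(S)||_1); values near zero signal ill-conditioning.
  std::cout << "estimated reciprocal condition number: " << ldlt.rcond() << "\n";
  return 0;
}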
@@ -207,6 +216,13 @@ template<typename _MatrixType, int _UpLo> class LDLT

MatrixType reconstructedMatrix() const;

/** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
*
* This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
* \code x = decomposition.adjoint().solve(b) \endcode
*/
const LDLT& adjoint() const { return *this; };

inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }

@@ -220,7 +236,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
eigen_assert(m_isInitialized && "LDLT is not initialized.");
return Success;
}

#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
@@ -228,7 +244,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
#endif

protected:

static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
@@ -241,6 +257,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
* is not stored), and the diagonal entries correspond to D.
*/
MatrixType m_matrix;
RealScalar m_l1_norm;
TranspositionType m_transpositions;
TmpMatrixType m_temporary;
internal::SignMatrix m_sign;
@@ -314,7 +331,7 @@ template<> struct ldlt_inplace<Lower>
if(rs>0)
A21.noalias() -= A20 * temp.head(k);
}

// In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
// was smaller than the cutoff value. However, since LDLT is not rank-revealing
// we should only make sure that we do not introduce INF or NaN values.
@@ -433,12 +450,25 @@ template<typename InputType>
LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
{
check_template_parameters();

eigen_assert(a.rows()==a.cols());
const Index size = a.rows();

m_matrix = a.derived();

// Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0);
// TODO move this code to SelfAdjointView
for (Index col = 0; col < size; ++col) {
RealScalar abs_col_sum;
if (_UpLo == Lower)
abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
else
abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
if (abs_col_sum > m_l1_norm)
m_l1_norm = abs_col_sum;
}

m_transpositions.resize(size);
m_isInitialized = false;
m_temporary.resize(size);
@@ -466,7 +496,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
eigen_assert(m_matrix.rows()==size);
}
else
{
m_matrix.resize(size,size);
m_matrix.setZero();
m_transpositions.resize(size);
@@ -505,7 +535,7 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();

for (Index i = 0; i < vecD.size(); ++i)
{
if(abs(vecD(i)) > tolerance)
@@ -10,7 +10,7 @@
#ifndef EIGEN_LLT_H
#define EIGEN_LLT_H

namespace Eigen {

namespace internal{
template<typename MatrixType, int UpLo> struct LLT_Traits;
@@ -40,7 +40,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
*
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
@@ -135,6 +135,16 @@ template<typename _MatrixType, int _UpLo> class LLT
template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix);

/** \returns an estimate of the reciprocal condition number of the matrix of
* which \c *this is the Cholesky decomposition.
*/
RealScalar rcond() const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative");
return internal::rcond_estimate_helper(m_l1_norm, *this);
}

/** \returns the LLT decomposition matrix
*
* TODO: document the storage layout
@@ -159,12 +169,19 @@ template<typename _MatrixType, int _UpLo> class LLT
return m_info;
}

/** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
*
* This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
* \code x = decomposition.adjoint().solve(b) \endcode
*/
const LLT& adjoint() const { return *this; };

inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }

template<typename VectorType>
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);

#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
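The adjoint() member added above is what makes generic code of the form x = decomposition.adjoint().solve(b) work uniformly across decompositions; a small sketch (the helper name is illustrative, and it assumes the decompositions involved expose adjoint() as this change set suggests):

#include <Eigen/Dense>

// Solves A^* x = b for any decomposition exposing adjoint().solve(). For the
// self-adjoint decompositions LLT and LDLT, adjoint() returns the decomposition
// itself, so this is equivalent to dec.solve(b).
template <typename Decomposition, typename Rhs>
Eigen::VectorXd solve_with_adjoint(const Decomposition& dec, const Rhs& b) {
  return dec.adjoint().solve(b);
}

int main() {
  const int n = 20;
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
  Eigen::MatrixXd S = A * A.transpose() + Eigen::MatrixXd::Identity(n, n);
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);

  Eigen::LLT<Eigen::MatrixXd> llt(S);
  Eigen::VectorXd x = solve_with_adjoint(llt, b);  // same result as llt.solve(b)
  return (S * x - b).norm() < 1e-8 ? 0 : 1;
}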
@@ -172,17 +189,18 @@ template<typename _MatrixType, int _UpLo> class LLT
#endif

protected:

static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}

/** \internal
* Used to compute and store L
* The strict upper part is not used and even not initialized.
*/
MatrixType m_matrix;
RealScalar m_l1_norm;
bool m_isInitialized;
ComputationInfo m_info;
};
@@ -268,7 +286,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
static Index unblocked(MatrixType& mat)
{
using std::sqrt;

eigen_assert(mat.rows()==mat.cols());
const Index size = mat.rows();
for(Index k = 0; k < size; ++k)
@@ -328,7 +346,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
}
};

template<typename Scalar> struct llt_inplace<Scalar, Upper>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -387,12 +405,25 @@ template<typename InputType>
LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
{
check_template_parameters();

eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
m_matrix = a.derived();

// Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0);
// TODO move this code to SelfAdjointView
for (Index col = 0; col < size; ++col) {
RealScalar abs_col_sum;
if (_UpLo == Lower)
abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
else
abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
if (abs_col_sum > m_l1_norm)
m_l1_norm = abs_col_sum;
}

m_isInitialized = true;
bool ok = Traits::inplace_decomposition(m_matrix);
m_info = ok ? Success : NumericalIssue;
@@ -419,7 +450,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c

return *this;
}

#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename _MatrixType,int _UpLo>
template<typename RhsType, typename DstType>
@@ -431,7 +462,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
#endif

/** \internal use x = llt_object.solve(x);
*
* This is the \em in-place version of solve().
*
* \param bAndX represents both the right-hand side matrix b and result x.
@@ -483,7 +514,7 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
return LLT<PlainObject,UpLo>(m_matrix);
}
#endif // __CUDACC__

} // end namespace Eigen

#endif // EIGEN_LLT_H
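As a companion to LLT::rankUpdate() declared earlier in this file, a hedged sketch of refreshing an existing Cholesky factorization after a rank-one modification instead of recomputing it (sizes and values are illustrative):

#include <Eigen/Dense>

int main() {
  const int n = 10;
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
  Eigen::MatrixXd S = A * A.transpose() + 10.0 * Eigen::MatrixXd::Identity(n, n);

  Eigen::LLT<Eigen::MatrixXd> llt(S);              // O(n^3) factorization
  Eigen::VectorXd v = Eigen::VectorXd::Random(n);
  llt.rankUpdate(v, 2.0);                          // now factors S + 2*v*v^T, in O(n^2)

  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);
  Eigen::VectorXd x = llt.solve(b);                // solves (S + 2*v*v^T) x = b
  return ((S + 2.0 * v * v.transpose()) * x - b).norm() < 1e-8 ? 0 : 1;
}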
@ -29,13 +29,10 @@ struct copy_using_evaluator_traits
|
||||
{
|
||||
typedef typename DstEvaluator::XprType Dst;
|
||||
typedef typename Dst::Scalar DstScalar;
|
||||
// TODO distinguish between linear traversal and inner-traversals
|
||||
typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type PacketType;
|
||||
|
||||
enum {
|
||||
DstFlags = DstEvaluator::Flags,
|
||||
SrcFlags = SrcEvaluator::Flags,
|
||||
RequiredAlignment = unpacket_traits<PacketType>::alignment
|
||||
SrcFlags = SrcEvaluator::Flags
|
||||
};
|
||||
|
||||
public:
|
||||
@ -55,10 +52,25 @@ private:
|
||||
: int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
|
||||
: int(Dst::MaxRowsAtCompileTime),
|
||||
OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
|
||||
MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
|
||||
PacketSize = unpacket_traits<PacketType>::size
|
||||
MaxSizeAtCompileTime = Dst::SizeAtCompileTime
|
||||
};
|
||||
|
||||
// TODO distinguish between linear traversal and inner-traversals
|
||||
typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
|
||||
typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
|
||||
|
||||
enum {
|
||||
LinearPacketSize = unpacket_traits<LinearPacketType>::size,
|
||||
InnerPacketSize = unpacket_traits<InnerPacketType>::size
|
||||
};
|
||||
|
||||
public:
|
||||
enum {
|
||||
LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
|
||||
InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
|
||||
};
|
||||
|
||||
private:
|
||||
enum {
|
||||
DstIsRowMajor = DstFlags&RowMajorBit,
|
||||
SrcIsRowMajor = SrcFlags&RowMajorBit,
|
||||
@ -67,16 +79,16 @@ private:
|
||||
&& (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
|
||||
&& (functor_traits<AssignFunc>::PacketAccess),
|
||||
MayInnerVectorize = MightVectorize
|
||||
&& int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
|
||||
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(PacketSize)==0
|
||||
&& int(JointAlignment)>=int(RequiredAlignment),
|
||||
&& int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
|
||||
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
|
||||
&& int(JointAlignment)>=int(InnerRequiredAlignment),
|
||||
MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
|
||||
MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
|
||||
&& ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
|
||||
&& ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
|
||||
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
|
||||
so it's only good for large enough sizes. */
|
||||
MaySliceVectorize = MightVectorize && DstHasDirectAccess
|
||||
&& (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
|
||||
&& (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize)
|
||||
/* slice vectorization can be slow, so we only want it if the slices are big, which is
|
||||
indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
|
||||
in a fixed-size matrix */
|
||||
@ -84,7 +96,8 @@ private:
|
||||
|
||||
public:
|
||||
enum {
|
||||
Traversal = int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
|
||||
Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
|
||||
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
|
||||
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
|
||||
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
|
||||
: int(MayLinearize) ? int(LinearTraversal)
|
||||
@ -94,9 +107,14 @@ public:
|
||||
|| int(Traversal) == SliceVectorizedTraversal
|
||||
};
|
||||
|
||||
typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
|
||||
|
||||
private:
|
||||
enum {
|
||||
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
|
||||
ActualPacketSize = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
|
||||
: Vectorized ? InnerPacketSize
|
||||
: 1,
|
||||
UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
|
||||
MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
|
||||
&& int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
|
||||
MayUnrollInner = int(InnerSize) != Dynamic
|
||||
@ -112,7 +130,7 @@ public:
|
||||
: int(NoUnrolling)
|
||||
)
|
||||
: int(Traversal) == int(LinearVectorizedTraversal)
|
||||
? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling)
|
||||
? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling)
|
||||
: int(NoUnrolling) )
|
||||
: int(Traversal) == int(LinearTraversal)
|
||||
? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
|
||||
@ -131,11 +149,13 @@ public:
|
||||
std::cerr.unsetf(std::ios::hex);
|
||||
EIGEN_DEBUG_VAR(DstAlignment)
|
||||
EIGEN_DEBUG_VAR(SrcAlignment)
|
||||
EIGEN_DEBUG_VAR(RequiredAlignment)
|
||||
EIGEN_DEBUG_VAR(LinearRequiredAlignment)
|
||||
EIGEN_DEBUG_VAR(InnerRequiredAlignment)
|
||||
EIGEN_DEBUG_VAR(JointAlignment)
|
||||
EIGEN_DEBUG_VAR(InnerSize)
|
||||
EIGEN_DEBUG_VAR(InnerMaxSize)
|
||||
EIGEN_DEBUG_VAR(PacketSize)
|
||||
EIGEN_DEBUG_VAR(LinearPacketSize)
|
||||
EIGEN_DEBUG_VAR(InnerPacketSize)
|
||||
EIGEN_DEBUG_VAR(StorageOrdersAgree)
|
||||
EIGEN_DEBUG_VAR(MightVectorize)
|
||||
EIGEN_DEBUG_VAR(MayLinearize)
|
||||
@ -370,7 +390,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
|
||||
typedef typename Kernel::Scalar Scalar;
|
||||
typedef typename Kernel::PacketType PacketType;
|
||||
enum {
|
||||
requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
|
||||
requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
|
||||
packetSize = unpacket_traits<PacketType>::size,
|
||||
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
|
||||
dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
|
||||
@ -484,7 +504,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
|
||||
typedef typename Kernel::PacketType PacketType;
|
||||
enum {
|
||||
packetSize = unpacket_traits<PacketType>::size,
|
||||
requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
|
||||
requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
|
||||
alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
|
||||
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
|
||||
dstAlignment = alignable ? int(requestedAlignment)
|
||||
|
Eigen/src/Core/ConditionEstimator.h (new file, 166 lines)
@@ -0,0 +1,166 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CONDITIONESTIMATOR_H
#define EIGEN_CONDITIONESTIMATOR_H

namespace Eigen {

namespace internal {

template <typename Vector, typename RealVector, bool IsComplex>
struct rcond_compute_sign {
static inline Vector run(const Vector& v) {
const RealVector v_abs = v.cwiseAbs();
return (v_abs.array() == static_cast<typename Vector::RealScalar>(0))
.select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs));
}
};

// Partial specialization to avoid elementwise division for real vectors.
template <typename Vector>
struct rcond_compute_sign<Vector, Vector, false> {
static inline Vector run(const Vector& v) {
return (v.array() < static_cast<typename Vector::RealScalar>(0))
.select(-Vector::Ones(v.size()), Vector::Ones(v.size()));
}
};

/** \brief Reciprocal condition number estimator.
*
* Computing a decomposition of a dense matrix takes O(n^3) operations, while
* this method estimates the condition number quickly and reliably in O(n^2)
* operations.
*
* \returns an estimate of the reciprocal condition number
* (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
* its decomposition. Supports the following decompositions: FullPivLU,
* PartialPivLU, LDLT, and LLT.
*
* \sa FullPivLU, PartialPivLU, LDLT, LLT.
*/
template <typename Decomposition>
typename Decomposition::RealScalar
rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
{
typedef typename Decomposition::RealScalar RealScalar;
eigen_assert(dec.rows() == dec.cols());
if (dec.rows() == 0) return RealScalar(1);
if (matrix_norm == RealScalar(0)) return RealScalar(0);
if (dec.rows() == 1) return RealScalar(1);
const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
: (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
}

/**
* \returns an estimate of ||inv(matrix)||_1 given a decomposition of
* \a matrix that implements .solve() and .adjoint().solve() methods.
*
* This function implements Algorithms 4.1 and 5.1 from
* http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
* which also forms the basis for the condition number estimators in
* LAPACK. Since at most 10 calls to the solve method of dec are
* performed, the total cost is O(dims^2), as opposed to O(dims^3)
* needed to compute the inverse matrix explicitly.
*
* The most common usage is in estimating the condition number
* ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
* computed directly in O(n^2) operations.
*
* Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
* LLT.
*
* \sa FullPivLU, PartialPivLU, LDLT, LLT.
*/
template <typename Decomposition>
typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec)
{
typedef typename Decomposition::MatrixType MatrixType;
typedef typename Decomposition::Scalar Scalar;
typedef typename Decomposition::RealScalar RealScalar;
typedef typename internal::plain_col_type<MatrixType>::type Vector;
typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector;
const bool is_complex = (NumTraits<Scalar>::IsComplex != 0);

eigen_assert(dec.rows() == dec.cols());
const Index n = dec.rows();
if (n == 0)
return 0;

Vector v = dec.solve(Vector::Ones(n) / Scalar(n));

// lower_bound is a lower bound on
// ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1
// and is the objective maximized by the ("super-") gradient ascent
// algorithm below.
RealScalar lower_bound = v.template lpNorm<1>();
if (n == 1)
return lower_bound;

// Gradient ascent algorithm follows: We know that the optimum is achieved at
// one of the simplices v = e_i, so in each iteration we follow a
// super-gradient to move towards the optimal one.
RealScalar old_lower_bound = lower_bound;
Vector sign_vector(n);
Vector old_sign_vector;
Index v_max_abs_index = -1;
Index old_v_max_abs_index = v_max_abs_index;
for (int k = 0; k < 4; ++k)
{
sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
// Break if the solution stagnated.
break;
}
// v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
v = dec.adjoint().solve(sign_vector);
v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
if (v_max_abs_index == old_v_max_abs_index) {
// Break if the solution stagnated.
break;
}
// Move to the new simplex e_j, where j = v_max_abs_index.
v = dec.solve(Vector::Unit(n, v_max_abs_index)); // v = inv(matrix) * e_j.
lower_bound = v.template lpNorm<1>();
if (lower_bound <= old_lower_bound) {
// Break if the gradient step did not increase the lower_bound.
break;
}
if (!is_complex) {
old_sign_vector = sign_vector;
}
old_v_max_abs_index = v_max_abs_index;
old_lower_bound = lower_bound;
}
// The following calculates an independent estimate of ||matrix||_1 by
// multiplying matrix by a vector with entries of slowly increasing
// magnitude and alternating sign:
// v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
// This improvement to Hager's algorithm above is due to Higham. It was
// added to make the algorithm more robust in certain corner cases where
// large elements in the matrix might otherwise escape detection due to
// exact cancellation (especially when op and op_adjoint correspond to a
// sequence of backsubstitutions and permutations), which could cause
// Hager's algorithm to vastly underestimate ||matrix||_1.
Scalar alternating_sign(RealScalar(1));
for (Index i = 0; i < n; ++i) {
v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
alternating_sign = -alternating_sign;
}
v = dec.solve(v);
const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
return numext::maxi(lower_bound, alternate_lower_bound);
}

} // namespace internal

} // namespace Eigen

#endif
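To see what the estimator above computes, a small self-contained check (sizes are illustrative) comparing the O(n^2) estimate against the exact reciprocal condition number obtained from the explicit inverse, which is only affordable for small matrices:

#include <Eigen/Dense>
#include <iostream>

int main() {
  const int n = 50;
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
  Eigen::MatrixXd S = A.transpose() * A + Eigen::MatrixXd::Identity(n, n);

  Eigen::LLT<Eigen::MatrixXd> llt(S);
  const double estimate = llt.rcond();  // internally calls rcond_estimate_helper

  // Exact value: 1 / (||S||_1 * ||inv(S)||_1), with ||.||_1 the max absolute column sum.
  const double norm1     = S.cwiseAbs().colwise().sum().maxCoeff();
  const double inv_norm1 = S.inverse().cwiseAbs().colwise().sum().maxCoeff();
  const double exact     = 1.0 / (norm1 * inv_norm1);

  std::cout << "estimate: " << estimate << "  exact: " << exact << "\n";
  return 0;
}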
@@ -81,6 +81,8 @@ public:
* This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
// FIXME I'm not sure the current mapping is the ideal one.
template<int M, int N> struct product_type_selector<M,N,1> { enum { ret = OuterProduct }; };
template<int M> struct product_type_selector<M, 1, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<int N> struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<int Depth> struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; };
template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; };
template<> struct product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; };
@ -1023,21 +1023,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double log(const double &x) { return ::log(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T tan(const T &x) {
|
||||
EIGEN_USING_STD_MATH(tan);
|
||||
return tan(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tan(const float &x) { return ::tanf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double tan(const double &x) { return ::tan(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename NumTraits<T>::Real abs(const T &x) {
|
||||
@ -1068,6 +1053,141 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double exp(const double &x) { return ::exp(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T cos(const T &x) {
|
||||
EIGEN_USING_STD_MATH(cos);
|
||||
return cos(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cos(const float &x) { return ::cosf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double cos(const double &x) { return ::cos(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T sin(const T &x) {
|
||||
EIGEN_USING_STD_MATH(sin);
|
||||
return sin(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sin(const float &x) { return ::sinf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double sin(const double &x) { return ::sin(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T tan(const T &x) {
|
||||
EIGEN_USING_STD_MATH(tan);
|
||||
return tan(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tan(const float &x) { return ::tanf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double tan(const double &x) { return ::tan(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T acos(const T &x) {
|
||||
EIGEN_USING_STD_MATH(acos);
|
||||
return acos(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float acos(const float &x) { return ::acosf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double acos(const double &x) { return ::acos(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T asin(const T &x) {
|
||||
EIGEN_USING_STD_MATH(asin);
|
||||
return asin(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float asin(const float &x) { return ::asinf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double asin(const double &x) { return ::asin(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T atan(const T &x) {
|
||||
EIGEN_USING_STD_MATH(atan);
|
||||
return atan(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float atan(const float &x) { return ::atanf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double atan(const double &x) { return ::atan(x); }
|
||||
#endif
|
||||
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T cosh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(cosh);
|
||||
return cosh(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cosh(const float &x) { return ::coshf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double cosh(const double &x) { return ::cosh(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T sinh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(sinh);
|
||||
return sinh(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sinh(const float &x) { return ::sinhf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double sinh(const double &x) { return ::sinh(x); }
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T tanh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(tanh);
|
||||
return tanh(x);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(const float &x) { return ::tanhf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double tanh(const double &x) { return ::tanh(x); }
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
|
@ -410,8 +410,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
typedef Product<Lhs, Rhs, LazyProduct> XprType;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
explicit product_evaluator(const XprType& xpr)
|
||||
@ -437,16 +435,20 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
|
||||
typedef evaluator<LhsNestedCleaned> LhsEtorType;
|
||||
typedef evaluator<RhsNestedCleaned> RhsEtorType;
|
||||
|
||||
|
||||
enum {
|
||||
RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
|
||||
ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
|
||||
InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
|
||||
MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
|
||||
MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime,
|
||||
|
||||
PacketSize = packet_traits<Scalar>::size,
|
||||
MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
|
||||
};
|
||||
|
||||
typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType;
|
||||
typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
|
||||
|
||||
enum {
|
||||
|
||||
LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
|
||||
RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
|
||||
CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
|
||||
@ -459,19 +461,23 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
LhsFlags = LhsEtorType::Flags,
|
||||
RhsFlags = RhsEtorType::Flags,
|
||||
|
||||
LhsAlignment = LhsEtorType::Alignment,
|
||||
RhsAlignment = RhsEtorType::Alignment,
|
||||
|
||||
LhsRowMajor = LhsFlags & RowMajorBit,
|
||||
RhsRowMajor = RhsFlags & RowMajorBit,
|
||||
|
||||
LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
|
||||
RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
|
||||
|
||||
// Here, we don't care about alignment larger than the usable packet size.
|
||||
LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
|
||||
RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
|
||||
|
||||
SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
|
||||
|
||||
CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
|
||||
&& (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % PacketSize) == 0) ),
|
||||
&& (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ),
|
||||
|
||||
CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
|
||||
&& (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % PacketSize) == 0) ),
|
||||
&& (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ),
|
||||
|
||||
EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
|
||||
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
|
||||
@ -491,10 +497,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
|
||||
: 0,
|
||||
|
||||
/* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
|
||||
* of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
|
||||
* loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
|
||||
* the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
|
||||
*/
|
||||
* of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
|
||||
* loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
|
||||
* the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
|
||||
*/
|
||||
CanVectorizeInner = SameType
|
||||
&& LhsRowMajor
|
||||
&& (!RhsRowMajor)
|
||||
@ -1000,7 +1006,7 @@ struct transposition_matrix_product
|
||||
const Index size = tr.size();
|
||||
StorageIndex j = 0;
|
||||
|
||||
if(!(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat)))
|
||||
if(!is_same_dense(dst,mat))
|
||||
dst = mat;
|
||||
|
||||
for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
|
||||
|
@ -27,8 +27,9 @@ template<typename Func, typename Derived>
|
||||
struct redux_traits
|
||||
{
|
||||
public:
|
||||
typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
|
||||
enum {
|
||||
PacketSize = packet_traits<typename Derived::Scalar>::size,
|
||||
PacketSize = unpacket_traits<PacketType>::size,
|
||||
InnerMaxSize = int(Derived::IsRowMajor)
|
||||
? Derived::MaxColsAtCompileTime
|
||||
: Derived::MaxRowsAtCompileTime
|
||||
@ -137,12 +138,12 @@ template<typename Func, typename Derived, int Start, int Length>
|
||||
struct redux_vec_unroller
|
||||
{
|
||||
enum {
|
||||
PacketSize = packet_traits<typename Derived::Scalar>::size,
|
||||
PacketSize = redux_traits<Func, Derived>::PacketSize,
|
||||
HalfLength = Length/2
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename packet_traits<Scalar>::type PacketScalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
@ -156,14 +157,14 @@ template<typename Func, typename Derived, int Start>
|
||||
struct redux_vec_unroller<Func, Derived, Start, 1>
|
||||
{
|
||||
enum {
|
||||
index = Start * packet_traits<typename Derived::Scalar>::size,
|
||||
index = Start * redux_traits<Func, Derived>::PacketSize,
|
||||
outer = index / int(Derived::InnerSizeAtCompileTime),
|
||||
inner = index % int(Derived::InnerSizeAtCompileTime),
|
||||
alignment = Derived::Alignment
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename packet_traits<Scalar>::type PacketScalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
|
||||
{
|
||||
@ -209,13 +210,13 @@ template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename packet_traits<Scalar>::type PacketScalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
|
||||
static Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
const Index size = mat.size();
|
||||
|
||||
const Index packetSize = packet_traits<Scalar>::size;
|
||||
const Index packetSize = redux_traits<Func, Derived>::PacketSize;
|
||||
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
|
||||
enum {
|
||||
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
|
||||
@ -268,7 +269,7 @@ template<typename Func, typename Derived, int Unrolling>
|
||||
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename packet_traits<Scalar>::type PacketType;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketType;
|
||||
|
||||
EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
|
||||
{
|
||||
@ -276,7 +277,7 @@ struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
|
||||
const Index innerSize = mat.innerSize();
|
||||
const Index outerSize = mat.outerSize();
|
||||
enum {
|
||||
packetSize = packet_traits<Scalar>::size
|
||||
packetSize = redux_traits<Func, Derived>::PacketSize
|
||||
};
|
||||
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
|
||||
Scalar res;
|
||||
@ -306,9 +307,10 @@ template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename packet_traits<Scalar>::type PacketScalar;
|
||||
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
enum {
|
||||
PacketSize = packet_traits<Scalar>::size,
|
||||
PacketSize = redux_traits<Func, Derived>::PacketSize,
|
||||
Size = Derived::SizeAtCompileTime,
|
||||
VectorizedSize = (Size / PacketSize) * PacketSize
|
||||
};
|
||||
@ -367,11 +369,11 @@ public:
|
||||
{ return m_evaluator.coeff(index); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketReturnType packet(Index row, Index col) const
|
||||
PacketType packet(Index row, Index col) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketReturnType packet(Index index) const
|
||||
PacketType packet(Index index) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -379,7 +381,7 @@ public:
|
||||
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketReturnType packetByOuterInner(Index outer, Index inner) const
|
||||
PacketType packetByOuterInner(Index outer, Index inner) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
const XprType & nestedExpression() const { return m_xpr; }
|
||||
|
@@ -213,7 +213,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv

template<typename Dest> inline void evalTo(Dest& dst) const
{
-if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
+if(!is_same_dense(dst,m_rhs))
dst = m_rhs;
m_triangularMatrix.template solveInPlace<Side>(dst);
}
@@ -281,20 +281,18 @@ struct digamma_impl {
*/

Scalar p, q, nz, s, w, y;
-bool negative;
+bool negative = false;

const Scalar maxnum = NumTraits<Scalar>::infinity();
-const Scalar m_pi = EIGEN_PI;
+const Scalar m_pi(EIGEN_PI);

-negative = 0;
-nz = 0.0;

-const Scalar zero = 0.0;
-const Scalar one = 1.0;
-const Scalar half = 0.5;
+const Scalar zero = Scalar(0);
+const Scalar one = Scalar(1);
+const Scalar half = Scalar(0.5);
+nz = zero;

if (x <= zero) {
-negative = one;
+negative = true;
q = x;
p = numext::floor(q);
if (p == q) {
@ -463,7 +461,7 @@ template <typename Scalar>
|
||||
struct igammac_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static Scalar run(Scalar a, Scalar x) {
|
||||
/* igamc()
|
||||
/* igamc()
|
||||
*
|
||||
* Incomplete gamma integral (modified for Eigen)
|
||||
*
|
||||
@ -519,26 +517,51 @@ struct igammac_impl {
|
||||
*/
|
||||
const Scalar zero = 0;
|
||||
const Scalar one = 1;
|
||||
const Scalar two = 2;
|
||||
const Scalar machep = igamma_helper<Scalar>::machep();
|
||||
const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
|
||||
const Scalar big = igamma_helper<Scalar>::big();
|
||||
const Scalar biginv = 1 / big;
|
||||
const Scalar nan = NumTraits<Scalar>::quiet_NaN();
|
||||
const Scalar inf = NumTraits<Scalar>::infinity();
|
||||
|
||||
Scalar ans, ax, c, yc, r, t, y, z;
|
||||
Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
|
||||
|
||||
if ((x < zero) || ( a <= zero)) {
|
||||
if ((x < zero) || (a <= zero)) {
|
||||
// domain error
|
||||
return nan;
|
||||
}
|
||||
|
||||
if ((x < one) || (x < a)) {
|
||||
return (one - igamma_impl<Scalar>::run(a, x));
|
||||
/* The checks above ensure that we meet the preconditions for
|
||||
* igamma_impl::Impl(), so call it, rather than igamma_impl::Run().
|
||||
* Calling Run() would also work, but in that case the compiler may not be
|
||||
* able to prove that igammac_impl::Run and igamma_impl::Run are not
|
||||
* mutually recursive. This leads to worse code, particularly on
|
||||
* platforms like nvptx, where recursion is allowed only begrudgingly.
|
||||
*/
|
||||
return (one - igamma_impl<Scalar>::Impl(a, x));
|
||||
}
|
||||
|
||||
return Impl(a, x);
|
||||
}
|
||||
|
||||
private:
|
||||
/* igamma_impl calls igammac_impl::Impl. */
|
||||
friend struct igamma_impl<Scalar>;
|
||||
|
||||
/* Actually computes igamc(a, x).
|
||||
*
|
||||
* Preconditions:
|
||||
* a > 0
|
||||
* x >= 1
|
||||
* x >= a
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
|
||||
const Scalar zero = 0;
|
||||
const Scalar one = 1;
|
||||
const Scalar two = 2;
|
||||
const Scalar machep = igamma_helper<Scalar>::machep();
|
||||
const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
|
||||
const Scalar big = igamma_helper<Scalar>::big();
|
||||
const Scalar biginv = 1 / big;
|
||||
const Scalar inf = NumTraits<Scalar>::infinity();
|
||||
|
||||
Scalar ans, ax, c, yc, r, t, y, z;
|
||||
Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
|
||||
|
||||
if (x == inf) return zero; // std::isinf crashes on CUDA
|
||||
|
||||
/* Compute x**a * exp(-x) / gamma(a) */
|
||||
@ -618,7 +641,7 @@ template <typename Scalar>
|
||||
struct igamma_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static Scalar run(Scalar a, Scalar x) {
|
||||
/* igam()
|
||||
/* igam()
|
||||
* Incomplete gamma integral
|
||||
*
|
||||
*
|
||||
@ -680,22 +703,47 @@ struct igamma_impl {
|
||||
*/
|
||||
const Scalar zero = 0;
|
||||
const Scalar one = 1;
|
||||
const Scalar machep = igamma_helper<Scalar>::machep();
|
||||
const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
|
||||
const Scalar nan = NumTraits<Scalar>::quiet_NaN();
|
||||
|
||||
double ans, ax, c, r;
|
||||
|
||||
if (x == zero) return zero;
|
||||
|
||||
if ((x < zero) || ( a <= zero)) { // domain error
|
||||
if ((x < zero) || (a <= zero)) { // domain error
|
||||
return nan;
|
||||
}
|
||||
|
||||
if ((x > one) && (x > a)) {
|
||||
return (one - igammac_impl<Scalar>::run(a, x));
|
||||
/* The checks above ensure that we meet the preconditions for
|
||||
* igammac_impl::Impl(), so call it, rather than igammac_impl::Run().
|
||||
* Calling Run() would also work, but in that case the compiler may not be
|
||||
* able to prove that igammac_impl::Run and igamma_impl::Run are not
|
||||
* mutually recursive. This leads to worse code, particularly on
|
||||
* platforms like nvptx, where recursion is allowed only begrudgingly.
|
||||
*/
|
||||
return (one - igammac_impl<Scalar>::Impl(a, x));
|
||||
}
|
||||
|
||||
return Impl(a, x);
|
||||
}
|
||||
|
||||
private:
|
||||
/* igammac_impl calls igamma_impl::Impl. */
|
||||
friend struct igammac_impl<Scalar>;
|
||||
|
||||
/* Actually computes igam(a, x).
|
||||
*
|
||||
* Preconditions:
|
||||
* x > 0
|
||||
* a > 0
|
||||
* !(x > 1 && x > a)
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
|
||||
const Scalar zero = 0;
|
||||
const Scalar one = 1;
|
||||
const Scalar machep = igamma_helper<Scalar>::machep();
|
||||
const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
|
||||
|
||||
double ans, ax, c, r;
|
||||
|
||||
/* Compute x**a * exp(-x) / gamma(a) */
|
||||
ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
|
||||
if (ax < -maxlog) {
|
||||
|
@@ -168,11 +168,12 @@ MatrixBase<Derived>::stableNorm() const
DerivedCopy copy(derived());

enum {
-CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
+CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
+  || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+  ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
-typename DerivedCopyClean
-::ConstSegmentReturnType>::type SegmentWrapper;
+typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
Index n = size();

if(n==1)
@@ -532,7 +532,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
-if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs)))
+if(!internal::is_same_dense(dst,rhs))
dst = rhs;
this->solveInPlace(dst);
}
@ -70,12 +70,18 @@ struct half : public __half {
|
||||
|
||||
explicit EIGEN_DEVICE_FUNC half(bool b)
|
||||
: __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(unsigned int ui)
|
||||
: __half(internal::float_to_half_rtne(static_cast<float>(ui))) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(int i)
|
||||
: __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(unsigned long ul)
|
||||
: __half(internal::float_to_half_rtne(static_cast<float>(ul))) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(long l)
|
||||
: __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(long long ll)
|
||||
: __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(unsigned long long ull)
|
||||
: __half(internal::float_to_half_rtne(static_cast<float>(ull))) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(float f)
|
||||
: __half(internal::float_to_half_rtne(f)) {}
|
||||
explicit EIGEN_DEVICE_FUNC half(double d)
|
||||
@ -401,6 +407,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a)
|
||||
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) {
|
||||
return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
|
||||
Eigen::half result;
|
||||
result.x = a.x & 0x7FFF;
|
||||
@ -418,6 +425,18 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::h
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) {
|
||||
return Eigen::half(::powf(float(a), float(b)));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) {
|
||||
return Eigen::half(::sinf(float(a)));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) {
|
||||
return Eigen::half(::cosf(float(a)));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) {
|
||||
return Eigen::half(::tanf(float(a)));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) {
|
||||
return Eigen::half(::tanhf(float(a)));
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
|
||||
return Eigen::half(::floorf(float(a)));
|
||||
}
|
||||
@ -425,6 +444,51 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::h
|
||||
return Eigen::half(::ceilf(float(a)));
|
||||
}
|
||||
|
||||
template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
return __hlt(b, a) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
const float f2 = static_cast<float>(b);
|
||||
return f2 < f1 ? b : a;
|
||||
#endif
|
||||
}
|
||||
template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
return __hlt(a, b) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
const float f2 = static_cast<float>(b);
|
||||
return f1 < f2 ? b : a;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef EIGEN_HAS_C99_MATH
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
  return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
  return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
  return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
}
#endif
} // end namespace numext

} // end namespace Eigen

@ -466,6 +530,11 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a

namespace std {

EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) {
  os << static_cast<float>(v);
  return os;
}

#if __cplusplus > 199711L
template <>
struct hash<Eigen::half> {
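The body of the hash specialization is cut off by the hunk boundary above. Assuming it is completed in the patch, and that Eigen::half already provides operator==, the stream operator and hash make standard containers usable with half-precision values along the lines of this illustrative sketch (not part of the patch):

    // Illustrative only: relies on the operator<< above and on std::hash<Eigen::half>
    // being fully specialized in the remainder of this hunk.
    #include <Eigen/Core>
    #include <iostream>
    #include <unordered_set>

    int main() {
      std::unordered_set<Eigen::half> seen;   // needs std::hash<Eigen::half> and operator==
      seen.insert(Eigen::half(1.0f));
      seen.insert(Eigen::half(2.0f));
      for (const Eigen::half& h : seen)
        std::cout << h << '\n';               // printed via static_cast<float>
      return 0;
    }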
@ -344,6 +344,22 @@ template<> struct functor_traits<scalar_boolean_or_op> {
  };
};

/** \internal
  * \brief Template functor to compute the xor of two booleans
  *
  * \sa class CwiseBinaryOp, ArrayBase::operator^
  */
struct scalar_boolean_xor_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
};
template<> struct functor_traits<scalar_boolean_xor_op> {
  enum {
    Cost = NumTraits<bool>::AddCost,
    PacketAccess = false
  };
};

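A tiny sketch of what the new functor computes, kept separate from the patch. The functor lives in the internal namespace, so user code would normally reach it through the ArrayBase::operator^ overload added later in this change set rather than calling it directly.

    // Sketch only: the internal functor is plain logical xor on bools.
    #include <Eigen/Core>
    #include <cassert>

    int main() {
      Eigen::internal::scalar_boolean_xor_op xor_op;
      assert(xor_op(true, false) == true);
      assert(xor_op(true, true)  == false);
      return 0;
    }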
/** \internal
  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
  *

@ -234,9 +234,33 @@ template<typename Scalar> struct scalar_exp_op {
  template <typename Packet>
  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
};
template<typename Scalar>
struct functor_traits<scalar_exp_op<Scalar> >
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
template <typename Scalar>
struct functor_traits<scalar_exp_op<Scalar> > {
  enum {
    PacketAccess = packet_traits<Scalar>::HasExp,
    // The following numbers are based on the AVX implementation.
#ifdef EIGEN_VECTORIZE_FMA
    // Haswell can issue 2 add/mul/madd per cycle.
    Cost =
    (sizeof(Scalar) == 4
         // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other
         ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost)
         // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
         : (14 * NumTraits<Scalar>::AddCost +
            6 * NumTraits<Scalar>::MulCost +
            NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
#else
    Cost =
    (sizeof(Scalar) == 4
         // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other
         ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost)
         // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
         : (23 * NumTraits<Scalar>::AddCost +
            12 * NumTraits<Scalar>::MulCost +
            NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
#endif
  };
};

/** \internal
|
||||
*
|
||||
@ -250,9 +274,24 @@ template<typename Scalar> struct scalar_log_op {
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
|
||||
};
|
||||
template<typename Scalar>
|
||||
struct functor_traits<scalar_log_op<Scalar> >
|
||||
{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
|
||||
template <typename Scalar>
|
||||
struct functor_traits<scalar_log_op<Scalar> > {
|
||||
enum {
|
||||
PacketAccess = packet_traits<Scalar>::HasLog,
|
||||
Cost =
|
||||
(PacketAccess
|
||||
// The following numbers are based on the AVX implementation.
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
// 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle.
|
||||
? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost)
|
||||
#else
|
||||
// 8 pmadd, 6 pmul, 8 padd/psub, 20 other
|
||||
? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost)
|
||||
#endif
|
||||
// Measured cost of std::log.
|
||||
: sizeof(Scalar)==4 ? 40 : 85)
|
||||
};
|
||||
};
|
||||
|
||||
/** \internal
|
||||
*
|
||||
@ -280,10 +319,19 @@ template<typename Scalar> struct scalar_sqrt_op {
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
|
||||
};
|
||||
template<typename Scalar>
|
||||
struct functor_traits<scalar_sqrt_op<Scalar> >
|
||||
{ enum {
|
||||
Cost = 5 * NumTraits<Scalar>::MulCost,
|
||||
template <typename Scalar>
|
||||
struct functor_traits<scalar_sqrt_op<Scalar> > {
|
||||
enum {
|
||||
#if EIGEN_FAST_MATH
|
||||
// The following numbers are based on the AVX implementation.
|
||||
Cost = (sizeof(Scalar) == 8 ? 28
|
||||
// 4 pmul, 1 pmadd, 3 other
|
||||
: (3 * NumTraits<Scalar>::AddCost +
|
||||
5 * NumTraits<Scalar>::MulCost)),
|
||||
#else
|
||||
// The following numbers are based on min VSQRT throughput on Haswell.
|
||||
Cost = (sizeof(Scalar) == 8 ? 28 : 14),
|
||||
#endif
|
||||
PacketAccess = packet_traits<Scalar>::HasSqrt
|
||||
};
|
||||
};
|
||||
@ -313,7 +361,7 @@ struct functor_traits<scalar_rsqrt_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_cos_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
|
||||
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
|
||||
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
|
||||
};
|
||||
@ -332,7 +380,7 @@ struct functor_traits<scalar_cos_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_sin_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
|
||||
};
|
||||
@ -352,7 +400,7 @@ struct functor_traits<scalar_sin_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_tan_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
|
||||
};
|
||||
@ -371,7 +419,7 @@ struct functor_traits<scalar_tan_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_acos_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
|
||||
};
|
||||
@ -390,7 +438,7 @@ struct functor_traits<scalar_acos_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_asin_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
|
||||
};
|
||||
@ -546,7 +594,7 @@ struct functor_traits<scalar_erfc_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_atan_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
|
||||
};
|
||||
@ -566,7 +614,7 @@ struct functor_traits<scalar_atan_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_tanh_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); }
|
||||
};
|
||||
@ -574,8 +622,24 @@ template<typename Scalar>
struct functor_traits<scalar_tanh_op<Scalar> >
{
  enum {
    Cost = 5 * NumTraits<Scalar>::MulCost,
    PacketAccess = packet_traits<Scalar>::HasTanh
    PacketAccess = packet_traits<Scalar>::HasTanh,
    Cost =
    (PacketAccess
         // The following numbers are based on the AVX implementation.
#ifdef EIGEN_VECTORIZE_FMA
         // Haswell can issue 2 add/mul/madd per cycle.
         // 9 pmadd, 2 pmul, 1 div, 2 other
         ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
            NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
#else
         ? (11 * NumTraits<Scalar>::AddCost +
            11 * NumTraits<Scalar>::MulCost +
            NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
#endif
         // This number assumes a naive implementation of tanh
         : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost +
            2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost +
            functor_traits<scalar_exp_op<Scalar> >::Cost))
  };
};

@ -585,7 +649,7 @@ struct functor_traits<scalar_tanh_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_sinh_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); }
|
||||
};
|
||||
@ -604,7 +668,7 @@ struct functor_traits<scalar_sinh_op<Scalar> >
|
||||
*/
|
||||
template<typename Scalar> struct scalar_cosh_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); }
|
||||
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); }
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); }
|
||||
};
|
||||
|
@ -11,8 +11,8 @@
|
||||
#define EIGEN_GENERAL_BLOCK_PANEL_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
|
||||
@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024;
|
||||
#endif
|
||||
|
||||
/** \internal */
|
||||
struct CacheSizes {
|
||||
struct CacheSizes {
|
||||
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
|
||||
int l1CacheSize, l2CacheSize, l3CacheSize;
|
||||
queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
|
||||
@ -89,7 +89,7 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
|
||||
*
|
||||
* \sa setCpuCacheSizes */
|
||||
|
||||
template<typename LhsScalar, typename RhsScalar, int KcFactor>
|
||||
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
|
||||
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
|
||||
{
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
@ -107,21 +107,17 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
  enum {
    kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
    ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
    k_mask = -8,

    kr = 8,
    mr = Traits::mr,
    mr_mask = -mr,

    nr = Traits::nr,
    nr_mask = -nr
    nr = Traits::nr
  };
  // Increasing k gives us more time to prefetch the content of the "C"
  // registers. However once the latency is hidden there is no point in
  // increasing the value of k, so we'll cap it at 320 (value determined
  // experimentally).
  const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
  const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
  if (k_cache < k) {
    k = k_cache & k_mask;
    k = k_cache - (k_cache % kr);
    eigen_internal_assert(k > 0);
  }

@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
  if (n_cache <= n_per_thread) {
    // Don't exceed the capacity of the l2 cache.
    eigen_internal_assert(n_cache >= static_cast<Index>(nr));
    n = n_cache & nr_mask;
    n = n_cache - (n_cache % nr);
    eigen_internal_assert(n > 0);
  } else {
    n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
    n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
  }

  if (l3 > l2) {
@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
    const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
    const Index m_per_thread = numext::div_ceil(m, num_threads);
    if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
      m = m_cache & mr_mask;
      m = m_cache - (m_cache % mr);
      eigen_internal_assert(m > 0);
    } else {
      m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
      m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
    }
  }
}

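The old mask-based rounding only works when the block sizes are powers of two; the replacement rounds down with a plain modulo, which also handles arbitrary multiples. A stand-alone sketch of the idiom (the helper name is illustrative, not from the patch):

    // Sketch: round x down to a multiple of m without assuming m is a power of two.
    #include <cassert>

    template <typename Index>
    Index round_down_to_multiple(Index x, Index m) {
      return x - (x % m);   // equivalent to x & ~(m-1) only when m is a power of two
    }

    int main() {
      assert(round_down_to_multiple(37, 8)  == 32);  // matches the old mask form (37 & -8)
      assert(round_down_to_multiple(37, 12) == 36);  // the mask form would be wrong here
      return 0;
    }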
@ -156,29 +152,29 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
  l2 = 32*1024;
  l3 = 512*1024;
#endif

  // Early return for small problems because the computations below are time consuming for small problems.
  // Perhaps it would make more sense to consider k*n*m??
  // Note that for very tiny problems, this function should be bypassed anyway
  // because we use the coefficient-based implementation for them.
  if((std::max)(k,(std::max)(m,n))<48)
  if((numext::maxi)(k,(numext::maxi)(m,n))<48)
    return;

  typedef typename Traits::ResScalar ResScalar;
  enum {
    k_peeling = 8,
    k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
    k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
  };

  // ---- 1st level of blocking on L1, yields kc ----

  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
  // We also include a register-level block of the result (mr x nr).
  // (In an ideal world only the lhs panel would stay in L1.)
  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
  const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
  const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
const Index old_k = k;
|
||||
if(k>max_kc)
|
||||
{
|
||||
@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// while keeping the same number of sweeps over the result.
|
||||
k = (k%max_kc)==0 ? max_kc
|
||||
: max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
|
||||
|
||||
|
||||
eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
|
||||
}
|
||||
|
||||
|
||||
// ---- 2nd level of blocking on max(L2,L3), yields nc ----
|
||||
|
||||
|
||||
// TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
|
||||
// actual_l2 = max(l2, l3/nb_core_sharing_l3)
|
||||
// The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
|
||||
@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
#else
|
||||
const Index actual_l2 = 1572864; // == 1.5 MB
|
||||
#endif
|
||||
|
||||
|
||||
// Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
|
||||
// The second half is implicitly reserved to access the result and lhs coefficients.
|
||||
// When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
|
||||
@ -223,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
|
||||
}
|
||||
// WARNING Below, we assume that Traits::nr is a power of two.
|
||||
Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
|
||||
Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
|
||||
if(n>nc)
|
||||
{
|
||||
// We are really blocking over the columns:
|
||||
@ -252,9 +248,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
// we have both L2 and L3, and problem is small enough to be kept in L2
|
||||
// Let's choose m such that lhs's block fit in 1/3 of L2
|
||||
actual_lm = l2;
|
||||
max_mc = (std::min<Index>)(576,max_mc);
|
||||
max_mc = (numext::mini<Index>)(576,max_mc);
|
||||
}
|
||||
Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
|
||||
Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
|
||||
if (mc > Traits::mr) mc -= mc % Traits::mr;
|
||||
else if (mc==0) return;
|
||||
m = (m%mc)==0 ? mc
|
||||
@ -263,13 +259,14 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Index>
|
||||
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
||||
{
|
||||
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
|
||||
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
|
||||
k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
|
||||
m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
|
||||
n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
|
||||
k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
|
||||
m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
|
||||
n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
@ -296,11 +293,11 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
||||
*
|
||||
* \sa setCpuCacheSizes */
|
||||
|
||||
template<typename LhsScalar, typename RhsScalar, int KcFactor>
|
||||
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
|
||||
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
||||
{
|
||||
if (!useSpecificBlockingSizes(k, m, n)) {
|
||||
evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
|
||||
evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
|
||||
}
|
||||
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
@ -314,10 +311,10 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
|
||||
if (n > nr) n -= n % nr;
|
||||
}
|
||||
|
||||
template<typename LhsScalar, typename RhsScalar>
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index>
|
||||
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
||||
{
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||
@ -2225,6 +2222,16 @@ inline std::ptrdiff_t l2CacheSize()
  return l2;
}

/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSize */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}
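A brief usage sketch for the new accessor, next to the existing l1CacheSize()/l2CacheSize() queries; illustrative only, not part of the patch.

    // Sketch: report the cache sizes Eigen will feed into its blocking heuristics.
    #include <Eigen/Core>
    #include <iostream>

    int main() {
      std::cout << "L1: " << Eigen::l1CacheSize() << " bytes\n"
                << "L2: " << Eigen::l2CacheSize() << " bytes\n"
                << "L3: " << Eigen::l3CacheSize() << " bytes\n";  // added by this patch
      return 0;
    }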

/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the algorithms working per blocks.
||||
|
@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
|
||||
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
|
||||
const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
|
||||
{
|
||||
general_matrix_matrix_triangular_product<Index,
|
||||
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
|
||||
|
@ -27,13 +27,13 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C
|
||||
HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
|
||||
};
|
||||
static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
|
||||
};
|
||||
|
||||
template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
|
||||
EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
|
||||
::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
|
||||
const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
|
||||
{
|
||||
static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
|
||||
Index size = (std::min)(_rows,_cols);
|
||||
|
@ -83,7 +83,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
// coherence when accessing the rhs elements
|
||||
std::ptrdiff_t l1, l2, l3;
|
||||
manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
||||
Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
|
||||
Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
|
||||
subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
|
||||
|
||||
for(Index k2=IsLower ? 0 : size;
|
||||
|
@ -371,10 +371,10 @@
|
||||
// Does the compiler support const expressions?
|
||||
#ifdef __CUDACC__
|
||||
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
|
||||
#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500)
|
||||
#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)
|
||||
#define EIGEN_HAS_CONSTEXPR 1
|
||||
#endif
|
||||
#elif (defined(__cplusplus) && __cplusplus >= 201402L) || \
|
||||
#elif __has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
|
||||
EIGEN_GNUC_AT_LEAST(4,8)
|
||||
#define EIGEN_HAS_CONSTEXPR 1
|
||||
#endif
|
||||
@ -572,12 +572,12 @@ namespace Eigen {
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
// Static and dynamic alignment control
|
||||
//
|
||||
//
|
||||
// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
|
||||
// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
|
||||
// a default value is automatically computed based on architecture, compiler, and OS.
|
||||
//
|
||||
//
|
||||
// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
|
||||
// to be used to declare statically aligned buffers.
|
||||
//------------------------------------------------------------------------------------------
|
||||
@ -640,7 +640,7 @@ namespace Eigen {
|
||||
#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
|
||||
// Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
|
||||
|
||||
|
||||
// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
|
||||
// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
|
||||
// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
|
||||
@ -667,13 +667,13 @@ namespace Eigen {
|
||||
#else
|
||||
#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
|
||||
#endif
|
||||
|
||||
|
||||
#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
|
||||
#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
|
||||
#else
|
||||
#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES
|
||||
|
@ -243,8 +243,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
|
||||
{
|
||||
workspace.resize(rows());
|
||||
Index vecs = m_length;
|
||||
if( internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value
|
||||
&& internal::extract_data(dst) == internal::extract_data(m_vectors))
|
||||
if(is_same_dense(dst,m_vectors))
|
||||
{
|
||||
// in-place
|
||||
dst.diagonal().setOnes();
|
||||
|
@ -231,6 +231,15 @@ template<typename _MatrixType> class FullPivLU
      return Solve<FullPivLU, Rhs>(*this, b.derived());
    }

    /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
        the LU decomposition.
      */
    inline RealScalar rcond() const
    {
      eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
      return internal::rcond_estimate_helper(m_l1_norm, *this);
    }

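A short usage sketch for the rcond() accessor this patch adds to the LU decompositions; the matrix is illustrative, and rcond() returns an estimate of 1 / (||A||_1 * ||A^-1||_1) rather than the exact value.

    // Sketch: condition-number estimate from a decomposition, without forming the inverse.
    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(50, 50);
      Eigen::FullPivLU<Eigen::MatrixXd> lu(A);
      // Estimated reciprocal condition number in the 1-norm; much cheaper than
      // computing A.inverse() and taking norms explicitly.
      std::cout << "rcond estimate: " << lu.rcond() << std::endl;
      return 0;
    }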
||||
/** \returns the determinant of the matrix of which
|
||||
* *this is the LU decomposition. It has only linear complexity
|
||||
* (that is, O(n) where n is the dimension of the square matrix)
|
||||
@ -410,6 +419,7 @@ template<typename _MatrixType> class FullPivLU
|
||||
IntColVectorType m_rowsTranspositions;
|
||||
IntRowVectorType m_colsTranspositions;
|
||||
Index m_det_pq, m_nonzero_pivots;
|
||||
RealScalar m_l1_norm;
|
||||
RealScalar m_maxpivot, m_prescribedThreshold;
|
||||
bool m_isInitialized, m_usePrescribedThreshold;
|
||||
};
|
||||
@ -455,11 +465,12 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType>
|
||||
// the permutations are stored as int indices, so just to be sure:
|
||||
eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
|
||||
|
||||
m_isInitialized = true;
|
||||
m_lu = matrix.derived();
|
||||
m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
|
||||
|
||||
computeInPlace();
|
||||
|
||||
m_isInitialized = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -76,7 +76,6 @@ template<typename _MatrixType> class PartialPivLU
|
||||
typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
|
||||
typedef typename MatrixType::PlainObject PlainObject;
|
||||
|
||||
|
||||
/**
|
||||
* \brief Default Constructor.
|
||||
*
|
||||
@ -152,6 +151,15 @@ template<typename _MatrixType> class PartialPivLU
|
||||
return Solve<PartialPivLU, Rhs>(*this, b.derived());
|
||||
}
|
||||
|
||||
/** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
|
||||
the LU decomposition.
|
||||
*/
|
||||
inline RealScalar rcond() const
|
||||
{
|
||||
eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
|
||||
return internal::rcond_estimate_helper(m_l1_norm, *this);
|
||||
}
|
||||
|
||||
/** \returns the inverse of the matrix of which *this is the LU decomposition.
|
||||
*
|
||||
* \warning The matrix being decomposed here is assumed to be invertible. If you need to check for
|
||||
@ -178,7 +186,7 @@ template<typename _MatrixType> class PartialPivLU
|
||||
*
|
||||
* \sa MatrixBase::determinant()
|
||||
*/
|
||||
typename internal::traits<MatrixType>::Scalar determinant() const;
|
||||
Scalar determinant() const;
|
||||
|
||||
MatrixType reconstructedMatrix() const;
|
||||
|
||||
@ -247,6 +255,7 @@ template<typename _MatrixType> class PartialPivLU
|
||||
PermutationType m_p;
|
||||
TranspositionType m_rowsTranspositions;
|
||||
Index m_det_p;
|
||||
RealScalar m_l1_norm;
|
||||
bool m_isInitialized;
|
||||
};
|
||||
|
||||
@ -256,6 +265,7 @@ PartialPivLU<MatrixType>::PartialPivLU()
|
||||
m_p(),
|
||||
m_rowsTranspositions(),
|
||||
m_det_p(0),
|
||||
m_l1_norm(0),
|
||||
m_isInitialized(false)
|
||||
{
|
||||
}
|
||||
@ -266,6 +276,7 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size)
|
||||
m_p(size),
|
||||
m_rowsTranspositions(size),
|
||||
m_det_p(0),
|
||||
m_l1_norm(0),
|
||||
m_isInitialized(false)
|
||||
{
|
||||
}
|
||||
@ -277,6 +288,7 @@ PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
|
||||
m_p(matrix.rows()),
|
||||
m_rowsTranspositions(matrix.rows()),
|
||||
m_det_p(0),
|
||||
m_l1_norm(0),
|
||||
m_isInitialized(false)
|
||||
{
|
||||
compute(matrix.derived());
|
||||
@ -467,6 +479,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu
|
||||
eigen_assert(matrix.rows()<NumTraits<int>::highest());
|
||||
|
||||
m_lu = matrix.derived();
|
||||
m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
|
||||
|
||||
eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
|
||||
const Index size = matrix.rows();
|
||||
@ -484,7 +497,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu
|
||||
}
|
||||
|
||||
template<typename MatrixType>
|
||||
typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
|
||||
typename PartialPivLU<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
|
||||
{
|
||||
eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
|
||||
return Scalar(m_det_p) * m_lu.diagonal().prod();
|
||||
|
@ -397,6 +397,10 @@ CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition<
|
||||
|
||||
const Index rank = m_cpqr.rank();
|
||||
const Index cols = matrix.cols();
|
||||
const Index rows = matrix.rows();
|
||||
m_zCoeffs.resize((std::min)(rows, cols));
|
||||
m_temp.resize(cols);
|
||||
|
||||
if (rank < cols) {
|
||||
// We have reduced the (permuted) matrix to the form
|
||||
// [R11 R12]
|
||||
|
@ -350,7 +350,8 @@ template<typename MatrixType, int QRPreconditioner>
|
||||
struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
|
||||
{
|
||||
typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
|
||||
static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; }
|
||||
};
|
||||
|
||||
template<typename MatrixType, int QRPreconditioner>
|
||||
@ -359,19 +360,30 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
|
||||
typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
|
||||
static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry)
|
||||
{
|
||||
using std::sqrt;
|
||||
using std::abs;
|
||||
Scalar z;
|
||||
JacobiRotation<Scalar> rot;
|
||||
RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
|
||||
|
||||
|
||||
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
|
||||
const RealScalar precision = NumTraits<Scalar>::epsilon();
|
||||
|
||||
if(n==0)
|
||||
{
|
||||
z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
|
||||
work_matrix.row(p) *= z;
|
||||
if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
|
||||
if(work_matrix.coeff(q,q)!=Scalar(0))
|
||||
// make sure first column is zero
|
||||
work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0);
|
||||
|
||||
if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
|
||||
{
|
||||
// work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n
|
||||
z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
|
||||
work_matrix.row(p) *= z;
|
||||
if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
|
||||
}
|
||||
if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
|
||||
{
|
||||
z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
|
||||
work_matrix.row(q) *= z;
|
||||
@ -385,19 +397,25 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
|
||||
rot.s() = work_matrix.coeff(q,p) / n;
|
||||
work_matrix.applyOnTheLeft(p,q,rot);
|
||||
if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
|
||||
if(work_matrix.coeff(p,q) != Scalar(0))
|
||||
if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
|
||||
{
|
||||
z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
|
||||
work_matrix.col(q) *= z;
|
||||
if(svd.computeV()) svd.m_matrixV.col(q) *= z;
|
||||
}
|
||||
if(work_matrix.coeff(q,q) != Scalar(0))
|
||||
if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
|
||||
{
|
||||
z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
|
||||
work_matrix.row(q) *= z;
|
||||
if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
|
||||
}
|
||||
}
|
||||
|
||||
// update largest diagonal entry
|
||||
maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q))));
|
||||
// and check whether the 2x2 block is already diagonal
|
||||
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
|
||||
return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold;
|
||||
}
|
||||
};
|
||||
|
||||
@ -414,7 +432,6 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
|
||||
JacobiRotation<RealScalar> rot1;
|
||||
RealScalar t = m.coeff(0,0) + m.coeff(1,1);
|
||||
RealScalar d = m.coeff(1,0) - m.coeff(0,1);
|
||||
|
||||
if(d == RealScalar(0))
|
||||
{
|
||||
rot1.s() = RealScalar(0);
|
||||
@ -707,6 +724,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
|
||||
}
|
||||
|
||||
/*** step 2. The main Jacobi SVD iteration. ***/
|
||||
RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff();
|
||||
|
||||
bool finished = false;
|
||||
while(!finished)
|
||||
@ -722,25 +740,27 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
|
||||
// if this 2x2 sub-matrix is not diagonal already...
|
||||
// notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
|
||||
// keep us iterating forever. Similarly, small denormal numbers are considered zero.
|
||||
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero,
|
||||
precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)),
|
||||
abs(m_workMatrix.coeff(q,q))));
|
||||
// We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
|
||||
RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
|
||||
if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
|
||||
{
|
||||
finished = false;
|
||||
|
||||
// perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
|
||||
internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
|
||||
JacobiRotation<RealScalar> j_left, j_right;
|
||||
internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
|
||||
// the complex to real operation returns true if the updated 2x2 block is not already diagonal
|
||||
if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry))
|
||||
{
|
||||
JacobiRotation<RealScalar> j_left, j_right;
|
||||
internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
|
||||
|
||||
// accumulate resulting Jacobi rotations
|
||||
m_workMatrix.applyOnTheLeft(p,q,j_left);
|
||||
if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
|
||||
// accumulate resulting Jacobi rotations
|
||||
m_workMatrix.applyOnTheLeft(p,q,j_left);
|
||||
if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
|
||||
|
||||
m_workMatrix.applyOnTheRight(p,q,j_right);
|
||||
if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
|
||||
m_workMatrix.applyOnTheRight(p,q,j_right);
|
||||
if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
|
||||
|
||||
// keep track of the largest diagonal coefficient
|
||||
maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -22,7 +22,7 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
|
||||
typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
|
||||
|
||||
class InnerIterator;
|
||||
// class ReverseInnerIterator;
|
||||
class ReverseInnerIterator;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
|
||||
|
@ -986,7 +986,7 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
|
||||
&m_sluStat, &info, Scalar());
|
||||
StatFree(&m_sluStat);
|
||||
|
||||
if(&x.coeffRef(0) != x_ref.data())
|
||||
if(x.derived().data() != x_ref.data())
|
||||
x = x_ref;
|
||||
|
||||
m_info = info==0 ? Success : NumericalIssue;
|
||||
|
@ -280,3 +280,21 @@ operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
}

/** \returns an expression of the coefficient-wise ^ operator of *this and \a other
  *
  * \warning this operator is for expressions of bool only.
  *
  * Example: \include Cwise_boolean_xor.cpp
  * Output: \verbinclude Cwise_boolean_xor.out
  *
  * \sa operator&&(), select()
  */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
inline const CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
{
  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
  return CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>(derived(),other.derived());
}

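A usage sketch for the new coefficient-wise ^ operator on boolean arrays; the values below are illustrative and not taken from the patch's Cwise_boolean_xor.cpp snippet.

    // Sketch: coefficient-wise xor of two bool arrays via the new operator^.
    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Array<bool, 1, 4> a, b;
      a << true, true, false, false;
      b << true, false, true, false;
      std::cout << (a ^ b) << std::endl;   // true exactly where the inputs differ
      return 0;
    }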
|
@ -22,7 +22,6 @@
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <CoreServices/CoreServices.h>
|
||||
#include <mach/mach_time.h>
|
||||
#else
|
||||
# include <unistd.h>
|
||||
|
@ -201,9 +201,15 @@ template <typename Device, typename T> class BenchmarkSuite {
|
||||
size_b[1] = k_/2;
|
||||
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
Eigen::array<TensorIndex, 2> strides;
|
||||
strides[0] = 1;
|
||||
strides[1] = 2;
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
|
||||
#endif
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
|
@ -29,8 +29,8 @@ BM_FuncGPU(padding);
|
||||
BM_FuncGPU(striding);
|
||||
BM_FuncGPU(broadcasting);
|
||||
BM_FuncGPU(coeffWiseOp);
|
||||
//BM_FuncGPU(algebraicFunc);
|
||||
//BM_FuncGPU(transcendentalFunc);
|
||||
BM_FuncGPU(algebraicFunc);
|
||||
BM_FuncGPU(transcendentalFunc);
|
||||
BM_FuncGPU(rowReduction);
|
||||
BM_FuncGPU(colReduction);
|
||||
|
||||
@ -48,11 +48,11 @@ BM_FuncGPU(colReduction);
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
||||
|
||||
|
||||
/*BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
|
||||
*/
|
||||
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
||||
|
@ -159,7 +159,7 @@ int EIGEN_BLAS_FUNC(trsm)(const char *side, const char *uplo, const char *opa, c
|
||||
return 0;
|
||||
|
||||
int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
|
||||
|
||||
|
||||
if(SIDE(*side)==LEFT)
|
||||
{
|
||||
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
|
||||
@ -385,7 +385,7 @@ int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const
|
||||
|
||||
int info = 0;
|
||||
if(UPLO(*uplo)==INVALID) info = 1;
|
||||
else if(OP(*op)==INVALID) info = 2;
|
||||
else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) ) info = 2;
|
||||
else if(*n<0) info = 3;
|
||||
else if(*k<0) info = 4;
|
||||
else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7;
|
||||
@ -447,7 +447,7 @@ int EIGEN_BLAS_FUNC(syr2k)(const char *uplo, const char *op, const int *n, const
|
||||
|
||||
int info = 0;
|
||||
if(UPLO(*uplo)==INVALID) info = 1;
|
||||
else if(OP(*op)==INVALID) info = 2;
|
||||
else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) ) info = 2;
|
||||
else if(*n<0) info = 3;
|
||||
else if(*k<0) info = 4;
|
||||
else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7;
|
||||
@ -609,7 +609,7 @@ int EIGEN_BLAS_FUNC(herk)(const char *uplo, const char *op, const int *n, const
|
||||
else
|
||||
if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero();
|
||||
else matrix(c, *n, *n, *ldc).triangularView<StrictlyLower>() *= beta;
|
||||
|
||||
|
||||
if(beta!=Scalar(0))
|
||||
{
|
||||
matrix(c, *n, *n, *ldc).diagonal().real() *= beta;
|
||||
|
@ -17,6 +17,12 @@
|
||||
#include <Eigen/Cholesky>
|
||||
#include <Eigen/QR>
|
||||
|
||||
template<typename MatrixType, int UpLo>
|
||||
typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
|
||||
MatrixType symm = m.template selfadjointView<UpLo>();
|
||||
return symm.cwiseAbs().colwise().sum().maxCoeff();
|
||||
}
|
||||
|
||||
template<typename MatrixType,template <typename,int> class CholType> void test_chol_update(const MatrixType& symm)
|
||||
{
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
@ -77,7 +83,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
{
|
||||
SquareMatrixType symmUp = symm.template triangularView<Upper>();
|
||||
SquareMatrixType symmLo = symm.template triangularView<Lower>();
|
||||
|
||||
|
||||
LLT<SquareMatrixType,Lower> chollo(symmLo);
|
||||
VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
|
||||
vecX = chollo.solve(vecB);
|
||||
@ -85,6 +91,14 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
matX = chollo.solve(matB);
|
||||
VERIFY_IS_APPROX(symm * matX, matB);
|
||||
|
||||
const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols));
|
||||
RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
|
||||
matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
|
||||
RealScalar rcond_est = chollo.rcond();
|
||||
// Verify that the estimated condition number is within a factor of 10 of the
|
||||
// truth.
|
||||
VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
|
||||
|
||||
// test the upper mode
|
||||
LLT<SquareMatrixType,Upper> cholup(symmUp);
|
||||
VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix());
|
||||
@ -93,6 +107,15 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
matX = cholup.solve(matB);
|
||||
VERIFY_IS_APPROX(symm * matX, matB);
|
||||
|
||||
// Verify that the estimated condition number is within a factor of 10 of the
|
||||
// truth.
|
||||
const MatrixType symmUp_inverse = cholup.solve(MatrixType::Identity(rows,cols));
|
||||
rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
|
||||
matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
|
||||
rcond_est = cholup.rcond();
|
||||
VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
|
||||
|
||||
|
||||
MatrixType neg = -symmLo;
|
||||
chollo.compute(neg);
|
||||
VERIFY(chollo.info()==NumericalIssue);
|
||||
@ -101,7 +124,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
VERIFY_IS_APPROX(MatrixType(chollo.matrixU().transpose().conjugate()), MatrixType(chollo.matrixL()));
|
||||
VERIFY_IS_APPROX(MatrixType(cholup.matrixL().transpose().conjugate()), MatrixType(cholup.matrixU()));
|
||||
VERIFY_IS_APPROX(MatrixType(cholup.matrixU().transpose().conjugate()), MatrixType(cholup.matrixL()));
|
||||
|
||||
|
||||
// test some special use cases of SelfCwiseBinaryOp:
|
||||
MatrixType m1 = MatrixType::Random(rows,cols), m2(rows,cols);
|
||||
m2 = m1;
|
||||
@ -137,6 +160,15 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
matX = ldltlo.solve(matB);
|
||||
VERIFY_IS_APPROX(symm * matX, matB);
|
||||
|
||||
const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols));
|
||||
RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
|
||||
matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
|
||||
RealScalar rcond_est = ldltlo.rcond();
|
||||
// Verify that the estimated condition number is within a factor of 10 of the
|
||||
// truth.
|
||||
VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
|
||||
|
||||
|
||||
LDLT<SquareMatrixType,Upper> ldltup(symmUp);
|
||||
VERIFY_IS_APPROX(symm, ldltup.reconstructedMatrix());
|
||||
vecX = ldltup.solve(vecB);
|
||||
@ -144,6 +176,14 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
matX = ldltup.solve(matB);
|
||||
VERIFY_IS_APPROX(symm * matX, matB);
|
||||
|
||||
// Verify that the estimated condition number is within a factor of 10 of the
|
||||
// truth.
|
||||
const MatrixType symmUp_inverse = ldltup.solve(MatrixType::Identity(rows,cols));
|
||||
rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
|
||||
matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
|
||||
rcond_est = ldltup.rcond();
|
||||
VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
|
||||
|
||||
VERIFY_IS_APPROX(MatrixType(ldltlo.matrixL().transpose().conjugate()), MatrixType(ldltlo.matrixU()));
|
||||
VERIFY_IS_APPROX(MatrixType(ldltlo.matrixU().transpose().conjugate()), MatrixType(ldltlo.matrixL()));
|
||||
VERIFY_IS_APPROX(MatrixType(ldltup.matrixL().transpose().conjugate()), MatrixType(ldltup.matrixU()));
|
||||
@ -167,7 +207,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
// restore
|
||||
if(sign == -1)
|
||||
symm = -symm;
|
||||
|
||||
|
||||
// check matrices coming from linear constraints with Lagrange multipliers
|
||||
if(rows>=3)
|
||||
{
|
||||
@ -183,7 +223,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
vecX = ldltlo.solve(vecB);
|
||||
VERIFY_IS_APPROX(A * vecX, vecB);
|
||||
}
|
||||
|
||||
|
||||
// check non-full rank matrices
|
||||
if(rows>=3)
|
||||
{
|
||||
@ -199,7 +239,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
vecX = ldltlo.solve(vecB);
|
||||
VERIFY_IS_APPROX(A * vecX, vecB);
|
||||
}
|
||||
|
||||
|
||||
// check matrices with a wide spectrum
|
||||
if(rows>=3)
|
||||
{
|
||||
@ -225,7 +265,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
|
||||
{
|
||||
RealScalar large_tol = std::sqrt(test_precision<RealScalar>());
|
||||
VERIFY((A * vecX).isApprox(vecB, large_tol));
|
||||
|
||||
|
||||
++g_test_level;
|
||||
VERIFY_IS_APPROX(A * vecX,vecB);
|
||||
--g_test_level;
|
||||
@ -314,14 +354,14 @@ template<typename MatrixType> void cholesky_bug241(const MatrixType& m)
|
||||
}
|
||||
|
||||
// LDLT is not guaranteed to work for indefinite matrices, but happens to work fine if matrix is diagonal.
|
||||
// This test checks that LDLT reports correctly that matrix is indefinite.
|
||||
// This test checks that LDLT reports correctly that matrix is indefinite.
|
||||
// See http://forum.kde.org/viewtopic.php?f=74&t=106942 and bug 736
|
||||
template<typename MatrixType> void cholesky_definiteness(const MatrixType& m)
|
||||
{
|
||||
eigen_assert(m.rows() == 2 && m.cols() == 2);
|
||||
MatrixType mat;
|
||||
LDLT<MatrixType> ldlt(2);
|
||||
|
||||
|
||||
{
|
||||
mat << 1, 0, 0, -1;
|
||||
ldlt.compute(mat);
|
||||
@ -384,11 +424,11 @@ void test_cholesky()
|
||||
CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) );
|
||||
CALL_SUBTEST_4( cholesky(Matrix3f()) );
|
||||
CALL_SUBTEST_5( cholesky(Matrix4d()) );
|
||||
|
||||
s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
|
||||
|
||||
s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
|
||||
CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) );
|
||||
TEST_SET_BUT_UNUSED_VARIABLE(s)
|
||||
|
||||
|
||||
s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
|
||||
CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) );
|
||||
TEST_SET_BUT_UNUSED_VARIABLE(s)
|
||||
@ -402,6 +442,6 @@ void test_cholesky()
|
||||
// Test problem size constructors
|
||||
CALL_SUBTEST_9( LLT<MatrixXf>(10) );
|
||||
CALL_SUBTEST_9( LDLT<MatrixXf>(10) );
|
||||
|
||||
|
||||
TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries)
|
||||
}
|
||||
|
test/lu.cpp
@ -11,6 +11,11 @@
#include <Eigen/LU>
using namespace std;

template<typename MatrixType>
typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
  return m.cwiseAbs().colwise().sum().maxCoeff();
}

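The helper above supplies the ground truth the test compares against: it computes the matrix 1-norm (maximum absolute column sum), so the exact reciprocal condition number used below is rcond(A) = 1 / (||A||_1 * ||A^-1||_1). A small stand-alone sketch of that check, outside the test harness and with an illustrative matrix:

    // Sketch: exact 1-norm rcond versus the estimator, mirroring the test below.
    #include <Eigen/Dense>
    #include <cassert>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(30, 30);
      Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);

      Eigen::MatrixXd Ainv = lu.inverse();
      double norm_A    = A.cwiseAbs().colwise().sum().maxCoeff();     // ||A||_1
      double norm_Ainv = Ainv.cwiseAbs().colwise().sum().maxCoeff();  // ||A^-1||_1
      double rcond_exact = 1.0 / (norm_A * norm_Ainv);

      // The estimator is only expected to be within a factor of ~10 of the truth.
      assert(lu.rcond() > rcond_exact / 10 && lu.rcond() < rcond_exact * 10);
      return 0;
    }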
||||
template<typename MatrixType> void lu_non_invertible()
|
||||
{
|
||||
typedef typename MatrixType::Index Index;
|
||||
@ -143,7 +148,14 @@ template<typename MatrixType> void lu_invertible()
|
||||
m3 = MatrixType::Random(size,size);
|
||||
m2 = lu.solve(m3);
|
||||
VERIFY_IS_APPROX(m3, m1*m2);
|
||||
VERIFY_IS_APPROX(m2, lu.inverse()*m3);
|
||||
MatrixType m1_inverse = lu.inverse();
|
||||
VERIFY_IS_APPROX(m2, m1_inverse*m3);
|
||||
|
||||
RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
|
||||
const RealScalar rcond_est = lu.rcond();
|
||||
// Verify that the estimated condition number is within a factor of 10 of the
|
||||
// truth.
|
||||
VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
|
||||
|
||||
// test solve with transposed
|
||||
lu.template _solve_impl_transposed<false>(m3, m2);
|
||||
@ -170,6 +182,7 @@ template<typename MatrixType> void lu_partial_piv()
|
||||
PartialPivLU.h
|
||||
*/
|
||||
typedef typename MatrixType::Index Index;
|
||||
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
|
||||
Index size = internal::random<Index>(1,4);
|
||||
|
||||
MatrixType m1(size, size), m2(size, size), m3(size, size);
|
||||
@ -181,7 +194,13 @@ template<typename MatrixType> void lu_partial_piv()
|
||||
m3 = MatrixType::Random(size,size);
|
||||
m2 = plu.solve(m3);
|
||||
VERIFY_IS_APPROX(m3, m1*m2);
|
||||
VERIFY_IS_APPROX(m2, plu.inverse()*m3);
|
||||
MatrixType m1_inverse = plu.inverse();
|
||||
VERIFY_IS_APPROX(m2, m1_inverse*m3);
|
||||
|
||||
RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
|
||||
const RealScalar rcond_est = plu.rcond();
|
||||
// Verify that the estimate is within a factor of 10 of the truth.
|
||||
VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
|
||||
|
||||
// test solve with transposed
|
||||
plu.template _solve_impl_transposed<false>(m3, m2);
|
||||
|
16
test/main.h
16
test/main.h
@ -275,6 +275,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
|
||||
|
||||
#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a))
|
||||
|
||||
#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b))
|
||||
#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b))
|
||||
|
||||
|
||||
#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b))
|
||||
#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b))
|
||||
#define VERIFY_IS_APPROX(a, b) VERIFY(verifyIsApprox(a, b))
|
||||
@ -316,9 +320,9 @@ inline bool test_isMuchSmallerThan(const float& a, const float& b)
|
||||
{ return internal::isMuchSmallerThan(a, b, test_precision<float>()); }
|
||||
inline bool test_isApproxOrLessThan(const float& a, const float& b)
|
||||
{ return internal::isApproxOrLessThan(a, b, test_precision<float>()); }
|
||||
|
||||
inline bool test_isApprox(const double& a, const double& b)
|
||||
{ return internal::isApprox(a, b, test_precision<double>()); }
|
||||
|
||||
inline bool test_isMuchSmallerThan(const double& a, const double& b)
|
||||
{ return internal::isMuchSmallerThan(a, b, test_precision<double>()); }
|
||||
inline bool test_isApproxOrLessThan(const double& a, const double& b)
|
||||
@ -359,6 +363,12 @@ inline bool test_isApproxOrLessThan(const long double& a, const long double& b)
|
||||
{ return internal::isApproxOrLessThan(a, b, test_precision<long double>()); }
|
||||
#endif // EIGEN_TEST_NO_LONGDOUBLE
|
||||
|
||||
inline bool test_isApprox(const half& a, const half& b)
|
||||
{ return internal::isApprox(a, b, test_precision<half>()); }
|
||||
inline bool test_isMuchSmallerThan(const half& a, const half& b)
|
||||
{ return internal::isMuchSmallerThan(a, b, test_precision<half>()); }
|
||||
inline bool test_isApproxOrLessThan(const half& a, const half& b)
|
||||
{ return internal::isApproxOrLessThan(a, b, test_precision<half>()); }
|
||||
|
||||
// test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox.
|
||||
template<typename T1,typename T2>
|
||||
@ -426,9 +436,7 @@ template<typename T1,typename T2>
|
||||
typename NumTraits<T1>::Real test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0)
|
||||
{
|
||||
typedef typename NumTraits<T1>::Real RealScalar;
|
||||
using std::min;
|
||||
using std::sqrt;
|
||||
return sqrt(RealScalar(numext::abs2(a-b))/RealScalar((min)(numext::abs2(a),numext::abs2(b))));
|
||||
return numext::sqrt(RealScalar(numext::abs2(a-b))/RealScalar((numext::mini)(numext::abs2(a),numext::abs2(b))));
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
|
@ -148,10 +148,14 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
|
||||
VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast<CD>().eval()*mcd);
|
||||
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);
|
||||
|
||||
VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
|
||||
VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
|
||||
VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
|
||||
VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
|
||||
VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView<Upper>(), sd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Upper>());
|
||||
VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView<Lower>(), scd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Lower>());
|
||||
VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
|
||||
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());
|
||||
VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Lower>());
|
||||
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Upper>());
|
||||
|
||||
// Not supported yet: trmm
|
||||
// VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(), sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());
|
||||
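A hedged, stand-alone version of one of the mixed-type checks above: a real scalar times a complex row vector times a real upper-triangular view, compared against the explicitly casted reference. It relies on the mixed-scalar triangular products these VERIFY lines exercise, so it is a sketch of the tested behaviour rather than guaranteed API.

#include <Eigen/Dense>
#include <complex>

int main() {
  typedef std::complex<double> CD;
  Eigen::MatrixXd  md  = Eigen::MatrixXd::Random(4, 4);
  Eigen::VectorXcd vcd = Eigen::VectorXcd::Random(4);
  const double sd = 0.5;
  // Reference: cast the real operand first so everything is complex.
  Eigen::RowVectorXcd ref   = sd * vcd.adjoint() * md.cast<CD>().eval().triangularView<Eigen::Upper>();
  // Mixed-type path exercised by the test.
  Eigen::RowVectorXcd mixed = sd * vcd.adjoint() * md.triangularView<Eigen::Upper>();
  return ref.isApprox(mixed) ? 0 : 1;
}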
|
@ -71,7 +71,7 @@ void test_product_large()
std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
// only makes sure it compiles fine
internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1);
internal::computeProductBlockingSizes<float,float,std::ptrdiff_t>(k1,m1,n1,1);
}

{
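For reference, a hedged sketch of the call checked above. computeProductBlockingSizes lives in Eigen's internal namespace, so it is not a stable public API; at this revision it takes the depth/rows/cols by reference and clamps them to cache-friendly block sizes.

#include <Eigen/Core>
#include <iostream>

int main() {
  std::ptrdiff_t k = 1024, m = 1024, n = 1024;
  // Third template argument is the index type, matching the updated test call.
  Eigen::internal::computeProductBlockingSizes<float, float, std::ptrdiff_t>(k, m, n, 1);
  std::cout << "kc=" << k << " mc=" << m << " nc=" << n << "\n";
  return 0;
}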
|
@ -29,6 +29,9 @@ template<typename Scalar> void check_all_in_range(Scalar x, Scalar y)
|
||||
{
|
||||
mask( check_in_range(x,y)-x )++;
|
||||
}
|
||||
for(Index i=0; i<mask.size(); ++i)
|
||||
if(mask(i)==0)
|
||||
std::cout << "WARNING: value " << x+i << " not reached." << std::endl;
|
||||
VERIFY( (mask>0).all() );
|
||||
}
|
||||
|
||||
|
@ -80,6 +80,8 @@ void svd_fill_random(MatrixType &m, int Option = 0)
|
||||
Index i = internal::random<Index>(0,m.rows()-1);
|
||||
Index j = internal::random<Index>(0,m.cols()-1);
|
||||
m(j,i) = m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
|
||||
if(NumTraits<Scalar>::IsComplex)
|
||||
*(&numext::real_ref(m(j,i))+1) = *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -91,8 +93,14 @@ void svd_fill_random(MatrixType &m, int Option = 0)
|
||||
if(!(dup && unit_uv))
|
||||
{
|
||||
Index n = internal::random<Index>(0,m.size()-1);
|
||||
for(Index i=0; i<n; ++i)
|
||||
m(internal::random<Index>(0,m.rows()-1), internal::random<Index>(0,m.cols()-1)) = samples(internal::random<Index>(0,samples.size()-1));
|
||||
for(Index k=0; k<n; ++k)
|
||||
{
|
||||
Index i = internal::random<Index>(0,m.rows()-1);
|
||||
Index j = internal::random<Index>(0,m.cols()-1);
|
||||
m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
|
||||
if(NumTraits<Scalar>::IsComplex)
|
||||
*(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -74,10 +74,13 @@ template<typename MatrixType> void swap(const MatrixType& m)
|
||||
m1 = m1_copy;
|
||||
m3 = m3_copy;
|
||||
|
||||
// test assertion on mismatching size -- matrix case
|
||||
VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
|
||||
// test assertion on mismatching size -- xpr case
|
||||
VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
|
||||
if(m1.rows()>1)
|
||||
{
|
||||
// test assertion on mismatching size -- matrix case
|
||||
VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
|
||||
// test assertion on mismatching size -- xpr case
|
||||
VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
|
||||
}
|
||||
}
|
||||
|
||||
void test_swap()
|
||||
|
@ -22,7 +22,11 @@ template<typename Dst, typename Src>
bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
{
typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar> > traits;
bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
bool res = traits::Traversal==traversal;
if(unrolling==InnerUnrolling+CompleteUnrolling)
  res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling);
else
  res = res && int(traits::Unrolling)==unrolling;
if(!res)
{
std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl;
@ -147,10 +151,10 @@ struct vectorization_logic
|
||||
|
||||
VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3),
|
||||
InnerVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
|
||||
VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1),
|
||||
InnerVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
|
||||
if(PacketSize>1)
|
||||
{
|
||||
typedef Matrix<Scalar,3,3,ColMajor> Matrix33c;
|
||||
@ -158,17 +162,29 @@ struct vectorization_logic
|
||||
LinearTraversal,CompleteUnrolling));
|
||||
VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
|
||||
LinearTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
|
||||
PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
|
||||
|
||||
|
||||
VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
|
||||
HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal,NoUnrolling));
|
||||
|
||||
|
||||
VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
|
||||
DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
|
||||
|
||||
VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
|
||||
InnerVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
|
||||
InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
|
||||
}
|
||||
|
||||
|
||||
VERIFY(test_redux(Vector1(),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_redux(Matrix3(),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
@ -226,6 +242,7 @@ struct vectorization_logic_half
|
||||
typedef Matrix<Scalar,PacketSize,1> Vector1;
|
||||
typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11;
|
||||
typedef Matrix<Scalar,5*PacketSize,7,ColMajor> Matrix57;
|
||||
typedef Matrix<Scalar,3*PacketSize,5,ColMajor> Matrix35;
|
||||
typedef Matrix<Scalar,5*PacketSize,7,DontAlign|ColMajor> Matrix57u;
|
||||
// typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
|
||||
// typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
|
||||
@ -291,12 +308,24 @@ struct vectorization_logic_half
|
||||
|
||||
VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
|
||||
DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
|
||||
|
||||
VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
|
||||
InnerVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
|
||||
InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
|
||||
}
|
||||
|
||||
VERIFY(test_redux(Vector1(),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_redux(Matrix3(),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_redux(Matrix57(),
|
||||
VERIFY(test_redux(Matrix35(),
|
||||
LinearVectorizedTraversal,CompleteUnrolling));
|
||||
|
||||
VERIFY(test_redux(Matrix57().template block<PacketSize,3>(1,0),
|
||||
|
@ -1,4 +1,4 @@
set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry)
set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)

install(FILES
${Eigen_CXX11_HEADERS}
|
@ -1,51 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_CORE_MODULE
|
||||
#define EIGEN_CXX11_CORE_MODULE
|
||||
|
||||
#include <Eigen/Core>
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
/** \defgroup CXX11_Core_Module C++11 Core Module
|
||||
*
|
||||
* This module provides common core features for all modules that
|
||||
* explicitly depend on C++11. Currently, this is only the Tensor
|
||||
* module. Note that at this stage, you should not need to include
|
||||
* this module directly.
|
||||
*
|
||||
* It also provides a limited fallback for compilers that don't support
|
||||
* CXX11 yet, such as nvcc.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/CXX11/Core>
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "src/Core/util/EmulateArray.h"
|
||||
#include "src/Core/util/MaxSizeVector.h"
|
||||
|
||||
// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
|
||||
// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
|
||||
// supports enough of the standard for our needs
|
||||
#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
|
||||
#include "src/Core/util/CXX11Workarounds.h"
|
||||
#include "src/Core/util/CXX11Meta.h"
|
||||
#else
|
||||
#include "src/Core/util/EmulateCXX11Meta.h"
|
||||
#endif
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
#endif // EIGEN_CXX11_CORE_MODULE
|
||||
|
@ -11,10 +11,12 @@
|
||||
//#ifndef EIGEN_CXX11_TENSOR_MODULE
|
||||
//#define EIGEN_CXX11_TENSOR_MODULE
|
||||
|
||||
#include "Core"
|
||||
#include "../../../Eigen/Core"
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
#include "src/util/CXX11Meta.h"
|
||||
#include "src/util/MaxSizeVector.h"
|
||||
|
||||
/** \defgroup CXX11_Tensor_Module Tensor Module
|
||||
*
|
||||
@ -26,6 +28,7 @@
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
|
||||
@ -51,11 +54,7 @@ typedef unsigned __int64 uint64_t;
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_THREADS
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <deque>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include "ThreadPool"
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_GPU
|
||||
@ -84,6 +83,7 @@ typedef unsigned __int64 uint64_t;
|
||||
|
||||
#include "src/Tensor/TensorBase.h"
|
||||
|
||||
#include "src/Tensor/TensorCostModel.h"
|
||||
#include "src/Tensor/TensorEvaluator.h"
|
||||
#include "src/Tensor/TensorExpr.h"
|
||||
#include "src/Tensor/TensorReduction.h"
|
||||
|
@ -14,6 +14,8 @@
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
#include "src/util/CXX11Meta.h"
|
||||
|
||||
/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
|
||||
*
|
||||
* This module provides a classes that allow for the definition of
|
||||
|
65
unsupported/Eigen/CXX11/ThreadPool
Normal file
@ -0,0 +1,65 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_MODULE
|
||||
#define EIGEN_CXX11_THREADPOOL_MODULE
|
||||
|
||||
#include "../../../Eigen/Core"
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
|
||||
*
|
||||
* This module provides two threadpool implementations:
*  - a simple reference implementation
*  - a faster non-blocking implementation
|
||||
*
|
||||
* This module requires C++11.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/CXX11/ThreadPool>
|
||||
* \endcode
|
||||
*/
|
||||
|
||||
|
||||
// The code depends on CXX11, so only include the module if the
|
||||
// compiler supports it.
|
||||
#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <deque>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
#include "src/util/CXX11Meta.h"
|
||||
#include "src/util/MaxSizeVector.h"
|
||||
|
||||
#include "src/ThreadPool/ThreadLocal.h"
|
||||
#include "src/ThreadPool/ThreadYield.h"
|
||||
#include "src/ThreadPool/EventCount.h"
|
||||
#include "src/ThreadPool/RunQueue.h"
|
||||
#include "src/ThreadPool/ThreadPoolInterface.h"
|
||||
#include "src/ThreadPool/ThreadEnvironment.h"
|
||||
#include "src/ThreadPool/SimpleThreadPool.h"
|
||||
#include "src/ThreadPool/NonBlockingThreadPool.h"
|
||||
|
||||
#endif
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_MODULE
|
||||
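A short usage sketch for the new module (not part of the patch): building the reference SimpleThreadPool and scheduling a few tasks. The include line follows the doc comment above; the class and member names are taken from the headers listed in the module and may differ in later releases.

#include <Eigen/CXX11/ThreadPool>
#include <atomic>
#include <iostream>

int main() {
  std::atomic<int> counter(0);
  {
    Eigen::SimpleThreadPool pool(4);                 // 4 worker threads
    for (int i = 0; i < 100; ++i)
      pool.Schedule([&counter]() { ++counter; });
    // The simple pool is expected to drain its queue before its destructor returns.
  }
  std::cout << "ran " << counter.load() << " tasks\n";
  return 0;
}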
|
@ -1,3 +1,4 @@
add_subdirectory(Core)
add_subdirectory(util)
add_subdirectory(ThreadPool)
add_subdirectory(Tensor)
add_subdirectory(TensorSymmetry)

@ -1 +0,0 @@
add_subdirectory(util)

@ -1,6 +0,0 @@
FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h")

INSTALL(FILES
${Eigen_CXX11_Core_util_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel
)
@ -112,6 +112,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
|
||||
return CoeffReturnType(index, m_impl.coeff(index));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
|
@ -89,6 +89,12 @@ template<typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
|
||||
{
|
||||
typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
|
||||
@ -104,12 +110,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
}
|
||||
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||
{
|
||||
// The dimensions of the lhs and the rhs tensors should be equal to prevent
|
||||
@ -150,6 +150,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
|
||||
return m_leftImpl.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
// We assume that evalPacket or evalScalar is called to perform the
|
||||
// assignment and account for the cost of the write here, but reduce left
|
||||
// cost by one load because we are using m_leftImpl.coeffRef.
|
||||
TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
|
||||
return m_rightImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(
|
||||
numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
|
||||
left.bytes_stored(), left.compute_cycles()) +
|
||||
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
|
||||
|
||||
private:
|
||||
|
@ -334,6 +334,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
|
||||
return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
|
||||
}
|
||||
|
||||
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
|
||||
operator^(const OtherDerived& other) const {
|
||||
return binaryExpr(other.derived(), internal::scalar_boolean_xor_op());
|
||||
}
|
||||
|
||||
// Comparisons and tests.
|
||||
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
|
||||
|
@ -101,6 +101,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -140,9 +143,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
@ -247,9 +247,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index originalIndex = index;
|
||||
|
||||
@ -284,12 +283,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
|
||||
// Todo: this could be extended to the second dimension if we're not
|
||||
// broadcasting alongside the first dimension, and so on.
|
||||
if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
|
||||
if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
} else {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
values[0] = m_impl.coeff(inputIndex);
|
||||
for (int i = 1; i < packetSize; ++i) {
|
||||
for (int i = 1; i < PacketSize; ++i) {
|
||||
values[i] = coeffColMajor(originalIndex+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
@ -300,9 +299,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index originalIndex = index;
|
||||
|
||||
@ -337,12 +335,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
|
||||
// Todo: this could be extended to the second dimension if we're not
|
||||
// broadcasting alongside the first dimension, and so on.
|
||||
if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
|
||||
if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
|
||||
return m_impl.template packet<Unaligned>(inputIndex);
|
||||
} else {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
values[0] = m_impl.coeff(inputIndex);
|
||||
for (int i = 1; i < packetSize; ++i) {
|
||||
for (int i = 1; i < PacketSize; ++i) {
|
||||
values[i] = coeffRowMajor(originalIndex+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
@ -350,6 +348,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
double compute_cost = TensorOpCost::AddCost<Index>();
|
||||
if (NumDims > 0) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
compute_cost += TensorOpCost::DivCost<Index>();
|
||||
if (internal::index_statically_eq<Broadcast>()(i, 1)) {
|
||||
compute_cost +=
|
||||
TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
|
||||
} else {
|
||||
if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
|
||||
compute_cost += TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::ModCost<Index>() +
|
||||
TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
}
|
||||
compute_cost +=
|
||||
TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
}
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
|
@ -134,6 +134,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
typedef typename XprType::Index Index;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
|
||||
enum {
|
||||
// Alignment can't be guaranteed at compile time since it depends on the
|
||||
@ -180,9 +184,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
m_inputOffset = m_stride * op.offset();
|
||||
}
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
@ -202,17 +203,16 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
|
||||
// m_stride is equal to 1, so let's avoid the integer division.
|
||||
eigen_assert(m_stride == 1);
|
||||
Index inputIndex = index * m_inputStride + m_inputOffset;
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = m_impl.coeff(inputIndex);
|
||||
inputIndex += m_inputStride;
|
||||
}
|
||||
@ -226,13 +226,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
} else {
|
||||
const Index idx = index / m_stride;
|
||||
const Index rem = index - idx * m_stride;
|
||||
if (rem + packetSize <= m_stride) {
|
||||
if (rem + PacketSize <= m_stride) {
|
||||
Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
|
||||
return m_impl.template packet<LoadMode>(inputIndex);
|
||||
} else {
|
||||
// Cross the stride boundary. Fallback to slow path.
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index);
|
||||
++index;
|
||||
}
|
||||
@ -242,6 +242,28 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
double cost = 0;
|
||||
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
|
||||
m_dim.actualDim() == 0) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
|
||||
m_dim.actualDim() == NumInputDims - 1)) {
|
||||
cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
|
||||
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
|
||||
m_dim.actualDim() == NumInputDims - 1) ||
|
||||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
|
||||
m_dim.actualDim() == 0)) {
|
||||
cost += TensorOpCost::AddCost<Index>();
|
||||
} else {
|
||||
cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
|
||||
3 * TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
|
||||
CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
|
||||
if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
|
||||
@ -298,6 +320,9 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
typedef typename XprType::Index Index;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -309,9 +334,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
@ -320,17 +342,16 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
|
||||
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
|
||||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
|
||||
// m_stride is equal to 1, so let's avoid the integer division.
|
||||
eigen_assert(this->m_stride == 1);
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->m_impl.coeffRef(inputIndex) = values[i];
|
||||
inputIndex += this->m_inputStride;
|
||||
}
|
||||
@ -342,14 +363,14 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
||||
} else {
|
||||
const Index idx = index / this->m_stride;
|
||||
const Index rem = index - idx * this->m_stride;
|
||||
if (rem + packetSize <= this->m_stride) {
|
||||
if (rem + PacketSize <= this->m_stride) {
|
||||
const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
|
||||
this->m_impl.template writePacket<StoreMode>(inputIndex, x);
|
||||
} else {
|
||||
// Cross stride boundary. Fallback to slow path.
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->coeffRef(index) = values[i];
|
||||
++index;
|
||||
}
|
||||
|
@ -260,6 +260,21 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
|
||||
return rslt;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>() +
|
||||
TensorOpCost::ModCost<Index>());
|
||||
const double lhs_size = m_leftImpl.dimensions().TotalSize();
|
||||
const double rhs_size = m_rightImpl.dimensions().TotalSize();
|
||||
return (lhs_size / (lhs_size + rhs_size)) *
|
||||
m_leftImpl.costPerCoeff(vectorized) +
|
||||
(rhs_size / (lhs_size + rhs_size)) *
|
||||
m_rightImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
|
@ -426,112 +426,6 @@ struct TensorContractionEvaluatorBase
|
||||
buffer, resIncr, alpha);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_leftImpl.cleanup();
|
||||
m_rightImpl.cleanup();
|
||||
|
||||
if (m_result != NULL) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
return m_result[index];
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
|
||||
|
||||
protected:
|
||||
// Prevent assignment
|
||||
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
|
||||
Dimensions m_dimensions;
|
||||
|
||||
contract_t m_k_strides;
|
||||
contract_t m_left_contracting_strides;
|
||||
contract_t m_right_contracting_strides;
|
||||
|
||||
bool m_lhs_inner_dim_contiguous;
|
||||
bool m_rhs_inner_dim_contiguous;
|
||||
bool m_rhs_inner_dim_reordered;
|
||||
|
||||
left_nocontract_t m_i_strides;
|
||||
right_nocontract_t m_j_strides;
|
||||
left_nocontract_t m_left_nocontract_strides;
|
||||
right_nocontract_t m_right_nocontract_strides;
|
||||
|
||||
Index m_i_size;
|
||||
Index m_j_size;
|
||||
Index m_k_size;
|
||||
|
||||
TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
|
||||
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
|
||||
const Device& m_device;
|
||||
Scalar* m_result;
|
||||
};
|
||||
|
||||
|
||||
// evaluator for default device
|
||||
template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
|
||||
public TensorContractionEvaluatorBase<
|
||||
TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
|
||||
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
|
||||
typedef TensorContractionEvaluatorBase<Self> Base;
|
||||
|
||||
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
enum {
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
};
|
||||
|
||||
// Most of the code is assuming that both input tensors are ColMajor. If the
|
||||
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
|
||||
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
|
||||
// will pretend B is LHS and A is RHS.
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
|
||||
|
||||
static const int LDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
|
||||
static const int RDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
|
||||
static const int ContractDims = internal::array_size<Indices>::value;
|
||||
|
||||
typedef array<Index, ContractDims> contract_t;
|
||||
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
||||
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
||||
|
||||
static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
||||
|
||||
// Could we use NumDimensions here?
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
|
||||
Base(op, device) { }
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
|
||||
if (this->m_j_size == 1) {
|
||||
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
|
||||
}
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
|
||||
// columns in left side, rows in right side
|
||||
@ -624,6 +518,116 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
this->m_device.deallocate(blockA);
|
||||
this->m_device.deallocate(blockB);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_leftImpl.cleanup();
|
||||
m_rightImpl.cleanup();
|
||||
|
||||
if (m_result != NULL) {
|
||||
m_device.deallocate(m_result);
|
||||
m_result = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||
return m_result[index];
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
|
||||
|
||||
protected:
|
||||
// Prevent assignment
|
||||
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
|
||||
Dimensions m_dimensions;
|
||||
|
||||
contract_t m_k_strides;
|
||||
contract_t m_left_contracting_strides;
|
||||
contract_t m_right_contracting_strides;
|
||||
|
||||
bool m_lhs_inner_dim_contiguous;
|
||||
bool m_rhs_inner_dim_contiguous;
|
||||
bool m_rhs_inner_dim_reordered;
|
||||
|
||||
left_nocontract_t m_i_strides;
|
||||
right_nocontract_t m_j_strides;
|
||||
left_nocontract_t m_left_nocontract_strides;
|
||||
right_nocontract_t m_right_nocontract_strides;
|
||||
|
||||
Index m_i_size;
|
||||
Index m_j_size;
|
||||
Index m_k_size;
|
||||
|
||||
TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
|
||||
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
|
||||
const Device& m_device;
|
||||
Scalar* m_result;
|
||||
};
|
||||
|
||||
|
||||
// evaluator for default device
|
||||
template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
|
||||
public TensorContractionEvaluatorBase<
|
||||
TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
|
||||
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
|
||||
typedef TensorContractionEvaluatorBase<Self> Base;
|
||||
|
||||
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
enum {
|
||||
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
|
||||
};
|
||||
|
||||
// Most of the code is assuming that both input tensors are ColMajor. If the
|
||||
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
|
||||
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
|
||||
// will pretend B is LHS and A is RHS.
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
|
||||
typedef typename internal::conditional<
|
||||
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
|
||||
|
||||
static const int LDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
|
||||
static const int RDims =
|
||||
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
|
||||
static const int ContractDims = internal::array_size<Indices>::value;
|
||||
|
||||
typedef array<Index, ContractDims> contract_t;
|
||||
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
|
||||
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
|
||||
|
||||
static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
|
||||
|
||||
// Could we use NumDimensions here?
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
|
||||
Base(op, device) { }
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
|
||||
if (this->m_j_size == 1) {
|
||||
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -35,9 +35,7 @@ class TensorContractionBlocking {
|
||||
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
|
||||
}
|
||||
else {
|
||||
if (kc_ && mc_ && nc_) {
|
||||
mc_ = (((m / num_threads) + 15) / 16) * 16;
|
||||
}
|
||||
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -177,7 +177,6 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eva
|
||||
};
|
||||
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename TargetType, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
@ -190,6 +189,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
typedef typename PacketType<SrcType, Device>::type PacketSourceType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -231,6 +231,21 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
|
||||
return converter.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
|
||||
if (vectorized) {
|
||||
const double SrcCoeffRatio =
|
||||
internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
|
||||
const double TgtCoeffRatio =
|
||||
internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
|
||||
return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
|
||||
TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
|
||||
} else {
|
||||
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
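To make the ratio terms above concrete, a small hedged example of a conversion they model: casting a float tensor to double, where (with vectorization) one source packet expands into two target packets, which is what SrcCoeffRatio and TgtCoeffRatio account for.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64);
  a.setRandom();
  Eigen::Tensor<double, 2> b = a.cast<double>();  // a TensorConversionOp under the hood
  return b.dimension(0) == 64 ? 0 : 1;
}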
|
@ -297,6 +297,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
typedef typename XprType::Index Index;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
|
||||
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
|
||||
@ -367,10 +372,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
||||
@ -405,7 +406,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
|
||||
{
|
||||
const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
Index indices[2] = {index, index+PacketSize-1};
|
||||
Index startInputs[2] = {0, 0};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
@ -448,6 +448,23 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
|
||||
// We ignore the use of fused multiply-add.
|
||||
const double convolve_compute_cost =
|
||||
TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
|
||||
const double firstIndex_compute_cost =
|
||||
NumDims *
|
||||
(2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
|
||||
kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
|
||||
m_kernelImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, convolve_compute_cost, vectorized,
|
||||
PacketSize));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
private:
|
||||
@ -773,6 +790,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
|
||||
typedef typename InputArgType::Scalar Scalar;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
@ -1044,6 +1062,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
// TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
|
||||
// model.
|
||||
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
|
||||
// We ignore the use of fused multiply-add.
|
||||
const double convolve_compute_cost =
|
||||
TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
|
||||
const double firstIndex_compute_cost =
|
||||
NumDims *
|
||||
(2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
|
||||
kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
|
||||
m_kernelImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, convolve_compute_cost, vectorized,
|
||||
PacketSize));
|
||||
}
|
||||
|
||||
private:
|
||||
// No assignment (copies are needed by the kernels)
|
||||
TensorEvaluator& operator = (const TensorEvaluator&);
|
||||
|
214
unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
Normal file
@ -0,0 +1,214 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
|
||||
|
||||
//#if !defined(EIGEN_USE_GPU)
|
||||
//#define EIGEN_USE_COST_MODEL
|
||||
//#endif
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorEvaluator
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief A cost model used to limit the number of threads used for evaluating
|
||||
* tensor expressions.
|
||||
*
|
||||
*/
|
||||
|
||||
// Class storing the cost of evaluating a tensor expression in terms of the
|
||||
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
|
||||
class TensorOpCost {
|
||||
public:
|
||||
// TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
|
||||
// model based on minimal reciprocal throughput numbers from Intel or
|
||||
// Agner Fog's tables would be better than what is there now.
|
||||
template <typename ArgType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
|
||||
return internal::functor_traits<
|
||||
internal::scalar_product_op<ArgType, ArgType>>::Cost;
|
||||
}
|
||||
template <typename ArgType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
|
||||
return internal::functor_traits<internal::scalar_sum_op<ArgType>>::Cost;
|
||||
}
|
||||
template <typename ArgType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
|
||||
return internal::functor_traits<
|
||||
internal::scalar_quotient_op<ArgType, ArgType>>::Cost;
|
||||
}
|
||||
template <typename ArgType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
|
||||
return internal::functor_traits<internal::scalar_mod_op<ArgType>>::Cost;
|
||||
}
|
||||
template <typename SrcType, typename TargetType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
|
||||
return internal::functor_traits<
|
||||
internal::scalar_cast_op<SrcType, TargetType>>::Cost;
|
||||
}
|
||||
|
||||
TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
|
||||
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
|
||||
: bytes_loaded_(bytes_loaded),
|
||||
bytes_stored_(bytes_stored),
|
||||
compute_cycles_(compute_cycles) {}
|
||||
|
||||
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
|
||||
bool vectorized, double packet_size)
|
||||
: bytes_loaded_(bytes_loaded),
|
||||
bytes_stored_(bytes_stored),
|
||||
compute_cycles_(vectorized ? compute_cycles / packet_size
|
||||
: compute_cycles) {
|
||||
using std::isfinite;
|
||||
eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
|
||||
eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
|
||||
eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
|
||||
return bytes_loaded_;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
|
||||
return bytes_stored_;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
|
||||
return compute_cycles_;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
|
||||
double load_cost, double store_cost, double compute_cost) const {
|
||||
return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
|
||||
compute_cost * compute_cycles_;
|
||||
}
|
||||
|
||||
// Drop memory access component. Intended for cases when memory accesses are
|
||||
// sequential or are completely masked by computations.
|
||||
EIGEN_DEVICE_FUNC void dropMemoryCost() {
|
||||
bytes_loaded_ = 0;
|
||||
bytes_stored_ = 0;
|
||||
}
|
||||
|
||||
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
|
||||
const TensorOpCost& rhs) {
|
||||
bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
|
||||
bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
|
||||
compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
|
||||
return *this;
|
||||
}
|
||||
|
||||
// TODO(rmlarsen): Define max in terms of total cost, not elementwise.
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
|
||||
const TensorOpCost& rhs) {
|
||||
bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
|
||||
bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
|
||||
compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
|
||||
const TensorOpCost& rhs) {
|
||||
bytes_loaded_ += rhs.bytes_loaded();
|
||||
bytes_stored_ += rhs.bytes_stored();
|
||||
compute_cycles_ += rhs.compute_cycles();
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
|
||||
bytes_loaded_ *= rhs;
|
||||
bytes_stored_ *= rhs;
|
||||
compute_cycles_ *= rhs;
|
||||
return *this;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
|
||||
TensorOpCost lhs, const TensorOpCost& rhs) {
|
||||
lhs += rhs;
|
||||
return lhs;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
|
||||
TensorOpCost lhs, double rhs) {
|
||||
lhs *= rhs;
|
||||
return lhs;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
|
||||
double lhs, TensorOpCost rhs) {
|
||||
rhs *= lhs;
|
||||
return rhs;
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
|
||||
return os << "[bytes_loaded = " << tc.bytes_loaded()
|
||||
<< ", bytes_stored = " << tc.bytes_stored()
|
||||
<< ", compute_cycles = " << tc.compute_cycles() << "]";
|
||||
}
|
||||
|
||||
private:
|
||||
double bytes_loaded_;
|
||||
double bytes_stored_;
|
||||
double compute_cycles_;
|
||||
};
|
||||
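A brief illustration of composing TensorOpCost values as the evaluators in this patch do: per-coefficient loads, a compute estimate, and a store, added together and then reduced to a single weighted figure. The weights below simply mirror the L2 numbers used in totalCost() further down and are illustrative only.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  using Eigen::TensorOpCost;
  // One float loaded from each operand, one add, one float stored.
  TensorOpCost cost = TensorOpCost(sizeof(float), 0, 0) +
                      TensorOpCost(sizeof(float), 0, 0) +
                      TensorOpCost(0, 0, TensorOpCost::AddCost<float>()) +
                      TensorOpCost(0, sizeof(float), 0);
  std::cout << cost << "\n";                              // bytes loaded/stored, cycles
  std::cout << cost.total_cost(11.0 / 64, 11.0 / 64, 1.0) // weighted total in cycles
            << "\n";
  return 0;
}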
|
||||
// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
|
||||
// in [1:max_threads] instead of just switching multi-threading off for small
|
||||
// work units.
|
||||
template <typename Device>
|
||||
class TensorCostModel {
|
||||
public:
|
||||
// Scaling from Eigen compute cost to device cycles.
|
||||
static const int kDeviceCyclesPerComputeCycle = 1;
|
||||
|
||||
// Costs in device cycles.
|
||||
static const int kStartupCycles = 100000;
|
||||
static const int kPerThreadCycles = 100000;
|
||||
static const int kTaskSize = 40000;
|
||||
|
||||
// Returns the number of threads in [1:max_threads] to use for
|
||||
// evaluating an expression with the given output size and cost per
|
||||
// coefficient.
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
|
||||
double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
|
||||
double cost = totalCost(output_size, cost_per_coeff);
|
||||
int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
|
||||
return numext::mini(max_threads, numext::maxi(1, threads));
|
||||
}
|
||||
|
||||
// taskSize assesses parallel task size.
|
||||
// Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
|
||||
// granularity needs to be increased to mitigate parallelization overheads.
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
|
||||
double output_size, const TensorOpCost& cost_per_coeff) {
|
||||
return totalCost(output_size, cost_per_coeff) / kTaskSize;
|
||||
}
|
||||
|
||||
private:
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
|
||||
double output_size, const TensorOpCost& cost_per_coeff) {
|
||||
// Cost of memory fetches from L2 cache. 64 is typical cache line size.
|
||||
// 11 is L2 cache latency on Haswell.
|
||||
// We don't know whether data is in L1, L2 or L3. But we are most interested
|
||||
// in single-threaded computational time around 100us-10ms (smaller time
|
||||
// is too small for parallelization, larger time is not intersting
|
||||
// either because we are probably using all available threads already).
|
||||
// And for the target time range, L2 seems to be what matters. Data set
|
||||
// fitting into L1 is too small to take noticeable time. Data set fitting
|
||||
// only into L3 presumably will take more than 10ms to load and process.
|
||||
const double kLoadCycles = 1.0 / 64 * 11;
|
||||
const double kStoreCycles = 1.0 / 64 * 11;
|
||||
// Scaling from Eigen compute cost to device cycles.
|
||||
return output_size *
|
||||
cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
|
||||
kDeviceCyclesPerComputeCycle);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
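To make the thresholds above concrete, here is a small worked example. The tensor size, per-coefficient cost, and thread count are made-up numbers, and the three-argument TensorOpCost constructor is assumed to be the one used elsewhere in this patch:

  // Illustrative only: 1e6 coefficients, 4 bytes loaded and one compute
  // cycle each, on a machine exposing 16 threads.
  const double output_size = 1000000;
  const Eigen::TensorOpCost cost_per_coeff(/*bytes_loaded=*/4, /*bytes_stored=*/0,
                                           /*compute_cycles=*/1);
  // totalCost  = 1e6 * (4 * 11/64 + 1) = 1,687,500 device cycles
  // numThreads = min(16, max(1, (1687500 - 100000) / 100000 + 0.9)) = 16
  const int n = Eigen::TensorCostModel<Eigen::ThreadPoolDevice>::numThreads(
      output_size, cost_per_coeff, /*max_threads=*/16);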
@ -83,8 +83,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
  typedef typename internal::traits<ArgType>::Index Index;
  static const int NumDims = internal::traits<ArgType>::NumDimensions;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef
      typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
  typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,

@ -101,9 +103,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
    m_dimensions = op.func().dimensions(op.expression());
  }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {

@ -134,6 +133,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

 protected:

@ -236,6 +240,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
  static const int NumDims = internal::traits<XprType>::NumDimensions;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,

@ -252,9 +259,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
    m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
  }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {

@ -284,6 +288,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

 protected:

@ -291,15 +291,9 @@ struct GpuDevice {
  int max_blocks_;
};

#ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  assert(cudaGetLastError() == cudaSuccess);
#else
#define LAUNCH_CUDA_KERNEL(kernel, ...) \
  { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
#endif

// FIXME: Should be device and kernel specific.

@ -44,6 +44,26 @@ struct DefaultDevice {
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
#ifndef __CUDA_ARCH__
    // Running on the host CPU
    return l1CacheSize();
#else
    // Running on a CUDA device, return the amount of shared memory available.
    return 48*1024;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
#ifndef __CUDA_ARCH__
    // Running single threaded on the host CPU
    return l3CacheSize();
#else
    // Running on a CUDA device
    return firstLevelCacheSize();
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
    // Running single threaded on the host CPU

@ -12,145 +12,15 @@

namespace Eigen {

// This defines an interface that ThreadPoolDevice can take to use
// custom thread pools underneath.
class ThreadPoolInterface {
 public:
  virtual void Schedule(std::function<void()> fn) = 0;

  virtual ~ThreadPoolInterface() {}
};

// The implementation of the ThreadPool type ensures that the Schedule method
// runs the functions it is provided in FIFO order when the scheduling is done
// by a single thread.
// Environment provides a way to create threads and also allows one to intercept
// task submission and execution.
template <typename Environment>
class ThreadPoolTempl : public ThreadPoolInterface {
 public:
  // Construct a pool that contains "num_threads" threads.
  explicit ThreadPoolTempl(int num_threads, Environment env = Environment())
      : env_(env), threads_(num_threads), waiters_(num_threads) {
    for (int i = 0; i < num_threads; i++) {
      threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
    }
  }

  // Wait until all scheduled work has finished and then destroy the
  // set of threads.
  ~ThreadPoolTempl() {
    {
      // Wait for all work to get done.
      std::unique_lock<std::mutex> l(mu_);
      while (!pending_.empty()) {
        empty_.wait(l);
      }
      exiting_ = true;

      // Wake up all waiters.
      for (auto w : waiters_) {
        w->ready = true;
        w->task.f = nullptr;
        w->cv.notify_one();
      }
    }

    // Wait for threads to finish.
    for (auto t : threads_) {
      delete t;
    }
  }

  // Schedule fn() for execution in the pool of threads. The functions are
  // executed in the order in which they are scheduled.
  void Schedule(std::function<void()> fn) {
    Task t = env_.CreateTask(std::move(fn));
    std::unique_lock<std::mutex> l(mu_);
    if (waiters_.empty()) {
      pending_.push_back(std::move(t));
    } else {
      Waiter* w = waiters_.back();
      waiters_.pop_back();
      w->ready = true;
      w->task = std::move(t);
      w->cv.notify_one();
    }
  }

 protected:
  void WorkerLoop() {
    std::unique_lock<std::mutex> l(mu_);
    Waiter w;
    Task t;
    while (!exiting_) {
      if (pending_.empty()) {
        // Wait for work to be assigned to me
        w.ready = false;
        waiters_.push_back(&w);
        while (!w.ready) {
          w.cv.wait(l);
        }
        t = w.task;
        w.task.f = nullptr;
      } else {
        // Pick up pending work
        t = std::move(pending_.front());
        pending_.pop_front();
        if (pending_.empty()) {
          empty_.notify_all();
        }
      }
      if (t.f) {
        mu_.unlock();
        env_.ExecuteTask(t);
        t.f = nullptr;
        mu_.lock();
      }
    }
  }

 private:
  typedef typename Environment::Task Task;
  typedef typename Environment::EnvThread Thread;

  struct Waiter {
    std::condition_variable cv;
    Task task;
    bool ready;
  };

  Environment env_;
  std::mutex mu_;
  MaxSizeVector<Thread*> threads_;  // All threads
  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
  std::deque<Task> pending_;        // Queue of pending work
  std::condition_variable empty_;   // Signaled on pending_.empty()
  bool exiting_ = false;
};

struct StlThreadEnvironment {
  struct Task {
    std::function<void()> f;
  };

  // EnvThread constructor must start the thread,
  // destructor must join the thread.
  class EnvThread {
   public:
    EnvThread(std::function<void()> f) : thr_(f) {}
    ~EnvThread() { thr_.join(); }

   private:
    std::thread thr_;
  };

  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
  void ExecuteTask(const Task& t) { t.f(); }
};

typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
// Use the SimpleThreadPool by default. We'll switch to the new non-blocking
// thread pool later.
#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else
template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
typedef SimpleThreadPool ThreadPool;
#endif
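For context, a minimal usage sketch of the pool interface described above. It relies only on Schedule() and on the documented behaviour that the destructor waits for all scheduled work; the include, the chosen thread count, and the per-chunk work are assumptions for illustration:

  #define EIGEN_USE_THREADS
  #include <unsupported/Eigen/CXX11/Tensor>

  void example() {
    Eigen::ThreadPool pool(/*num_threads=*/4);
    for (int i = 0; i < 8; ++i) {
      pool.Schedule([i]() { /* process chunk i (hypothetical work) */ });
    }
  }  // pool goes out of scope: its destructor waits for pending work, then joins the workers.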
// Barrier is an object that allows one or more threads to wait until

@ -264,6 +134,15 @@ struct ThreadPoolDevice {
    return num_threads_;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    return l1CacheSize();
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // The l3 cache size is shared between all the cores.
    return l3CacheSize() / num_threads_;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    // Should return an enum that encodes the ISA supported by the CPU
    return 1;

@ -88,10 +88,14 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
  typedef TensorEvalToOp<ArgType> XprType;
  typedef typename ArgType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
  typedef typename XprType::Index Index;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = true,
    PacketAccess = true,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = true

@ -104,10 +108,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
  }

  typedef typename XprType::Index Index;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {

@ -138,6 +138,13 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // We assume that evalPacket or evalScalar is called to perform the
    // assignment and account for the cost of the write here.
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; }

 private:

@ -101,6 +101,11 @@ struct TensorEvaluator
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
                        internal::unpacket_traits<PacketReturnType>::size);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }

 protected:

@ -184,6 +189,11 @@ struct TensorEvaluator<const Derived, Device>
    return loadConstant(m_data+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
                        internal::unpacket_traits<PacketReturnType>::size);
  }

  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }

 protected:

@ -219,6 +229,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }

@ -237,6 +248,12 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
    return m_functor.template packetOp<Index, PacketReturnType>(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
                        internal::unpacket_traits<PacketReturnType>::size);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:

@ -270,6 +287,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }

@ -293,6 +311,12 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
    return m_argImpl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:

@ -330,6 +354,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
  typedef typename XprType::Scalar Scalar;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const

@ -358,6 +383,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
    return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
    return m_leftImpl.costPerCoeff(vectorized) +
           m_rightImpl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:

@ -398,6 +431,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
  typedef typename XprType::Index Index;
  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const

@ -425,7 +459,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
  {
    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    internal::Selector<PacketSize> select;
    for (Index i = 0; i < PacketSize; ++i) {
      select.select[i] = m_condImpl.coeff(index+i);

@ -435,6 +468,13 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
                                   m_elseImpl.template packet<LoadMode>(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    return m_condImpl.costPerCoeff(vectorized) +
           m_thenImpl.costPerCoeff(vectorized)
               .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
  }

  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }

 private:

@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true>
  {
    const Index size = array_prod(evaluator.dimensions());
    const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
    // Manually unroll this loop since compilers don't do it.
    const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
    for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
      evaluator.evalPacket(i);
      evaluator.evalPacket(i+PacketSize);
      evaluator.evalPacket(i+2*PacketSize);
      evaluator.evalPacket(i+3*PacketSize);
    }
    const Index VectorizedSize = (size / PacketSize) * PacketSize;

    for (Index i = 0; i < VectorizedSize; i += PacketSize) {
    for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
      evaluator.evalPacket(i);
    }
    for (Index i = VectorizedSize; i < size; ++i) {

@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);
  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }

@ -88,28 +96,34 @@ struct EvalRange {

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);

  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    Index i = first;
    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index lastPacket = last - (last % PacketSize);
      for (; i < lastPacket; i += PacketSize) {
      Index last_chunk_offset = last - 4 * PacketSize;
      // Manually unroll this loop since compilers don't do it.
      for (; i <= last_chunk_offset; i += 4*PacketSize) {
        evaluator.evalPacket(i);
        evaluator.evalPacket(i+PacketSize);
        evaluator.evalPacket(i+2*PacketSize);
        evaluator.evalPacket(i+3*PacketSize);
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }

    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};

template<typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
{
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)

@ -119,24 +133,34 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
      const Index size = array_prod(evaluator.dimensions());

      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;

      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
      const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);

      Barrier barrier(numblocks);
      for (unsigned int i = 0; i < numblocks; ++i) {
        device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
      size_t num_threads = device.numThreads();
#ifdef EIGEN_USE_COST_MODEL
      if (num_threads > 1) {
        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
            size, evaluator.costPerCoeff(Vectorizable), num_threads);
      }
#endif
      if (num_threads == 1) {
        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
      } else {
        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
        const Index numblocks = size / blocksize;

        if (static_cast<Index>(numblocks) * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
        Barrier barrier(numblocks);
        for (int i = 0; i < numblocks; ++i) {
          device.enqueue_with_barrier(
              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
              &evaluator, i * blocksize, (i + 1) * blocksize);
        }
        if (numblocks * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(
              &evaluator, numblocks * blocksize, size);
        }
        barrier.Wait();
      }

      barrier.Wait();
    }
    evaluator.cleanup();
  }
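A worked example of the block partitioning above may help; the numbers are illustrative, not measurements:

  // size = 1000 coefficients, num_threads = 4, PacketSize = 8:
  //   blocksz   = ceil(1000 / 4.0) + 8 - 1 = 257
  //   blocksize = max(8, 257 - 257 % 8)    = 256   // rounded down to a packet multiple
  //   numblocks = 1000 / 256               = 3
  // Three blocks of 256 coefficients are enqueued with the barrier, the
  // 232-coefficient tail (indices 768..999) is handled by the calling thread
  // via the trailing EvalRange call, and barrier.Wait() joins the results.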
@ -147,98 +171,78 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
  static void run(const Expression& expr, const GpuDevice& device);
};

template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
};

#if defined(__CUDACC__)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;

    // Use the vector path
    for (Index i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (Index i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
EigenMetaKernel(Evaluator memcopied_eval, Index size) {

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Use the scalar path
  for (Index i = first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Use the vector path
  const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
  const Index vectorized_step_size = step_size * PacketSize;
  const Index vectorized_size = (size / PacketSize) * PacketSize;
  for (Index i = first_index * PacketSize; i < vectorized_size;
       i += vectorized_step_size) {
    eval.evalPacket(i);
  }
  for (Index i = vectorized_size + first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression>
EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
template <typename Expression, bool Vectorizable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign)
  {
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);


/*static*/
template<typename Expression>
EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign)
  {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
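To illustrate the launch arithmetic above; the device parameters below are assumed values for a hypothetical GPU, not queried from real hardware:

  // block_size = 1024 threads; assume 20 multiprocessors with 2048 resident
  // threads each:
  //   max_blocks = 20 * 2048 / 1024                      = 40
  // For size = 1,000,000 coefficients:
  //   num_blocks = max(min(40, divup(1000000, 1024)), 1) = min(40, 977) = 40
  // The 40 * 1024 threads then cover all one million indices through the
  // grid-stride loop inside EigenMetaKernel, so every element is visited even
  // though size > num_blocks * block_size.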
@ -129,6 +129,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
  typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
  typedef OutputScalar CoeffReturnType;
  typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,

@ -176,7 +177,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_data) {
      m_device.deallocate(m_data);

@ -189,11 +189,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
    return m_data[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
  packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }

@ -83,10 +83,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
  typedef TensorForcedEvalOp<ArgType> XprType;
  typedef typename ArgType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = true,
    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
    PacketAccess = (PacketSize > 1),
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess = true
  };

@ -95,10 +99,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
      : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
  { }

  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {

@ -132,6 +132,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }

 private:

@ -64,7 +64,7 @@ struct scalar_sigmoid_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
    const T one = T(1);
    return one / (one + std::exp(-x));
    return one / (one + numext::exp(-x));
  }

  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

@ -158,8 +158,8 @@ template <typename T> struct MeanReducer
  }

 protected:
  int scalarCount_;
  int packetCount_;
  DenseIndex scalarCount_;
  DenseIndex packetCount_;
};

template <typename T> struct MaxReducer

@ -594,6 +594,8 @@ template <> class UniformRandomGenerator<std::complex<double> > {
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
  enum {
    // Rough estimate.
    Cost = 100 * NumTraits<Scalar>::MulCost,
    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
  };
};

@ -774,6 +776,8 @@ template <typename T> class NormalRandomGenerator {
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // Rough estimate.
    Cost = 100 * NumTraits<Scalar>::MulCost,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};

@ -799,7 +803,7 @@ class GaussianGenerator {
      T offset = coordinates[i] - m_means[i];
      tmp += offset * offset / m_two_sigmas[i];
    }
    return std::exp(-tmp);
    return numext::exp(-tmp);
  }

 private:

@ -807,6 +811,15 @@ class GaussianGenerator {
  array<T, NumDims> m_two_sigmas;
};

template <typename T, typename Index, size_t NumDims>
struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
  enum {
    Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
                      functor_traits<scalar_quotient_op<T, T> >::Cost) +
           functor_traits<scalar_exp_op<T> >::Cost,
    PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
  };
};

} // end namespace internal
} // end namespace Eigen

@ -145,6 +145,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool) const {
    // TODO(rmlarsen): This is just a placeholder. Define interface to make
    // generators return their cost.
    return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
                              TensorOpCost::MulCost<Scalar>());
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:

@ -159,6 +159,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
                          Device> Self;
  typedef TensorEvaluator<ArgType, Device> Impl;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,

@ -307,9 +310,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {

@ -362,15 +362,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
      return packetWithPossibleZero(index);
    }

    const Index indices[2] = {index, index + packetSize - 1};
    const Index indices[2] = {index, index + PacketSize - 1};
    const Index patchIndex = indices[0] / m_fastPatchStride;
    if (patchIndex != indices[1] / m_fastPatchStride) {
      return packetWithPossibleZero(index);

@ -434,12 +433,23 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  Index rowInflateStride() const { return m_row_inflate_strides; }
  Index colInflateStride() const { return m_col_inflate_strides; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // We conservatively estimate the cost for the code path where the computed
    // index is inside the original image and
    // TensorEvaluator<ArgType, Device>::CoordAccess is false.
    const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
                                6 * TensorOpCost::MulCost<Index>() +
                                8 * TensorOpCost::MulCost<Index>();
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);

@ -81,6 +81,10 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,

@ -123,11 +127,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
    }
  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {

@ -190,18 +189,30 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    for (int i = 0; i < packetSize; ++i) {
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
                                           3 * TensorOpCost::MulCost<Index>() +
                                           2 * TensorOpCost::AddCost<Index>());
    const double input_size = m_impl.dimensions().TotalSize();
    const double output_size = m_dimensions.TotalSize();
    if (output_size == 0)
      return TensorOpCost();
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
                        compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 protected:

@ -155,6 +155,10 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
    return m_impl.template packet<LoadMode>(index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    return m_impl.costPerCoeff(vectorized);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
  return second;
}

template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE

template <typename T, typename X, typename Y>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const X x, const Y y) {
  return static_cast<T>((x + y - 1) / y);
}

template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const T x, const T y) {
  return (x + y - 1) / y;
  return static_cast<T>((x + y - 1) / y);
}
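As a quick sanity check of the helper above; the values are illustrative, and the unqualified call assumes it is made from inside namespace Eigen, as in the GPU executor earlier in this patch:

  // divup rounds an integer division up, e.g. when sizing a launch grid:
  const int size = 1000000;
  const int block_size = 1024;
  const int num_blocks = divup<int>(size, block_size);  // (1000000 + 1023) / 1024 == 977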
|
||||
|
||||
template <size_t n> struct max_n_1 {
|
||||
|
@ -142,6 +142,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
||||
return m_impl.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
|
||||
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
@ -449,6 +453,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
|
||||
Scalar* result = m_impl.data();
|
||||
if (result) {
|
||||
|
@ -87,6 +87,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<PaddingDimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -129,10 +133,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
||||
@ -224,21 +224,51 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
TensorOpCost cost = m_impl.costPerCoeff(vectorized);
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < NumDims; ++i)
|
||||
updateCostPerDimension(cost, i, i == 0);
|
||||
} else {
|
||||
for (int i = NumDims - 1; i >= 0; --i)
|
||||
updateCostPerDimension(cost, i, i == NumDims - 1);
|
||||
}
|
||||
return cost;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
private:
|
||||
void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
|
||||
const double in = static_cast<double>(m_impl.dimensions()[i]);
|
||||
const double out = in + m_padding[i].first + m_padding[i].second;
|
||||
if (out == 0)
|
||||
return;
|
||||
const double reduction = in / out;
|
||||
cost *= reduction;
|
||||
if (first) {
|
||||
cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
|
||||
reduction * (1 * TensorOpCost::AddCost<Index>()));
|
||||
} else {
|
||||
cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
reduction * (2 * TensorOpCost::MulCost<Index>() +
|
||||
1 * TensorOpCost::DivCost<Index>()));
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index initialIndex = index;
|
||||
Index inputIndex = 0;
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index first = index;
|
||||
const Index last = index + packetSize - 1;
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
|
||||
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
|
||||
const Index lastPaddedRight = m_outputStrides[i+1];
|
||||
@ -263,7 +293,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
}
|
||||
}
|
||||
|
||||
const Index last = index + packetSize - 1;
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index first = index;
|
||||
const Index lastPaddedLeft = m_padding[0].first;
|
||||
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
|
||||
@ -288,16 +318,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
const Index initialIndex = index;
|
||||
Index inputIndex = 0;
|
||||
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
const Index first = index;
|
||||
const Index last = index + packetSize - 1;
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
|
||||
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
|
||||
const Index lastPaddedRight = m_outputStrides[i];
|
||||
@ -322,7 +351,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
}
|
||||
}
|
||||
|
||||
const Index last = index + packetSize - 1;
|
||||
const Index last = index + PacketSize - 1;
|
||||
const Index first = index;
|
||||
const Index lastPaddedLeft = m_padding[NumDims-1].first;
|
||||
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
|
||||
@ -347,9 +376,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
|
@ -85,6 +85,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

enum {
IsAligned = false,
@ -137,9 +141,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
}
}

typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@ -183,12 +184,11 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
Index indices[2] = {index, index + packetSize - 1};
Index indices[2] = {index, index + PacketSize - 1};
Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
indices[1] / m_outputStrides[output_stride_index]};
Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
@ -229,15 +229,15 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
inputIndices[1] += (patchIndices[1] + patchOffsets[1]);

if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[packetSize-1] = m_impl.coeff(inputIndices[1]);
for (int i = 1; i < packetSize-1; ++i) {
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@ -245,6 +245,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
}
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
TensorOpCost::MulCost<Index>() +
2 * TensorOpCost::AddCost<Index>());
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}

EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

protected:
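The patch evaluator's packet() above computes the input indices of the first and last packet element; if they are exactly PacketSize - 1 apart, the packet is contiguous in the input and is fetched with a single unaligned packet load, otherwise it falls back to the per-coefficient gather. A small sketch of that decision, where load_unaligned() and gather() are assumed helpers rather than Eigen API:

#include <cstdint>

// Sketch of the "contiguous fast path vs. gather" test used in packet().
template <typename Scalar, int PacketSize, typename LoadFn, typename GatherFn>
void load_possibly_strided_packet(std::int64_t firstInput, std::int64_t lastInput,
                                  LoadFn load_unaligned, GatherFn gather, Scalar* out) {
  if (lastInput - firstInput == PacketSize - 1) {
    load_unaligned(firstInput, out);   // one unaligned vector load
  } else {
    gather(out);                       // coefficient-by-coefficient fallback
  }
}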
@ -214,7 +214,7 @@ struct FullReducer {

static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
*output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
}
};

@ -222,18 +222,19 @@ struct FullReducer {
#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer,
typename Self::CoeffReturnType* output) {
*output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
self, firstIndex, numValuesToReduce, reducer);
}
};

template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, false> {
// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
static const bool HasOptimizedImplementation = !Op::IsStateful;
static const int PacketSize =
unpacket_traits<typename Self::PacketReturnType>::size;
@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
*output = reducer.finalize(reducer.initialize());
return;
}
const std::size_t num_threads = device.numThreads();
#ifdef EIGEN_USE_COST_MODEL
const TensorOpCost cost =
self.m_impl.costPerCoeff(Vectorizable) +
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
PacketSize);
const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
num_coeffs, cost, device.numThreads());
#else
const int num_threads = device.numThreads();
#endif
if (num_threads == 1) {
*output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer);
return;
} else {
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);

Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
i * blocksize, blocksize, reducer, &shards[i]);
}

typename Self::CoeffReturnType finalShard;
if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
} else {
finalShard = reducer.initialize();
}
barrier.Wait();
for (unsigned int i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
}
}
};

template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, true> {
static const bool HasOptimizedImplementation = !Op::IsStateful;
static const int PacketSize =
unpacket_traits<typename Self::PacketReturnType>::size;

// launch one reducer per thread and accumulate the result.
static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
typename Self::CoeffReturnType* output) {
typedef typename Self::Index Index;
const Index num_coeffs = array_prod(self.m_impl.dimensions());
if (num_coeffs == 0) {
*output = reducer.finalize(reducer.initialize());
*output =
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
return;
}
const std::size_t num_threads = device.numThreads();
if (num_threads == 1) {
*output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer);
return;
}
const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
const Index blocksize =
std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);

Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (unsigned int i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
for (Index i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]);
}
typename Self::CoeffReturnType finalShard;
if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
if (numblocks * blocksize < num_coeffs) {
finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
reducer);
} else {
finalShard = reducer.initialize();
}

barrier.Wait();
for (unsigned int i = 0; i < numblocks; ++i) {

for (Index i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
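The ThreadPool FullReducer above splits the num_coeffs inputs into numblocks shards of blocksize elements, reduces each shard concurrently, reduces the leftover tail on the calling thread, and folds the shard results together once the barrier is released. A minimal sketch of the same sharding scheme with std::thread and a plain sum reducer; Eigen's Barrier, MaxSizeVector, and the reducer functor are replaced by standard-library stand-ins.

#include <cstddef>
#include <thread>
#include <vector>

// Block-sharded full reduction, modelled on the ThreadPool FullReducer above.
double full_sum(const double* data, std::size_t n, unsigned num_threads) {
  if (n == 0) return 0.0;
  if (num_threads <= 1) { double s = 0; for (std::size_t i = 0; i < n; ++i) s += data[i]; return s; }

  const std::size_t blocksize = n / num_threads;
  const std::size_t numblocks = blocksize > 0 ? n / blocksize : 0;
  std::vector<double> shards(numblocks, 0.0);
  std::vector<std::thread> workers;

  for (std::size_t b = 0; b < numblocks; ++b)
    workers.emplace_back([&, b] {                       // one shard per worker
      for (std::size_t i = b * blocksize; i < (b + 1) * blocksize; ++i) shards[b] += data[i];
    });

  double final_shard = 0.0;                             // reduce the tail on this thread
  for (std::size_t i = numblocks * blocksize; i < n; ++i) final_shard += data[i];

  for (auto& t : workers) t.join();                     // plays the role of barrier.Wait()
  for (double s : shards) final_shard += s;             // fold shard results together
  return final_shard;
}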
@ -411,6 +377,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

enum {
IsAligned = false,
@ -495,8 +464,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static bool size_large_enough(Index total_size) {
#ifndef EIGEN_USE_COST_MODEL
return total_size > 1024 * 1024;
#else
return true || total_size;
#endif
}

EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
@ -504,7 +478,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
// Use the FullReducer if possible.
if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
(!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) {
(!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {

bool need_assign = false;
if (!data) {
@ -584,16 +558,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
if (ReducingInnerMostDims) {
const Index num_values_to_reduce =
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
const Index firstIndex = firstInput(index);
for (Index i = 0; i < packetSize; ++i) {
for (Index i = 0; i < PacketSize; ++i) {
Op reducer(m_reducer);
values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
num_values_to_reduce, reducer);
@ -602,18 +575,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
const Index firstIndex = firstInput(index);
const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
// TBD: extend this to the n innermost dimensions that we preserve.
if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
Op reducer(m_reducer);
typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
return reducer.finalizePacket(accum);
} else {
for (int i = 0; i < packetSize; ++i) {
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
} else {
for (int i = 0; i < packetSize; ++i) {
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
@ -621,6 +594,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
return rslt;
}

// Must be called after evalSubExprsIfNeeded().
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
if (RunningFullReduction && m_result) {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
} else {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
}

EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

private:
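The new costPerCoeff() overloads express the price of producing one output coefficient as a TensorOpCost (bytes loaded, bytes stored, compute cycles, vectorization flag, packet size) so the evaluators can feed the cost model. The reduction variant above multiplies the input's per-coefficient cost by the number of reduced values and adds the functor cost. A hedged sketch of that arithmetic with a simplified cost type; the real TensorOpCost also tracks load/store bytes and provides the same operators.

// Simplified stand-in for TensorOpCost: compute cycles per output coefficient only.
struct CoeffCost {
  double compute;
  CoeffCost operator+(CoeffCost o) const { return {compute + o.compute}; }
  CoeffCost operator*(double k)    const { return {compute * k}; }
};

// Per-output cost of a full reduction over num_values_to_reduce inputs, mirroring
// m_impl.costPerCoeff(v) * N + TensorOpCost(0, 0, N * functor_traits<Op>::Cost, ...).
CoeffCost reduction_cost(CoeffCost input_cost, double reducer_cost, double num_values_to_reduce) {
  return input_cost * num_values_to_reduce + CoeffCost{num_values_to_reduce * reducer_cost};
}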
@ -130,13 +130,18 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
assert(false && "Should only be called on floats");
}

static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
typedef typename Self::Index Index;

const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
return;
}

const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
@ -231,7 +236,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}

static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;

// It's faster to use the usual code.
@ -310,7 +315,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
return true;
}

static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;

// It's faster to use the usual code.
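In the GPU full reducer the grid size is now computed with divup instead of std::ceil on a float, which avoids float rounding for large coefficient counts. divup is the usual integer ceiling division; a one-line sketch of the idiom (named divup_int here to avoid claiming Eigen's exact definition):

// Integer ceiling division: number of blocks needed to cover n items with b per block.
inline int divup_int(int n, int b) { return (n + b - 1) / b; }
// e.g. divup_int(num_coeffs, block_size * num_per_thread) replaces
// std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)).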
@ -104,6 +104,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<ReverseDimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -135,10 +139,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
@ -195,21 +195,33 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
PacketReturnType packet(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
// TODO(ndjaitly): write a better packing routine that uses
|
||||
// local structure.
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
|
||||
values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
if (m_reverse[i]) {
|
||||
compute_cost += 2 * TensorOpCost::AddCost<Index>();
|
||||
}
|
||||
}
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
@ -246,6 +258,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Dimensions& dimensions() const { return this->m_dimensions; }
|
||||
@ -256,14 +269,13 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x) {
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
// This code is pilfered from TensorMorphing.h
|
||||
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
|
||||
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
|
@ -104,6 +104,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -145,9 +148,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
@ -166,18 +166,25 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
return rslt;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
|
||||
2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
return m_impl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
@ -219,6 +226,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -230,9 +240,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
: Base(op, device)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
||||
{
|
||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||
@ -241,12 +248,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
||||
template <int StoreMode> EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
|
@ -103,6 +103,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
|
||||
@ -142,10 +146,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
@ -164,12 +164,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
Index inputIndices[] = {0, 0};
|
||||
Index indices[] = {index, index + packetSize - 1};
|
||||
Index indices[] = {index, index + PacketSize - 1};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx0 = indices[0] / m_outputStrides[i];
|
||||
@ -193,15 +192,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
|
||||
inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
|
||||
}
|
||||
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
|
||||
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
|
||||
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
|
||||
return rslt;
|
||||
}
|
||||
else {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
values[0] = m_impl.coeff(inputIndices[0]);
|
||||
values[packetSize-1] = m_impl.coeff(inputIndices[1]);
|
||||
for (int i = 1; i < packetSize-1; ++i) {
|
||||
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
|
||||
for (int i = 1; i < PacketSize-1; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
@ -209,6 +208,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
|
||||
double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
|
||||
TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>()) +
|
||||
TensorOpCost::MulCost<Index>();
|
||||
if (vectorized) {
|
||||
compute_cost *= 2; // packet() computes two indices
|
||||
}
|
||||
const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
|
||||
return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
|
||||
// Computation is not vectorized per se, but it is done once per packet.
|
||||
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
protected:
|
||||
@ -266,6 +279,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
|
||||
{
|
||||
@ -275,12 +289,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < this->dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
|
||||
|
||||
Index inputIndices[] = {0, 0};
|
||||
Index indices[] = {index, index + packetSize - 1};
|
||||
Index indices[] = {index, index + PacketSize - 1};
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = NumDims - 1; i > 0; --i) {
|
||||
const Index idx0 = indices[0] / this->m_outputStrides[i];
|
||||
@ -304,15 +317,15 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
|
||||
inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
|
||||
}
|
||||
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
|
||||
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
|
||||
this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
|
||||
}
|
||||
else {
|
||||
EIGEN_ALIGN_MAX Scalar values[packetSize];
|
||||
EIGEN_ALIGN_MAX Scalar values[PacketSize];
|
||||
internal::pstore<Scalar, PacketReturnType>(values, x);
|
||||
this->m_impl.coeffRef(inputIndices[0]) = values[0];
|
||||
this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
|
||||
for (int i = 1; i < packetSize-1; ++i) {
|
||||
this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
|
||||
for (int i = 1; i < PacketSize-1; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
|
@ -53,9 +53,7 @@ struct TensorUInt128
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
explicit TensorUInt128(const T& x) : high(0), low(x) {
typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT;
typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow;
eigen_assert(static_cast<UnsignedT>(x) <= static_cast<UnsignedLow>(NumTraits<LOW>::highest()));
eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest())));
eigen_assert(x >= 0);
}

@ -171,6 +171,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
|
||||
static const int NumDims = NumInputDims + 1;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
@ -336,9 +339,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
||||
@ -408,16 +408,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
|
||||
{
|
||||
const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||
|
||||
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
|
||||
m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
const Index indices[2] = {index, index + packetSize - 1};
|
||||
const Index indices[2] = {index, index + PacketSize - 1};
|
||||
const Index patchIndex = indices[0] / m_fastPatchStride;
|
||||
if (patchIndex != indices[1] / m_fastPatchStride) {
|
||||
return packetWithPossibleZero(index);
|
||||
@ -495,6 +494,14 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
|
||||
return packetWithPossibleZero(index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
const double compute_cost =
|
||||
10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
|
||||
8 * TensorOpCost::AddCost<Index>();
|
||||
return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
|
||||
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||
@ -518,9 +525,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
|
||||
{
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
|
||||
for (int i = 0; i < PacketSize; ++i) {
|
||||
values[i] = coeff(index+i);
|
||||
}
|
||||
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
|
||||
|
6
unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")

INSTALL(FILES
${Eigen_CXX11_ThreadPool_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
)
234
unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
Normal file
@ -0,0 +1,234 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_

namespace Eigen {

// EventCount allows waiting for arbitrary predicates in non-blocking
// algorithms. Think of a condition variable, but the wait predicate does not
// need to be protected by a mutex. Usage:
// Waiting thread does:
//
// if (predicate)
//   return act();
// EventCount::Waiter& w = waiters[my_index];
// ec.Prewait(&w);
// if (predicate) {
//   ec.CancelWait(&w);
//   return act();
// }
// ec.CommitWait(&w);
//
// Notifying thread does:
//
// predicate = true;
// ec.Notify(true);
//
// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
// cheap, but they are executed only if the preceding predicate check has
// failed.
//
// Algorithm outline:
// There are two main variables: predicate (managed by user) and state_.
// Operation closely resembles Dekker's mutual exclusion algorithm:
// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
// Waiting thread sets state_ then checks predicate, notifying thread sets
// predicate then checks state_. Due to seq_cst fences in between these
// operations it is guaranteed that either the waiter will see the predicate
// change and won't block, or the notifying thread will see the state_ change
// and will unblock the waiter, or both. But it can't happen that both threads
// don't see each other's changes, which would lead to deadlock.
class EventCount {
|
||||
public:
|
||||
class Waiter;
|
||||
|
||||
EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
|
||||
eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
|
||||
// Initialize epoch to something close to overflow to test overflow.
|
||||
state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
|
||||
}
|
||||
|
||||
~EventCount() {
|
||||
// Ensure there are no waiters.
|
||||
eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
|
||||
}
|
||||
|
||||
// Prewait prepares for waiting.
|
||||
// After calling this function the thread must re-check the wait predicate
|
||||
// and call either CancelWait or CommitWait passing the same Waiter object.
|
||||
void Prewait(Waiter* w) {
|
||||
w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
|
||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
||||
}
|
||||
|
||||
// CommitWait commits waiting.
|
||||
void CommitWait(Waiter* w) {
|
||||
w->state = Waiter::kNotSignaled;
|
||||
// Modification epoch of this waiter.
|
||||
uint64_t epoch =
|
||||
(w->epoch & kEpochMask) +
|
||||
(((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
|
||||
uint64_t state = state_.load(std::memory_order_seq_cst);
|
||||
for (;;) {
|
||||
if (int64_t((state & kEpochMask) - epoch) < 0) {
|
||||
// The preceeding waiter has not decided on its fate. Wait until it
|
||||
// calls either CancelWait or CommitWait, or is notified.
|
||||
EIGEN_THREAD_YIELD();
|
||||
state = state_.load(std::memory_order_seq_cst);
|
||||
continue;
|
||||
}
|
||||
// We've already been notified.
|
||||
if (int64_t((state & kEpochMask) - epoch) > 0) return;
|
||||
// Remove this thread from prewait counter and add it to the waiter list.
|
||||
eigen_assert((state & kWaiterMask) != 0);
|
||||
uint64_t newstate = state - kWaiterInc + kEpochInc;
|
||||
newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
|
||||
if ((state & kStackMask) == kStackMask)
|
||||
w->next.store(nullptr, std::memory_order_relaxed);
|
||||
else
|
||||
w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_release))
|
||||
break;
|
||||
}
|
||||
Park(w);
|
||||
}
|
||||
|
||||
// CancelWait cancels effects of the previous Prewait call.
|
||||
void CancelWait(Waiter* w) {
|
||||
uint64_t epoch =
|
||||
(w->epoch & kEpochMask) +
|
||||
(((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
|
||||
uint64_t state = state_.load(std::memory_order_relaxed);
|
||||
for (;;) {
|
||||
if (int64_t((state & kEpochMask) - epoch) < 0) {
|
||||
// The preceeding waiter has not decided on its fate. Wait until it
|
||||
// calls either CancelWait or CommitWait, or is notified.
|
||||
EIGEN_THREAD_YIELD();
|
||||
state = state_.load(std::memory_order_relaxed);
|
||||
continue;
|
||||
}
|
||||
// We've already been notified.
|
||||
if (int64_t((state & kEpochMask) - epoch) > 0) return;
|
||||
// Remove this thread from prewait counter.
|
||||
eigen_assert((state & kWaiterMask) != 0);
|
||||
if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
|
||||
std::memory_order_relaxed))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Notify wakes one or all waiting threads.
|
||||
// Must be called after changing the associated wait predicate.
|
||||
void Notify(bool all) {
|
||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
||||
uint64_t state = state_.load(std::memory_order_acquire);
|
||||
for (;;) {
|
||||
// Easy case: no waiters.
|
||||
if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
|
||||
return;
|
||||
uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
|
||||
uint64_t newstate;
|
||||
if (all) {
|
||||
// Reset prewait counter and empty wait list.
|
||||
newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
|
||||
} else if (waiters) {
|
||||
// There is a thread in pre-wait state, unblock it.
|
||||
newstate = state + kEpochInc - kWaiterInc;
|
||||
} else {
|
||||
// Pop a waiter from list and unpark it.
|
||||
Waiter* w = &waiters_[state & kStackMask];
|
||||
Waiter* wnext = w->next.load(std::memory_order_relaxed);
|
||||
uint64_t next = kStackMask;
|
||||
if (wnext != nullptr) next = wnext - &waiters_[0];
|
||||
// Note: we don't add kEpochInc here. ABA problem on the lock-free stack
|
||||
// can't happen because a waiter is re-pushed onto the stack only after
|
||||
// it was in the pre-wait state which inevitably leads to epoch
|
||||
// increment.
|
||||
newstate = (state & kEpochMask) + next;
|
||||
}
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_acquire)) {
|
||||
if (!all && waiters) return; // unblocked pre-wait thread
|
||||
if ((state & kStackMask) == kStackMask) return;
|
||||
Waiter* w = &waiters_[state & kStackMask];
|
||||
if (!all) w->next.store(nullptr, std::memory_order_relaxed);
|
||||
Unpark(w);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class Waiter {
|
||||
friend class EventCount;
|
||||
std::atomic<Waiter*> next;
|
||||
std::mutex mu;
|
||||
std::condition_variable cv;
|
||||
uint64_t epoch;
|
||||
unsigned state;
|
||||
enum {
|
||||
kNotSignaled,
|
||||
kWaiting,
|
||||
kSignaled,
|
||||
};
|
||||
// Prevent false sharing with other Waiter objects in the same vector.
|
||||
char pad_[128];
|
||||
};
|
||||
|
||||
private:
|
||||
// State_ layout:
|
||||
// - low kStackBits is a stack of waiters committed wait.
|
||||
// - next kWaiterBits is count of waiters in prewait state.
|
||||
// - next kEpochBits is modification counter.
|
||||
static const uint64_t kStackBits = 16;
|
||||
static const uint64_t kStackMask = (1ull << kStackBits) - 1;
|
||||
static const uint64_t kWaiterBits = 16;
|
||||
static const uint64_t kWaiterShift = 16;
|
||||
static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
|
||||
<< kWaiterShift;
|
||||
static const uint64_t kWaiterInc = 1ull << kWaiterBits;
|
||||
static const uint64_t kEpochBits = 32;
|
||||
static const uint64_t kEpochShift = 32;
|
||||
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
|
||||
static const uint64_t kEpochInc = 1ull << kEpochShift;
|
||||
std::atomic<uint64_t> state_;
|
||||
std::vector<Waiter>& waiters_;
|
||||
|
||||
void Park(Waiter* w) {
|
||||
std::unique_lock<std::mutex> lock(w->mu);
|
||||
while (w->state != Waiter::kSignaled) {
|
||||
w->state = Waiter::kWaiting;
|
||||
w->cv.wait(lock);
|
||||
}
|
||||
}
|
||||
|
||||
void Unpark(Waiter* waiters) {
|
||||
Waiter* next = nullptr;
|
||||
for (Waiter* w = waiters; w; w = next) {
|
||||
next = w->next.load(std::memory_order_relaxed);
|
||||
unsigned state;
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(w->mu);
|
||||
state = w->state;
|
||||
w->state = Waiter::kSignaled;
|
||||
}
|
||||
// Avoid notifying if it wasn't waiting.
|
||||
if (state == Waiter::kWaiting) w->cv.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
EventCount(const EventCount&) = delete;
|
||||
void operator=(const EventCount&) = delete;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
|
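The header comment at the top of EventCount.h spells out the protocol: a waiter checks the predicate, calls Prewait, re-checks, and then either CancelWait or CommitWait; a notifier flips the predicate and calls Notify. A compact sketch of both sides, assuming an EventCount ec built over a std::vector<EventCount::Waiter> as in the constructor above and an atomic flag as the predicate (illustrative only, not part of the patch):

#include <atomic>

std::atomic<bool> work_available{false};

// Consumer side: block only if the predicate is still false after Prewait.
void wait_for_work(Eigen::EventCount& ec, Eigen::EventCount::Waiter* w) {
  if (work_available.load()) return;   // cheap check first
  ec.Prewait(w);
  if (work_available.load()) {         // re-check after registering as a waiter
    ec.CancelWait(w);
    return;
  }
  ec.CommitWait(w);                    // parks until a matching Notify()
}

// Producer side: publish the predicate change, then wake a waiter.
void publish_work(Eigen::EventCount& ec) {
  work_available.store(true);
  ec.Notify(/*all=*/false);
}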
232
unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
Normal file
@ -0,0 +1,232 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H

namespace Eigen {

template <typename Environment>
class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
public:
typedef typename Environment::Task Task;
typedef RunQueue<Task, 1024> Queue;

NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
: env_(env),
threads_(num_threads),
queues_(num_threads),
waiters_(num_threads),
blocked_(),
spinning_(),
done_(),
ec_(waiters_) {
for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue());
for (int i = 0; i < num_threads; i++)
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
}

~NonBlockingThreadPoolTempl() {
done_.store(true, std::memory_order_relaxed);
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrarily long,
// block, submit new work, unblock and otherwise live a full life.
ec_.Notify(true);

// Join threads explicitly to avoid destruction order issues.
for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
}

void Schedule(std::function<void()> fn) {
Task t = env_.CreateTask(std::move(fn));
PerThread* pt = GetPerThread();
if (pt->pool == this) {
// Worker thread of this pool, push onto the thread's queue.
Queue* q = queues_[pt->index];
t = q->PushFront(std::move(t));
} else {
// A free-standing thread (or worker of another pool), push onto a random
// queue.
Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
t = q->PushBack(std::move(t));
}
// Note: below we touch this after making the task available to worker threads.
// Strictly speaking, this can lead to a racy use-after-free. Consider that
// Schedule is called from a thread that is neither the main thread nor a worker
// thread of this pool. Then, execution of the task directly or indirectly
// completes overall computations, which in turn leads to destruction of
// this. We expect that such a scenario is prevented by the program, that is,
// this is kept alive while any threads can potentially be in Schedule.
if (!t.f)
ec_.Notify(false);
else
env_.ExecuteTask(t); // Push failed, execute directly.
}
private:
|
||||
typedef typename Environment::EnvThread Thread;
|
||||
|
||||
struct PerThread {
|
||||
bool inited;
|
||||
NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
|
||||
unsigned index; // Worker thread index in pool.
|
||||
unsigned rand; // Random generator state.
|
||||
};
|
||||
|
||||
Environment env_;
|
||||
MaxSizeVector<Thread*> threads_;
|
||||
MaxSizeVector<Queue*> queues_;
|
||||
std::vector<EventCount::Waiter> waiters_;
|
||||
std::atomic<unsigned> blocked_;
|
||||
std::atomic<bool> spinning_;
|
||||
std::atomic<bool> done_;
|
||||
EventCount ec_;
|
||||
|
||||
// Main worker thread loop.
|
||||
void WorkerLoop(unsigned index) {
|
||||
PerThread* pt = GetPerThread();
|
||||
pt->pool = this;
|
||||
pt->index = index;
|
||||
Queue* q = queues_[index];
|
||||
EventCount::Waiter* waiter = &waiters_[index];
|
||||
std::vector<Task> stolen;
|
||||
for (;;) {
|
||||
Task t;
|
||||
if (!stolen.empty()) {
|
||||
t = std::move(stolen.back());
|
||||
stolen.pop_back();
|
||||
}
|
||||
if (!t.f) t = q->PopFront();
|
||||
if (!t.f) {
|
||||
if (Steal(&stolen)) {
|
||||
t = std::move(stolen.back());
|
||||
stolen.pop_back();
|
||||
while (stolen.size()) {
|
||||
Task t1 = q->PushFront(std::move(stolen.back()));
|
||||
stolen.pop_back();
|
||||
if (t1.f) {
|
||||
// There is not much we can do in this case. Just execute the
|
||||
// remaining directly.
|
||||
stolen.push_back(std::move(t1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (t.f) {
|
||||
env_.ExecuteTask(t);
|
||||
continue;
|
||||
}
|
||||
// Leave one thread spinning. This reduces latency.
|
||||
if (!spinning_ && !spinning_.exchange(true)) {
|
||||
bool nowork = true;
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
if (!OutOfWork()) {
|
||||
nowork = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
spinning_ = false;
|
||||
if (!nowork) continue;
|
||||
}
|
||||
if (!WaitForWork(waiter)) return;
|
||||
}
|
||||
}
|
||||
|
||||
// Steal tries to steal work from other worker threads in best-effort manner.
|
||||
bool Steal(std::vector<Task>* stolen) {
|
||||
if (queues_.size() == 1) return false;
|
||||
PerThread* pt = GetPerThread();
|
||||
unsigned lastq = pt->index;
|
||||
for (unsigned i = queues_.size(); i > 0; i--) {
|
||||
unsigned victim = Rand(&pt->rand) % queues_.size();
|
||||
if (victim == lastq && queues_.size() > 2) {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
// Steal half of elements from a victim queue.
|
||||
// It is typical to steal just one element, but that assumes that work is
|
||||
// recursively subdivided in halves so that the stolen element is exactly
|
||||
// half of work. If work elements are equally-sized, then is makes sense
|
||||
// to steal half of elements at once and then work locally for a while.
|
||||
if (queues_[victim]->PopBackHalf(stolen)) return true;
|
||||
lastq = victim;
|
||||
}
|
||||
// Just to make sure that we did not miss anything.
|
||||
for (unsigned i = queues_.size(); i > 0; i--)
|
||||
if (queues_[i - 1]->PopBackHalf(stolen)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// WaitForWork blocks until new work is available, or if it is time to exit.
|
||||
bool WaitForWork(EventCount::Waiter* waiter) {
|
||||
// We already did best-effort emptiness check in Steal, so prepare blocking.
|
||||
ec_.Prewait(waiter);
|
||||
// Now do reliable emptiness check.
|
||||
if (!OutOfWork()) {
|
||||
ec_.CancelWait(waiter);
|
||||
return true;
|
||||
}
|
||||
// Number of blocked threads is used as termination condition.
|
||||
// If we are shutting down and all worker threads blocked without work,
|
||||
// that's we are done.
|
||||
blocked_++;
|
||||
if (done_ && blocked_ == threads_.size()) {
|
||||
ec_.CancelWait(waiter);
|
||||
// Almost done, but need to re-check queues.
|
||||
// Consider that all queues are empty and all worker threads are preempted
|
||||
// right after incrementing blocked_ above. Now a free-standing thread
|
||||
// submits work and calls destructor (which sets done_). If we don't
|
||||
// re-check queues, we will exit leaving the work unexecuted.
|
||||
if (!OutOfWork()) {
|
||||
// Note: we must not pop from queues before we decrement blocked_,
|
||||
// otherwise the following scenario is possible. Consider that instead
|
||||
// of checking for emptiness we popped the only element from queues.
|
||||
// Now other worker threads can start exiting, which is bad if the
|
||||
// work item submits other work. So we just check emptiness here,
|
||||
// which ensures that all worker threads exit at the same time.
|
||||
blocked_--;
|
||||
return true;
|
||||
}
|
||||
// Reached stable termination state.
|
||||
ec_.Notify(true);
|
||||
return false;
|
||||
}
|
||||
ec_.CommitWait(waiter);
|
||||
blocked_--;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool OutOfWork() {
|
||||
for (unsigned i = 0; i < queues_.size(); i++)
|
||||
if (!queues_[i]->Empty()) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
PerThread* GetPerThread() {
|
||||
EIGEN_THREAD_LOCAL PerThread per_thread_;
|
||||
PerThread* pt = &per_thread_;
|
||||
if (pt->inited) return pt;
|
||||
pt->inited = true;
|
||||
pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
|
||||
return pt;
|
||||
}
|
||||
|
||||
static unsigned Rand(unsigned* state) {
|
||||
return *state = *state * 1103515245 + 12345;
|
||||
}
|
||||
};
|
||||
|
||||
typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
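NonBlockingThreadPoolTempl combines one RunQueue per worker with the EventCount for sleeping: Schedule pushes onto the caller's own queue when called from a worker of this pool, otherwise onto a random queue, and then wakes one waiter. A short usage sketch of the NonBlockingThreadPool typedef introduced by this patch; the task body, counter, and busy-wait are illustrative, and the relevant ThreadPool headers are assumed to be included.

#include <atomic>
#include <cstdio>

void run_tasks() {
  Eigen::NonBlockingThreadPool pool(/*num_threads=*/4);
  std::atomic<int> done{0};
  for (int i = 0; i < 100; ++i) {
    pool.Schedule([&done] {
      // ... per-task work would go here ...
      done.fetch_add(1);
    });
  }
  while (done.load() < 100) { /* spin; real code would use a proper completion signal */ }
  std::printf("ran %d tasks\n", done.load());
  // The pool destructor notifies all workers and joins them.
}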
210
unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
Normal file
@ -0,0 +1,210 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_

namespace Eigen {

// RunQueue is a fixed-size, partially non-blocking deque of Work items.
// Operations on the front of the queue must be done by a single thread (owner),
// operations on the back of the queue can be done by multiple threads concurrently.
//
// Algorithm outline:
// All remote threads operating on the queue back are serialized by a mutex.
// This ensures that at most two threads access state: owner and one remote
// thread (Size aside). The algorithm ensures that the occupied region of the
// underlying array is logically contiguous (it can wrap around, but no stray
// occupied elements). The owner operates on one end of this region, the remote
// thread operates on the other end. Synchronization between these threads
// (potential consumption of the last element and take up of the last empty
// element) happens by means of a state variable in each element. States are:
// empty, busy (in the process of insertion or removal) and ready. Threads claim
// elements (empty->busy and ready->busy transitions) by means of a CAS
// operation. The finishing transitions (busy->empty and busy->ready) are done
// with a plain store as the element is exclusively owned by the current thread.
//
// Note: we could permit only pointers as elements, then we would not need a
// separate state variable, as a null/non-null pointer value would serve as the
// state, but that would require malloc/free per operation for large, complex
// values (and this is designed to store std::function<()>).
template <typename Work, unsigned kSize>
class RunQueue {
 public:
  RunQueue() : front_(), back_() {
    // require power-of-two for fast masking
    eigen_assert((kSize & (kSize - 1)) == 0);
    eigen_assert(kSize > 2);            // why would you do this?
    eigen_assert(kSize <= (64 << 10));  // leave enough space for counter
    for (unsigned i = 0; i < kSize; i++)
      array_[i].state.store(kEmpty, std::memory_order_relaxed);
  }

  ~RunQueue() { eigen_assert(Size() == 0); }

  // PushFront inserts w at the beginning of the queue.
  // If queue is full returns w, otherwise returns default-constructed Work.
  Work PushFront(Work w) {
    unsigned front = front_.load(std::memory_order_relaxed);
    Elem* e = &array_[front & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kEmpty ||
        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
      return w;
    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
    e->w = std::move(w);
    e->state.store(kReady, std::memory_order_release);
    return Work();
  }

  // PopFront removes and returns the first element in the queue.
  // If the queue was empty returns default-constructed Work.
  Work PopFront() {
    unsigned front = front_.load(std::memory_order_relaxed);
    Elem* e = &array_[(front - 1) & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kReady ||
        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
      return Work();
    Work w = std::move(e->w);
    e->state.store(kEmpty, std::memory_order_release);
    front = ((front - 1) & kMask2) | (front & ~kMask2);
    front_.store(front, std::memory_order_relaxed);
    return w;
  }

  // PushBack adds w at the end of the queue.
  // If queue is full returns w, otherwise returns default-constructed Work.
  Work PushBack(Work w) {
    std::unique_lock<std::mutex> lock(mutex_);
    unsigned back = back_.load(std::memory_order_relaxed);
    Elem* e = &array_[(back - 1) & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kEmpty ||
        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
      return w;
    back = ((back - 1) & kMask2) | (back & ~kMask2);
    back_.store(back, std::memory_order_relaxed);
    e->w = std::move(w);
    e->state.store(kReady, std::memory_order_release);
    return Work();
  }

  // PopBack removes and returns the last element in the queue.
  // Can fail spuriously.
  Work PopBack() {
    if (Empty()) return Work();
    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
    if (!lock) return Work();
    unsigned back = back_.load(std::memory_order_relaxed);
    Elem* e = &array_[back & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kReady ||
        !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
      return Work();
    Work w = std::move(e->w);
    e->state.store(kEmpty, std::memory_order_release);
    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
    return w;
  }

  // PopBackHalf removes and returns up to half of the last elements in the
  // queue. Returns the number of elements removed; can also fail spuriously.
  unsigned PopBackHalf(std::vector<Work>* result) {
    if (Empty()) return 0;
    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
    if (!lock) return 0;
    unsigned back = back_.load(std::memory_order_relaxed);
    unsigned size = Size();
    unsigned mid = back;
    if (size > 1) mid = back + (size - 1) / 2;
    unsigned n = 0;
    unsigned start = 0;
    for (; static_cast<int>(mid - back) >= 0; mid--) {
      Elem* e = &array_[mid & kMask];
      uint8_t s = e->state.load(std::memory_order_relaxed);
      if (n == 0) {
        if (s != kReady ||
            !e->state.compare_exchange_strong(s, kBusy,
                                              std::memory_order_acquire))
          continue;
        start = mid;
      } else {
        // Note: no need to store the temporary kBusy state, we exclusively own
        // these elements.
        eigen_assert(s == kReady);
      }
      result->push_back(std::move(e->w));
      e->state.store(kEmpty, std::memory_order_release);
      n++;
    }
    if (n != 0)
      back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
    return n;
  }

  // Size returns current queue size.
  // Can be called by any thread at any time.
  unsigned Size() const {
    // Emptiness plays a critical role in thread pool blocking. So we go to
    // great effort to not produce false positives (claim non-empty queue as
    // empty).
    for (;;) {
      // Capture a consistent snapshot of front/tail.
      unsigned front = front_.load(std::memory_order_acquire);
      unsigned back = back_.load(std::memory_order_acquire);
      unsigned front1 = front_.load(std::memory_order_relaxed);
      if (front != front1) continue;
      int size = (front & kMask2) - (back & kMask2);
      // Fix overflow.
      if (size < 0) size += 2 * kSize;
      // Order of modification in push/pop is crafted to make the queue look
      // larger than it is during concurrent modifications. E.g. pop can
      // decrement size before the corresponding push has incremented it.
      // So the computed size can be up to kSize + 1; fix it.
      if (size > static_cast<int>(kSize)) size = kSize;
      return size;
    }
  }

  // Empty tests whether container is empty.
  // Can be called by any thread at any time.
  bool Empty() const { return Size() == 0; }

 private:
  static const unsigned kMask = kSize - 1;
  static const unsigned kMask2 = (kSize << 1) - 1;
  struct Elem {
    std::atomic<uint8_t> state;
    Work w;
  };
  enum {
    kEmpty,
    kBusy,
    kReady,
  };
  std::mutex mutex_;
  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
  // front/back, respectively. The remaining bits contain modification counters
  // that are incremented on Push operations. This allows us to (1) distinguish
  // between empty and full conditions (if we would use log(kSize) bits for
  // position, these conditions would be indistinguishable); (2) obtain
  // consistent snapshot of front_/back_ for Size operation using the
  // modification counters.
  std::atomic<unsigned> front_;
  std::atomic<unsigned> back_;
  Elem array_[kSize];

  RunQueue(const RunQueue&) = delete;
  void operator=(const RunQueue&) = delete;
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
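To make the intended usage pattern concrete, a small hypothetical sketch follows; the helper functions and the kSize value are invented for illustration, only RunQueue itself comes from this file.

#include <functional>

typedef std::function<void()> Task;
typedef Eigen::RunQueue<Task, 1024> Queue;  // kSize must be a power of two, > 2

// Owner thread: operates on the front without taking the mutex.
inline void OwnerStep(Queue* q) {
  Task overflow = q->PushFront([] { /* do some work */ });
  if (overflow) overflow();  // queue was full, the task is handed back to the caller
  Task t = q->PopFront();
  if (t) t();
}

// Any other thread: steals from the back; PopBack may fail spuriously
// because it only try-locks the internal mutex.
inline void StealerStep(Queue* q) {
  Task t = q->PopBack();
  if (t) t();
}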
127  unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h  Normal file
@ -0,0 +1,127 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H

namespace Eigen {

// The implementation of the ThreadPool type ensures that the Schedule method
// runs the functions it is provided in FIFO order when the scheduling is done
// by a single thread.
// Environment provides a way to create threads and also allows the caller to
// intercept task submission and execution.
template <typename Environment>
class SimpleThreadPoolTempl : public ThreadPoolInterface {
 public:
  // Construct a pool that contains "num_threads" threads.
  explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
      : env_(env), threads_(num_threads), waiters_(num_threads) {
    for (int i = 0; i < num_threads; i++) {
      threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
    }
  }

  // Wait until all scheduled work has finished and then destroy the
  // set of threads.
  ~SimpleThreadPoolTempl() {
    {
      // Wait for all work to get done.
      std::unique_lock<std::mutex> l(mu_);
      while (!pending_.empty()) {
        empty_.wait(l);
      }
      exiting_ = true;

      // Wakeup all waiters.
      for (auto w : waiters_) {
        w->ready = true;
        w->task.f = nullptr;
        w->cv.notify_one();
      }
    }

    // Wait for threads to finish.
    for (auto t : threads_) {
      delete t;
    }
  }

  // Schedule fn() for execution in the pool of threads. The functions are
  // executed in the order in which they are scheduled.
  void Schedule(std::function<void()> fn) {
    Task t = env_.CreateTask(std::move(fn));
    std::unique_lock<std::mutex> l(mu_);
    if (waiters_.empty()) {
      pending_.push_back(std::move(t));
    } else {
      Waiter* w = waiters_.back();
      waiters_.pop_back();
      w->ready = true;
      w->task = std::move(t);
      w->cv.notify_one();
    }
  }

 protected:
  void WorkerLoop() {
    std::unique_lock<std::mutex> l(mu_);
    Waiter w;
    Task t;
    while (!exiting_) {
      if (pending_.empty()) {
        // Wait for work to be assigned to me
        w.ready = false;
        waiters_.push_back(&w);
        while (!w.ready) {
          w.cv.wait(l);
        }
        t = w.task;
        w.task.f = nullptr;
      } else {
        // Pick up pending work
        t = std::move(pending_.front());
        pending_.pop_front();
        if (pending_.empty()) {
          empty_.notify_all();
        }
      }
      if (t.f) {
        mu_.unlock();
        env_.ExecuteTask(t);
        t.f = nullptr;
        mu_.lock();
      }
    }
  }

 private:
  typedef typename Environment::Task Task;
  typedef typename Environment::EnvThread Thread;

  struct Waiter {
    std::condition_variable cv;
    Task task;
    bool ready;
  };

  Environment env_;
  std::mutex mu_;
  MaxSizeVector<Thread*> threads_;  // All threads
  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
  std::deque<Task> pending_;        // Queue of pending work
  std::condition_variable empty_;   // Signaled on pending_.empty()
  bool exiting_ = false;
};

typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;

}  // namespace Eigen

#endif  // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
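A quick illustration of the FIFO guarantee described above; sketch only, assuming the ThreadPool headers are already included.

#include <cstdio>

void FifoDemo() {
  Eigen::SimpleThreadPool pool(2);
  // All Schedule calls come from this one thread, so tasks are dequeued in
  // the order 0, 1, 2, ... (execution may still overlap across the workers).
  for (int i = 0; i < 5; i++)
    pool.Schedule([i] { std::printf("task %d\n", i); });
}  // The destructor waits for pending work to drain before joining the workers.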
38  unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h  Normal file
@ -0,0 +1,38 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H

namespace Eigen {

struct StlThreadEnvironment {
  struct Task {
    std::function<void()> f;
  };

  // EnvThread constructor must start the thread,
  // destructor must join the thread.
  class EnvThread {
   public:
    EnvThread(std::function<void()> f) : thr_(f) {}
    ~EnvThread() { thr_.join(); }

   private:
    std::thread thr_;
  };

  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
  void ExecuteTask(const Task& t) { t.f(); }
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
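The Environment template parameter is the interception point mentioned in SimpleThreadPool.h. As a hypothetical sketch (the LoggingThreadEnvironment name is invented), an alternative environment that times every task could mirror StlThreadEnvironment like this:

#include <chrono>
#include <cstdio>
#include <functional>
#include <thread>

// Made-up example: same shape as StlThreadEnvironment, but CreateTask wraps the
// user function so that its wall-clock run time is printed.
struct LoggingThreadEnvironment {
  struct Task {
    std::function<void()> f;
  };

  class EnvThread {
   public:
    EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
    ~EnvThread() { thr_.join(); }

   private:
    std::thread thr_;
  };

  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }

  Task CreateTask(std::function<void()> f) {
    // Interception point: wrap the user function before it is queued.
    return Task{[f]() {
      auto start = std::chrono::steady_clock::now();
      f();
      auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                    std::chrono::steady_clock::now() - start).count();
      std::printf("task took %lld us\n", static_cast<long long>(us));
    }};
  }

  void ExecuteTask(const Task& t) { t.f(); }
};

// Usage: Eigen::SimpleThreadPoolTempl<LoggingThreadEnvironment> pool(4);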
22  unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h  Normal file
@ -0,0 +1,22 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H

// Try to come up with a portable implementation of thread local variables
#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
#define EIGEN_THREAD_LOCAL static __thread
#elif EIGEN_COMP_CLANG
#define EIGEN_THREAD_LOCAL static __thread
#else
#define EIGEN_THREAD_LOCAL static thread_local
#endif

#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
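Usage is the same as for an ordinary storage-class specifier; for example (illustrative only), a per-thread PRNG state in the spirit of GetPerThread() above:

unsigned PerThreadRand() {
  // Expands to "static __thread unsigned seed = 123;" or
  // "static thread_local unsigned seed = 123;" depending on the compiler.
  EIGEN_THREAD_LOCAL unsigned seed = 123;
  seed = seed * 1103515245 + 12345;
  return seed;
}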
26  unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h  Normal file
@ -0,0 +1,26 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H

namespace Eigen {

// This defines an interface that ThreadPoolDevice can take to use
// custom thread pools underneath.
class ThreadPoolInterface {
 public:
  virtual void Schedule(std::function<void()> fn) = 0;

  virtual ~ThreadPoolInterface() {}
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
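A custom pool only needs to provide Schedule(). For instance, a deliberately trivial, made-up adapter that runs every task inline on the calling thread already satisfies the interface:

#include <functional>

class InlineThreadPool : public Eigen::ThreadPoolInterface {
 public:
  // Runs the task immediately on the caller's thread; useful for tests or
  // single-threaded fallbacks.
  void Schedule(std::function<void()> fn) { fn(); }
};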
20  unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h  Normal file
@ -0,0 +1,20 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H

// Try to come up with a portable way to yield
#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
#define EIGEN_THREAD_YIELD() sched_yield()
#else
#define EIGEN_THREAD_YIELD() std::this_thread::yield()
#endif

#endif  // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
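Typical use is a polite spin-wait between polls of a flag (sketch only; <sched.h> or <thread> must already be available depending on which branch is taken):

#include <atomic>

inline void SpinUntil(const std::atomic<bool>& flag) {
  while (!flag.load(std::memory_order_acquire)) {
    EIGEN_THREAD_YIELD();  // give up the rest of the time slice between polls
  }
}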
6  unsupported/Eigen/CXX11/src/util/CMakeLists.txt  Normal file
@ -0,0 +1,6 @@
FILE(GLOB Eigen_CXX11_util_SRCS "*.h")

INSTALL(FILES
  ${Eigen_CXX11_util_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
  )
@ -10,12 +10,22 @@
#ifndef EIGEN_CXX11META_H
#define EIGEN_CXX11META_H

#include <vector>
#include "EmulateArray.h"

// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
// supports enough of the standard for our needs
#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900

#include "CXX11Workarounds.h"

namespace Eigen {

namespace internal {

/** \internal
  * \file CXX11/Core/util/CXX11Meta.h
  * \file CXX11/util/CXX11Meta.h
  * This file contains generic metaprogramming classes which are not specifically related to Eigen.
  * This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
  */
@ -523,4 +533,10 @@ InstType instantiate_by_c_array(ArrType* arr)

} // end namespace Eigen

#else // Non C++11, fallback to emulation mode

#include "src/Core/util/EmulateCXX11Meta.h"

#endif

#endif // EIGEN_CXX11META_H
@ -222,7 +222,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > {

#else

// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen array
// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
#include <array>
namespace Eigen {

@ -264,8 +264,4 @@ template<class T, std::size_t N> struct array_size<std::array<T,N> > {

#endif




#endif // EIGEN_EMULATE_ARRAY_H
Some files were not shown because too many files have changed in this diff.