Merged eigen/eigen into default

Konstantinos Margaritis 2016-04-21 18:03:08 +03:00
commit e5b2ef47d5
27 changed files with 366 additions and 283 deletions

View File

@@ -204,7 +204,7 @@
 #endif
 #endif
-#if defined(__F16C__)
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
 // We can use the optimized fp16 to float and float to fp16 conversion routines
 #define EIGEN_HAS_FP16_C
 #endif
@@ -214,10 +214,14 @@
 #include <vector_types.h>
 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
 #define EIGEN_HAS_CUDA_FP16
-#include <cuda_fp16.h>
 #endif
 #endif
+#if defined EIGEN_HAS_CUDA_FP16
+#include <host_defines.h>
+#include <cuda_fp16.h>
+#endif
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
 #define EIGEN_HAS_OPENMP
 #endif

View File

@@ -81,6 +81,8 @@ public:
 * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
 // FIXME I'm not sure the current mapping is the ideal one.
 template<int M, int N> struct product_type_selector<M,N,1>       { enum { ret = OuterProduct }; };
+template<int M>        struct product_type_selector<M, 1, 1>     { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N>        struct product_type_selector<1, N, 1>     { enum { ret = LazyCoeffBasedProductMode }; };
 template<int Depth>    struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; };
 template<>             struct product_type_selector<1, 1, 1>     { enum { ret = InnerProduct }; };
 template<>             struct product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; };

View File

@@ -168,11 +168,12 @@ MatrixBase<Derived>::stableNorm() const
 DerivedCopy copy(derived());
 enum {
-  CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
+  CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
+              || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+             ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
 };
 typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
-                                                 typename DerivedCopyClean
-                                                 ::ConstSegmentReturnType>::type SegmentWrapper;
+                                                 typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
 Index n = size();
 if(n==1)

View File

@@ -11,8 +11,8 @@
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 namespace Eigen {
 namespace internal {
 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
@@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024;
 #endif
 /** \internal */
 struct CacheSizes {
   CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
     int l1CacheSize, l2CacheSize, l3CacheSize;
     queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
@@ -107,13 +107,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 enum {
   kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
   ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
-  k_mask = -8,
+  kr = 8,
   mr = Traits::mr,
-  mr_mask = -mr,
-  nr = Traits::nr,
-  nr_mask = -nr
+  nr = Traits::nr
 };
 // Increasing k gives us more time to prefetch the content of the "C"
 // registers. However once the latency is hidden there is no point in
@@ -121,7 +117,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 // experimentally).
 const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
 if (k_cache < k) {
-  k = k_cache & k_mask;
+  k = k_cache - (k_cache % kr);
   eigen_internal_assert(k > 0);
 }
@@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 if (n_cache <= n_per_thread) {
   // Don't exceed the capacity of the l2 cache.
   eigen_internal_assert(n_cache >= static_cast<Index>(nr));
-  n = n_cache & nr_mask;
+  n = n_cache - (n_cache % nr);
   eigen_internal_assert(n > 0);
 } else {
-  n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
+  n = (std::min<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
 }
 if (l3 > l2) {
@@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
 const Index m_per_thread = numext::div_ceil(m, num_threads);
 if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
-  m = m_cache & mr_mask;
+  m = m_cache - (m_cache % mr);
   eigen_internal_assert(m > 0);
 } else {
-  m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
+  m = (std::min<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
 }
 }
 }
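The hunks above replace the power-of-two bitmask rounding of the blocking sizes (e.g. k_cache & k_mask with k_mask = -8) by an explicit modulo form (k_cache - (k_cache % kr)), which gives the same result for a step of 8 and also works when the rounding step is not a power of two. A minimal standalone check of that equivalence (illustrative only, not part of the commit):

#include <cassert>

int round_down_mask(int x)       { return x & -8; }       // old style: multiples of 8 only
int round_down_mod(int x, int r) { return x - (x % r); }  // new style: any positive step r

int main() {
  assert(round_down_mask(37)   == 32);   // both agree when the step is the power of two 8
  assert(round_down_mod(37, 8) == 32);
  assert(round_down_mod(37, 6) == 36);   // the modulo form also handles a non-power-of-two step
  return 0;
}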
@@ -156,23 +152,23 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 l2 = 32*1024;
 l3 = 512*1024;
 #endif
 // Early return for small problems because the computation below are time consuming for small problems.
 // Perhaps it would make more sense to consider k*n*m??
 // Note that for very tiny problem, this function should be bypassed anyway
 // because we use the coefficient-based implementation for them.
 if((std::max)(k,(std::max)(m,n))<48)
   return;
 typedef typename Traits::ResScalar ResScalar;
 enum {
   k_peeling = 8,
   k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
   k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
 };
 // ---- 1st level of blocking on L1, yields kc ----
 // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
 // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
 // We also include a register-level block of the result (mx x nr).
@@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 // while keeping the same number of sweeps over the result.
 k = (k%max_kc)==0 ? max_kc
   : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
 eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
 }
 // ---- 2nd level of blocking on max(L2,L3), yields nc ----
 // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
 // actual_l2 = max(l2, l3/nb_core_sharing_l3)
 // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
@@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
 #else
 const Index actual_l2 = 1572864; // == 1.5 MB
 #endif
 // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
 // The second half is implicitly reserved to access the result and lhs coefficients.
 // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful

View File

@@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
                                     const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
-                                    const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
+                                    const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
 {
   general_matrix_matrix_triangular_product<Index,
     RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,

View File

@@ -27,13 +27,13 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C
   HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
 };
 static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                  const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+                                  const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
 };
 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
   ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
 {
   static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
   Index size = (std::min)(_rows,_cols);

View File

@@ -148,10 +148,14 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
 VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast<CD>().eval()*mcd);
 VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);
-VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
+VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
 VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
-VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
+VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView<Upper>(), sd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Upper>());
+VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView<Lower>(), scd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Lower>());
+VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
 VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());
+VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Lower>());
+VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Upper>());
 // Not supported yet: trmm
 // VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(), sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());

View File

@@ -45,6 +45,8 @@
 #include <functional>
 #include <memory>
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
 #include "src/ThreadPool/EventCount.h"
 #include "src/ThreadPool/RunQueue.h"
 #include "src/ThreadPool/ThreadPoolInterface.h"

View File

@@ -426,116 +426,6 @@ struct TensorContractionEvaluatorBase
                   buffer, resIncr, alpha);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_leftImpl.cleanup();
-    m_rightImpl.cleanup();
-    if (m_result != NULL) {
-      m_device.deallocate(m_result);
-      m_result = NULL;
-    }
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    return m_result[index];
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
-  }
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
-    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
-  protected:
-  // Prevent assignment
-  TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
-  Dimensions m_dimensions;
-  contract_t m_k_strides;
-  contract_t m_left_contracting_strides;
-  contract_t m_right_contracting_strides;
-  bool m_lhs_inner_dim_contiguous;
-  bool m_rhs_inner_dim_contiguous;
-  bool m_rhs_inner_dim_reordered;
-  left_nocontract_t m_i_strides;
-  right_nocontract_t m_j_strides;
-  left_nocontract_t m_left_nocontract_strides;
-  right_nocontract_t m_right_nocontract_strides;
-  Index m_i_size;
-  Index m_j_size;
-  Index m_k_size;
-  TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
-  TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
-  const Device& m_device;
-  Scalar* m_result;
-};
-// evaluator for default device
-template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
-    public TensorContractionEvaluatorBase<
-      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
-  typedef TensorContractionEvaluatorBase<Self> Base;
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  enum {
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-  };
-  // Most of the code is assuming that both input tensors are ColMajor. If the
-  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
-  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
-  // will pretend B is LHS and A is RHS.
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-  static const int LDims =
-      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
-  static const int RDims =
-      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
-  static const int ContractDims = internal::array_size<Indices>::value;
-  typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
-  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
-  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
-  // Could we use NumDimensions here?
-  typedef DSizes<Index, NumDims> Dimensions;
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      Base(op, device) { }
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
-    if (this->m_j_size == 1) {
-      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-      return;
-    }
-    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-  }
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
   EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
     // columns in left side, rows in right side
@@ -628,6 +518,116 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     this->m_device.deallocate(blockA);
     this->m_device.deallocate(blockB);
   }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+    if (m_result != NULL) {
+      m_device.deallocate(m_result);
+      m_result = NULL;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_result[index];
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
+  protected:
+  // Prevent assignment
+  TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
+  Dimensions m_dimensions;
+  contract_t m_k_strides;
+  contract_t m_left_contracting_strides;
+  contract_t m_right_contracting_strides;
+  bool m_lhs_inner_dim_contiguous;
+  bool m_rhs_inner_dim_contiguous;
+  bool m_rhs_inner_dim_reordered;
+  left_nocontract_t m_i_strides;
+  right_nocontract_t m_j_strides;
+  left_nocontract_t m_left_nocontract_strides;
+  right_nocontract_t m_right_nocontract_strides;
+  Index m_i_size;
+  Index m_j_size;
+  Index m_k_size;
+  TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
+  TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
+  const Device& m_device;
+  Scalar* m_result;
+};
+// evaluator for default device
+template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
+    public TensorContractionEvaluatorBase<
+      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+  };
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
+  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+  // Could we use NumDimensions here?
+  typedef DSizes<Index, NumDims> Dimensions;
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device) { }
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+      return;
+    }
+    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+  }
 };
 } // end namespace Eigen

View File

@@ -10,9 +10,9 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
-#if !defined(EIGEN_USE_GPU)
-#define EIGEN_USE_COST_MODEL
-#endif
+//#if !defined(EIGEN_USE_GPU)
+//#define EIGEN_USE_COST_MODEL
+//#endif
 namespace Eigen {

View File

@@ -291,15 +291,9 @@ struct GpuDevice {
 int max_blocks_;
 };
-#ifndef __CUDA_ARCH__
 #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
   assert(cudaGetLastError() == cudaSuccess);
-#else
-#define LAUNCH_CUDA_KERNEL(kernel, ...) \
-  { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
-  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
-#endif
 // FIXME: Should be device and kernel specific.

View File

@@ -189,6 +189,11 @@ struct TensorEvaluator<const Derived, Device>
   return loadConstant(m_data+index);
 }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+  return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+                      internal::unpacket_traits<PacketReturnType>::size);
+}
 EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
 protected:
@@ -454,7 +459,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
 template<int LoadMode>
 EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
 {
-  const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   internal::Selector<PacketSize> select;
   for (Index i = 0; i < PacketSize; ++i) {
     select.select[i] = m_condImpl.coeff(index+i);

View File

@@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true>
 {
   const Index size = array_prod(evaluator.dimensions());
   const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+  // Manually unroll this loop since compilers don't do it.
+  const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+  for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+    evaluator.evalPacket(i);
+    evaluator.evalPacket(i+PacketSize);
+    evaluator.evalPacket(i+2*PacketSize);
+    evaluator.evalPacket(i+3*PacketSize);
+  }
   const Index VectorizedSize = (size / PacketSize) * PacketSize;
-  for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+  for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
     evaluator.evalPacket(i);
   }
   for (Index i = VectorizedSize; i < size; ++i) {
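For reference, a small standalone check (not from the commit) of the loop-bound arithmetic introduced above: UnrolledSize and VectorizedSize round size down to multiples of 4*PacketSize and PacketSize, so the unrolled loop, the per-packet loop and the scalar tail together cover [0, size) exactly once. The PacketSize of 4 is only an assumed value for illustration.

#include <cassert>

int main() {
  const int PacketSize = 4;   // assumed packet width, for illustration only
  const int size = 37;
  const int UnrolledSize   = (size / (4 * PacketSize)) * 4 * PacketSize;  // 32
  const int VectorizedSize = (size / PacketSize) * PacketSize;            // 36
  assert(UnrolledSize == 32 && VectorizedSize == 36);
  assert(UnrolledSize % (4 * PacketSize) == 0);   // unrolled loop runs whole 4-packet chunks
  assert(VectorizedSize % PacketSize == 0);       // per-packet loop finishes the remaining full packets
  assert(VectorizedSize <= size);                 // scalar loop handles the last size - VectorizedSize elements
  return 0;
}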
@@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
 #ifdef EIGEN_USE_THREADS
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator evaluator, const Index first, const Index last) {
-    eigen_assert(last > first);
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(last >= first);
     for (Index i = first; i < last; ++i) {
       evaluator.evalScalar(i);
     }
@@ -88,28 +96,34 @@ struct EvalRange {
 template <typename Evaluator, typename Index>
 struct EvalRange<Evaluator, Index, true> {
-  static void run(Evaluator evaluator, const Index first, const Index last) {
-    eigen_assert(last > first);
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(last >= first);
     Index i = first;
-    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
-      Index lastPacket = last - (last % PacketSize);
-      for (; i < lastPacket; i += PacketSize) {
+      Index last_chunk_offset = last - 4 * PacketSize;
+      // Manually unroll this loop since compilers don't do it.
+      for (; i <= last_chunk_offset; i += 4*PacketSize) {
+        evaluator.evalPacket(i);
+        evaluator.evalPacket(i+PacketSize);
+        evaluator.evalPacket(i+2*PacketSize);
+        evaluator.evalPacket(i+3*PacketSize);
+      }
+      last_chunk_offset = last - PacketSize;
+      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
 };
-template<typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
-{
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const ThreadPoolDevice& device)
@@ -119,24 +133,34 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign)
     {
+      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
-      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
-      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-      const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
-      Barrier barrier(numblocks);
-      for (unsigned int i = 0; i < numblocks; ++i) {
-        device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
-      }
-      if (static_cast<Index>(numblocks) * blocksize < size) {
-        EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
-      }
-      barrier.Wait();
+      int num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+      if (num_threads > 1) {
+        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+            size, evaluator.costPerCoeff(Vectorizable), num_threads);
+      }
+#endif
+      if (num_threads == 1) {
+        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+      } else {
+        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+        const Index numblocks = size / blocksize;
+        Barrier barrier(numblocks);
+        for (int i = 0; i < numblocks; ++i) {
+          device.enqueue_with_barrier(
+              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
+              &evaluator, i * blocksize, (i + 1) * blocksize);
+        }
+        if (numblocks * blocksize < size) {
+          EvalRange<Evaluator, Index, Vectorizable>::run(
+              &evaluator, numblocks * blocksize, size);
+        }
+        barrier.Wait();
+      }
     }
     evaluator.cleanup();
   }
@@ -159,7 +183,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator eval, Index first, Index last, Index step_size) {
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
     for (Index i = first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
@@ -169,7 +193,7 @@ struct EigenMetaKernelEval {
 template <typename Evaluator, typename Index>
 struct EigenMetaKernelEval<Evaluator, Index, true> {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator eval, Index first, Index last, Index step_size) {
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
     const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     const Index vectorized_size = (last / PacketSize) * PacketSize;
     const Index vectorized_step_size = step_size * PacketSize;
@@ -214,7 +238,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
     device.maxCudaThreadsPerMultiProcessor() / block_size;
   const Index size = array_prod(evaluator.dimensions());
   // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
-  const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+  const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
   LAUNCH_CUDA_KERNEL(
       (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
@@ -226,7 +250,6 @@
 #endif // __CUDACC__
 #endif // EIGEN_USE_GPU
 } // end namespace internal
 } // end namespace Eigen

View File

@@ -158,8 +158,8 @@ template <typename T> struct MeanReducer
 }
 protected:
-  int scalarCount_;
-  int packetCount_;
+  DenseIndex scalarCount_;
+  DenseIndex packetCount_;
 };
 template <typename T> struct MaxReducer

View File

@@ -146,7 +146,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
 }
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
-costPerCoeff(bool vectorized) const {
+costPerCoeff(bool) const {
 // TODO(rmlarsen): This is just a placeholder. Define interface to make
 // generators return their cost.
 return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +

View File

@@ -448,7 +448,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 protected:
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
 {
-  const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
   for (int i = 0; i < PacketSize; ++i) {
     values[i] = coeff(index+i);

View File

@@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
 return second;
 }
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+  return static_cast<T>((x + y - 1) / y);
+}
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  return (x + y - 1) / y;
+  return static_cast<T>((x + y - 1) / y);
 }
 template <size_t n> struct max_n_1 {
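The divup helper extended above is what the CUDA launch code in TensorExecutor.h and TensorReductionCuda.h now calls instead of hand-written (x + y - 1) / y or a float-based ceil. A quick standalone illustration of the same arithmetic (not part of the commit):

#include <cassert>

// Ceiling division for positive integers, matching the divup added above.
template <typename T>
T divup(const T x, const T y) {
  return static_cast<T>((x + y - 1) / y);
}

int main() {
  assert(divup(1024, 256) == 4);  // exact multiple: no extra block
  assert(divup(1025, 256) == 5);  // any remainder gets one extra block
  assert(divup(1, 256) == 1);     // tiny sizes still launch one block
  return 0;
}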

View File

@@ -214,7 +214,7 @@ struct FullReducer {
 static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
   const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
-  *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
+  *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
 }
 };
@@ -222,18 +222,19 @@
 #ifdef EIGEN_USE_THREADS
 // Multithreaded full reducers
 template <typename Self, typename Op,
-          bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
 struct FullReducerShard {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                                                         typename Self::Index numValuesToReduce, Op& reducer,
                                                         typename Self::CoeffReturnType* output) {
-    *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
         self, firstIndex, numValuesToReduce, reducer);
   }
 };
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, false> {
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
   static const bool HasOptimizedImplementation = !Op::IsStateful;
   static const int PacketSize =
       unpacket_traits<typename Self::PacketReturnType>::size;
@@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
       *output = reducer.finalize(reducer.initialize());
       return;
     }
-    const std::size_t num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+    const TensorOpCost cost =
+        self.m_impl.costPerCoeff(Vectorizable) +
+        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+                     PacketSize);
+    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+        num_coeffs, cost, device.numThreads());
+#else
+    const int num_threads = device.numThreads();
+#endif
     if (num_threads == 1) {
-      *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer);
-      return;
-    } else {
-      const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
-      const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
-      eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
-      Barrier barrier(numblocks);
-      MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-      for (unsigned int i = 0; i < numblocks; ++i) {
-        device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
-                                    i * blocksize, blocksize, reducer, &shards[i]);
-      }
-      typename Self::CoeffReturnType finalShard;
-      if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
-        finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
-            self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
-      } else {
-        finalShard = reducer.initialize();
-      }
-      barrier.Wait();
-      for (unsigned int i = 0; i < numblocks; ++i) {
-        reducer.reduce(shards[i], &finalShard);
-      }
-      *output = reducer.finalize(finalShard);
-    }
-  }
-};
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, true> {
-  static const bool HasOptimizedImplementation = !Op::IsStateful;
-  static const int PacketSize =
-      unpacket_traits<typename Self::PacketReturnType>::size;
-  // launch one reducer per thread and accumulate the result.
-  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
-                  typename Self::CoeffReturnType* output) {
-    typedef typename Self::Index Index;
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    if (num_coeffs == 0) {
-      *output = reducer.finalize(reducer.initialize());
+      *output =
+          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
       return;
     }
-    const std::size_t num_threads = device.numThreads();
-    if (num_threads == 1) {
-      *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer);
-      return;
-    }
-    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
-    const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
-    eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
+    const Index blocksize =
+        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    eigen_assert(num_coeffs >= numblocks * blocksize);
     Barrier barrier(numblocks);
     MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-    for (unsigned int i = 0; i < numblocks; ++i) {
-      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
+    for (Index i = 0; i < numblocks; ++i) {
+      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                   self, i * blocksize, blocksize, reducer,
                                   &shards[i]);
     }
     typename Self::CoeffReturnType finalShard;
-    if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
-      finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
-          self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
+    if (numblocks * blocksize < num_coeffs) {
+      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
+          reducer);
     } else {
       finalShard = reducer.initialize();
     }
     barrier.Wait();
-    for (unsigned int i = 0; i < numblocks; ++i) {
+    for (Index i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);
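A small standalone check of the sharding arithmetic used by the merged FullReducer above, with made-up numbers and plain integer division standing in for the std::floor of the diff: numblocks full shards of blocksize coefficients are enqueued, and whatever remains is folded into the final shard.

#include <cassert>

int main() {
  const long num_coeffs = 1000003;   // assumed problem size
  const long num_threads = 8;        // assumed thread count
  const long blocksize = num_coeffs / num_threads;                 // ~floor(num_coeffs / num_threads)
  const long numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
  assert(num_coeffs >= numblocks * blocksize);                     // the eigen_assert in the diff
  const long remainder = num_coeffs - numblocks * blocksize;       // reduced as the final shard
  assert(numblocks * blocksize + remainder == num_coeffs);         // every coefficient is covered exactly once
  return 0;
}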
@@ -498,13 +464,21 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+static bool size_large_enough(Index total_size) {
+#ifndef EIGEN_USE_COST_MODEL
+  return total_size > 1024 * 1024;
+#else
+  return true || total_size;
+#endif
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
   m_impl.evalSubExprsIfNeeded(NULL);
   // Use the FullReducer if possible.
   if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
       ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
-       (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) {
+       (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
     bool need_assign = false;
     if (!data) {

View File

@@ -130,13 +130,18 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   assert(false && "Should only be called on floats");
 }
-static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
   typedef typename Self::Index Index;
   const Index num_coeffs = array_prod(self.m_impl.dimensions());
+  // Don't crash when we're called with an input tensor of size 0.
+  if (num_coeffs == 0) {
+    return;
+  }
   const int block_size = 256;
   const int num_per_thread = 128;
-  const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
   if (num_blocks > 1) {
     // We initialize the outputs outside the reduction kernel when we can't be sure that there
@@ -231,7 +236,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
   return true;
 }
-static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
   typedef typename Self::Index Index;
   // It's faster to use the usual code.
@@ -310,7 +315,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
   return true;
 }
-static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
   typedef typename Self::Index Index;
   // It's faster to use the usual code.

View File

@@ -525,7 +525,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 protected:
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
 {
-  const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
   for (int i = 0; i < PacketSize; ++i) {
     values[i] = coeff(index+i);

View File

@@ -81,7 +81,7 @@ class EventCount {
 if (int64_t((state & kEpochMask) - epoch) < 0) {
   // The preceeding waiter has not decided on its fate. Wait until it
   // calls either CancelWait or CommitWait, or is notified.
-  std::this_thread::yield();
+  EIGEN_THREAD_YIELD();
   state = state_.load(std::memory_order_seq_cst);
   continue;
 }
@@ -112,7 +112,7 @@ class EventCount {
 if (int64_t((state & kEpochMask) - epoch) < 0) {
   // The preceeding waiter has not decided on its fate. Wait until it
   // calls either CancelWait or CommitWait, or is notified.
-  std::this_thread::yield();
+  EIGEN_THREAD_YIELD();
   state = state_.load(std::memory_order_relaxed);
   continue;
 }

View File

@@ -212,7 +212,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
 }
 PerThread* GetPerThread() {
-  static thread_local PerThread per_thread_;
+  EIGEN_THREAD_LOCAL PerThread per_thread_;
   PerThread* pt = &per_thread_;
   if (pt->inited) return pt;
   pt->inited = true;

View File

@@ -0,0 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+// Try to come up with a portable implementation of thread local variables
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_LOCAL static __thread
+#elif EIGEN_COMP_CLANG
+#define EIGEN_THREAD_LOCAL static __thread
+#else
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
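A hedged sketch of how the new macro is meant to be used, mirroring the GetPerThread() change in NonBlockingThreadPool.h above. The macro expands to a static thread-local declaration, so each thread that calls the function gets its own zero-initialized instance; PerThread here is a stand-in type, not Eigen code.

// Assumes ThreadLocal.h (above) has been included.
struct PerThread { bool inited; };

PerThread* GetPerThread() {
  EIGEN_THREAD_LOCAL PerThread per_thread_;  // one copy per calling thread, zero-initialized
  PerThread* pt = &per_thread_;
  if (!pt->inited) {
    pt->inited = true;                       // first call from this thread: lazy initialization
  }
  return pt;
}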

View File

@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+// Try to come up with a portable way to yield
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_YIELD() sched_yield()
+#else
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+#endif
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
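And a hedged sketch of the yield macro in use, in the same spirit as the EventCount change above: spin on a condition and hand the CPU back to the scheduler between checks. The atomic flag and the function name are made-up examples, not Eigen code.

#include <atomic>
#include <thread>   // needed by the std::this_thread::yield() branch of the macro

// Assumes ThreadYield.h (above) has been included.
void wait_until_set(const std::atomic<bool>& flag) {
  while (!flag.load(std::memory_order_acquire)) {
    EIGEN_THREAD_YIELD();  // sched_yield() on old GCC, std::this_thread::yield() otherwise
  }
}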

View File

@@ -12,6 +12,17 @@
 #include "main.h"
 #include <Eigen/CXX11/ThreadPool>
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
 static void test_basic_eventcount()
 {
   std::vector<EventCount::Waiter> waiters(1);
@@ -67,8 +78,8 @@ const int TestQueue::kQueueSize;
 static void test_stress_eventcount()
 {
   const int kThreads = std::thread::hardware_concurrency();
-  const int kEvents = 1 << 16;
-  const int kQueues = 10;
+  static const int kEvents = 1 << 16;
+  static const int kQueues = 10;
   std::vector<EventCount::Waiter> waiters(kThreads);
   EventCount ec(waiters);
@@ -77,15 +88,15 @@ static void test_stress_eventcount()
   std::vector<std::unique_ptr<std::thread>> producers;
   for (int i = 0; i < kThreads; i++) {
     producers.emplace_back(new std::thread([&ec, &queues]() {
-      unsigned rnd = std::hash<std::thread::id>()(std::this_thread::get_id());
-      for (int i = 0; i < kEvents; i++) {
-        unsigned idx = rand_r(&rnd) % kQueues;
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
         if (queues[idx].Push()) {
           ec.Notify(false);
           continue;
         }
         std::this_thread::yield();
-        i--;
+        j--;
       }
     }));
   }
@@ -94,11 +105,11 @@ static void test_stress_eventcount()
   for (int i = 0; i < kThreads; i++) {
     consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
       EventCount::Waiter& w = waiters[i];
-      unsigned rnd = std::hash<std::thread::id>()(std::this_thread::get_id());
-      for (int i = 0; i < kEvents; i++) {
-        unsigned idx = rand_r(&rnd) % kQueues;
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
         if (queues[idx].Pop()) continue;
-        i--;
+        j--;
         ec.Prewait(&w);
         bool empty = true;
         for (int q = 0; q < kQueues; q++) {

View File

@@ -9,9 +9,22 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #define EIGEN_USE_THREADS
+#include <cstdlib>
 #include "main.h"
 #include <Eigen/CXX11/ThreadPool>
+// Visual studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);
+#endif
+}
 void test_basic_runqueue()
 {
   RunQueue<int, 4> q;
@@ -105,11 +118,11 @@ void test_empty_runqueue()
   unsigned rnd = 0;
   std::vector<int> stolen;
   for (int i = 0; i < 1 << 18; i++) {
-    if (rand_r(&rnd) % 2)
+    if (rand_reentrant(&rnd) % 2)
       VERIFY_IS_EQUAL(0, q.PushFront(1));
     else
       VERIFY_IS_EQUAL(0, q.PushBack(1));
-    if (rand_r(&rnd) % 2)
+    if (rand_reentrant(&rnd) % 2)
       VERIFY_IS_EQUAL(1, q.PopFront());
     else {
       for (;;) {
@@ -138,7 +151,7 @@ void test_empty_runqueue()
 // PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass.
 void test_stress_runqueue()
 {
-  const int kEvents = 1 << 18;
+  static const int kEvents = 1 << 18;
   RunQueue<int, 8> q;
   std::atomic<int> total(0);
   std::vector<std::unique_ptr<std::thread>> threads;
@@ -166,30 +179,30 @@ void test_stress_runqueue()
   for (int i = 0; i < 2; i++) {
     threads.emplace_back(new std::thread([&q, &total]() {
       int sum = 0;
-      for (int i = 1; i < kEvents; i++) {
-        if (q.PushBack(i) == 0) {
-          sum += i;
+      for (int j = 1; j < kEvents; j++) {
+        if (q.PushBack(j) == 0) {
+          sum += j;
           continue;
         }
         std::this_thread::yield();
-        i--;
+        j--;
       }
       total += sum;
     }));
     threads.emplace_back(new std::thread([&q, &total]() {
       int sum = 0;
       std::vector<int> stolen;
-      for (int i = 1; i < kEvents;) {
+      for (int j = 1; j < kEvents;) {
         if (q.PopBackHalf(&stolen) == 0) {
           std::this_thread::yield();
           continue;
         }
-        while (stolen.size() && i < kEvents) {
+        while (stolen.size() && j < kEvents) {
           int v = stolen.back();
           stolen.pop_back();
           VERIFY_IS_NOT_EQUAL(v, 0);
           sum += v;
-          i++;
+          j++;
         }
       }
       while (stolen.size()) {

View File

@@ -20,6 +20,8 @@ static void test_0d()
   TensorFixedSize<float, Sizes<> > scalar1;
   TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
   VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
   scalar1() = 7.0;
   scalar2() = 13.0;