Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-09-12 09:23:12 +08:00
Improvements to parallelFor.
Move some scalar functors from TensorFunctors.h to Eigen core.
parent ae9688f313
commit e55deb21c5
@@ -89,13 +89,13 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
   enum {
     Conj = NumTraits<LhsScalar>::IsComplex
   };
 
   typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
 
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
   { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
 
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
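For readers skimming the context above: scalar_conj_product_op computes conj(a) * b, conjugating only when the left scalar type is complex. A plain-C++ illustration of the scalar path (not Eigen code; values chosen for the check):

```cpp
#include <cassert>
#include <complex>

// What scalar_conj_product_op computes for complex inputs: conj(a) * b,
// the kernel of a complex dot product.
int main() {
  const std::complex<double> a(1.0, 2.0), b(3.0, -1.0);
  const std::complex<double> r = std::conj(a) * b;  // (1-2i)*(3-i) = 1-7i
  assert(r == std::complex<double>(1.0, -7.0));
}
```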
@@ -591,6 +591,47 @@ template<typename Scalar>
 struct functor_traits<scalar_inverse_mult_op<Scalar> >
 { enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
 
+/** \internal
+  * \brief Template functor to compute the modulo between an array and a fixed scalar.
+  */
+template <typename Scalar>
+struct scalar_mod_op {
+  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  const Scalar m_divisor;
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to compute the modulo between two arrays.
+  */
+template <typename Scalar>
+struct scalar_mod2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to compute the float modulo between two arrays.
+  */
+template <typename Scalar>
+struct scalar_fmod_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& a, const Scalar& b) const {
+    return numext::fmod(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_fmod_op<Scalar> > {
+  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
+         PacketAccess = false };
+};
+
 } // end namespace internal
 
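The three functors above are ordinary coefficient-wise functors; the public Array API reaches the same machinery through unaryExpr/binaryExpr. A minimal sketch of the equivalent element-wise modulo (illustrative values):

```cpp
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXi a(5);
  a << 10, 11, 12, 13, 14;
  // unaryExpr applies a scalar functor coefficient-wise; this is the same
  // mechanism the new scalar_mod_op plugs into, shown here with a lambda.
  const Eigen::ArrayXi r = a.unaryExpr([](int x) { return x % 4; });
  std::cout << r.transpose() << "\n";  // prints: 2 3 0 1 2
}
```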
@@ -496,7 +496,7 @@ struct functor_traits<scalar_digamma_op<Scalar> >
     PacketAccess = packet_traits<Scalar>::HasDiGamma
   };
 };
 
 /** \internal
   * \brief Template functor to compute the Riemann Zeta function of two arguments.
   * \sa class CwiseUnaryOp, Cwise::zeta()
@@ -587,6 +587,33 @@ struct functor_traits<scalar_erfc_op<Scalar> >
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the sigmoid of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
+  */
+template <typename T>
+struct scalar_sigmoid_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    const T one = T(1);
+    return one / (one + numext::exp(-x));
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    return pdiv(one, padd(one, pexp(pnegate(x))));
+  }
+};
+
+template <typename T>
+struct functor_traits<scalar_sigmoid_op<T> > {
+  enum {
+    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
+    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
+  };
+};
+
 /** \internal
   * \brief Template functor to compute the atan of a scalar
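A quick scalar sanity check of what the functor computes, 1 / (1 + exp(-x)) (plain C++, illustrative):

```cpp
#include <cassert>
#include <cmath>

// Reference for what scalar_sigmoid_op computes: 1 / (1 + exp(-x)).
double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  assert(sigmoid(0.0) == 0.5);      // exp(-0) == 1 exactly
  assert(sigmoid(100.0) > 0.999);   // saturates toward 1
  assert(sigmoid(-100.0) < 0.001);  // saturates toward 0
}
```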
@@ -627,7 +654,7 @@ template<typename Scalar> struct scalar_tanh_op {
     const Packet plus_9 = pset1<Packet>(9.0);
     const Packet minus_9 = pset1<Packet>(-9.0);
     const Packet x = pmax(minus_9, pmin(plus_9, _x));
 
     // The monomial coefficients of the numerator polynomial (odd).
     const Packet alpha_1 = pset1<Packet>(4.89352455891786e-03);
     const Packet alpha_3 = pset1<Packet>(6.37261928875436e-04);
@@ -636,16 +663,16 @@ template<typename Scalar> struct scalar_tanh_op {
     const Packet alpha_9 = pset1<Packet>(-8.60467152213735e-11);
     const Packet alpha_11 = pset1<Packet>(2.00018790482477e-13);
     const Packet alpha_13 = pset1<Packet>(-2.76076847742355e-16);
 
     // The monomial coefficients of the denominator polynomial (even).
     const Packet beta_0 = pset1<Packet>(4.89352518554385e-03);
     const Packet beta_2 = pset1<Packet>(2.26843463243900e-03);
     const Packet beta_4 = pset1<Packet>(1.18534705686654e-04);
     const Packet beta_6 = pset1<Packet>(1.19825839466702e-06);
 
     // Since the polynomials are odd/even, we need x^2.
     const Packet x2 = pmul(x, x);
 
     // Evaluate the numerator polynomial p.
     Packet p = pmadd(x2, alpha_13, alpha_11);
     p = pmadd(x2, p, alpha_9);
@@ -654,12 +681,12 @@ template<typename Scalar> struct scalar_tanh_op {
     p = pmadd(x2, p, alpha_3);
     p = pmadd(x2, p, alpha_1);
     p = pmul(x, p);
 
     // Evaluate the denominator polynomial p.
     Packet q = pmadd(x2, beta_6, beta_4);
     q = pmadd(x2, q, beta_2);
     q = pmadd(x2, q, beta_0);
 
     // Divide the numerator by the denominator.
     return pdiv(p, q);
   }
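Context for the hunks above: scalar_tanh_op evaluates tanh(x) ≈ x·P(x²)/Q(x²) on a clamped input, with an odd numerator P and even denominator Q evaluated by Horner's rule (the pmadd chain). A scalar sketch of the same structure; alpha_5 and alpha_7 fall between the hunks shown, so the coefficient arrays are left as parameters rather than reproduced:

```cpp
#include <algorithm>

// Scalar sketch of the packet code above: tanh(x) ~ x * P(x^2) / Q(x^2) with
// the input clamped to [-9, 9]. alpha/beta stand for the full coefficient
// arrays (lowest degree first); only the evaluation structure is shown here.
template <int N, int M>
float rational_tanh(float x, const float (&alpha)[N], const float (&beta)[M]) {
  x = std::max(-9.0f, std::min(9.0f, x));  // clamp, as pmax/pmin do
  const float x2 = x * x;                  // odd/even polynomials need x^2
  float p = alpha[N - 1];                  // Horner, like the pmadd chain
  for (int i = N - 2; i >= 0; --i) p = p * x2 + alpha[i];
  p *= x;                                  // numerator is odd: x * P(x^2)
  float q = beta[M - 1];
  for (int i = M - 2; i >= 0; --i) q = q * x2 + beta[i];
  return p / q;                            // pdiv(p, q)
}
```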
@@ -938,7 +965,7 @@ struct scalar_sign_op<Scalar,true> {
 template<typename Scalar>
 struct functor_traits<scalar_sign_op<Scalar> >
 { enum {
     Cost =
         NumTraits<Scalar>::IsComplex
             ? ( 8*NumTraits<Scalar>::MulCost )  // roughly
             : ( 3*NumTraits<Scalar>::AddCost),
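On the cost estimate for real scalars: a sign function can be computed branchlessly with two comparisons and a subtraction, which is roughly the 3*AddCost used above. A sketch of that idea, not Eigen's implementation (which lies outside this hunk):

```cpp
// Branchless sign: two compares plus a subtract, ~3 add-cost operations.
template <typename Scalar>
Scalar sign_sketch(Scalar x) {
  return Scalar((Scalar(0) < x) - (x < Scalar(0)));  // -1, 0, or +1
}
```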
@@ -69,6 +69,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorMacros.h"
 #include "src/Tensor/TensorForwardDeclarations.h"
 #include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorDeviceDefault.h"
 #include "src/Tensor/TensorDeviceThreadPool.h"
 #include "src/Tensor/TensorDeviceCuda.h"
@@ -83,7 +84,6 @@ typedef unsigned __int64 uint64_t;
 
 #include "src/Tensor/TensorBase.h"
 
-#include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorEvaluator.h"
 #include "src/Tensor/TensorExpr.h"
 #include "src/Tensor/TensorReduction.h"
@@ -172,67 +172,69 @@ struct ThreadPoolDevice {
     pool_->Schedule(func);
   }
 
-  // parallelFor executes f with [0, size) arguments in parallel and waits for
-  // completion. Block size is choosen between min_block_size and
-  // 2 * min_block_size to achieve the best parallel efficiency.
-  // If min_block_size == -1, parallelFor uses block size of 1.
-  // If hard_align > 0, block size is aligned to hard_align.
-  // If soft_align > hard_align, block size is aligned to soft_align provided
-  // that it does not increase block size too much.
-  void parallelFor(Index size, Index min_block_size, Index hard_align,
-                   Index soft_align,
+  // parallelFor executes f with [0, n) arguments in parallel and waits for
+  // completion. F accepts a half-open interval [first, last).
+  // Block size is choosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<Index(Index)> block_align,
                    std::function<void(Index, Index)> f) const {
-    if (size <= 1 || (min_block_size != -1 && size < min_block_size) ||
-        numThreads() == 1) {
-      f(0, size);
+    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    if (n <= 1 || numThreads() == 1 ||
+        CostModel::numThreads(n, cost, numThreads()) == 1) {
+      f(0, n);
       return;
     }
 
-    Index block_size = 1;
-    Index block_count = size;
-    if (min_block_size != -1) {
-      // Calculate block size based on (1) estimated cost and (2) parallel
-      // efficiency. We want blocks to be not too small to mitigate
-      // parallelization overheads; not too large to mitigate tail effect and
-      // potential load imbalance and we also want number of blocks to be evenly
-      // dividable across threads.
-      min_block_size = numext::maxi<Index>(min_block_size, 1);
-      block_size = numext::mini(min_block_size, size);
-      // Upper bound on block size:
-      const Index max_block_size = numext::mini(min_block_size * 2, size);
-      block_size = numext::mini(
-          alignBlockSize(block_size, hard_align, soft_align), size);
-      block_count = divup(size, block_size);
-      // Calculate parallel efficiency as fraction of total CPU time used for
-      // computations:
-      double max_efficiency =
-          static_cast<double>(block_count) /
-          (divup<int>(block_count, numThreads()) * numThreads());
-      // Now try to increase block size up to max_block_size as long as it
-      // doesn't decrease parallel efficiency.
-      for (Index prev_block_count = block_count; prev_block_count > 1;) {
-        // This is the next block size that divides size into a smaller number
-        // of blocks than the current block_size.
-        Index coarser_block_size = divup(size, prev_block_count - 1);
-        coarser_block_size =
-            alignBlockSize(coarser_block_size, hard_align, soft_align);
-        if (coarser_block_size > max_block_size) {
-          break;  // Reached max block size. Stop.
-        }
-        // Recalculate parallel efficiency.
-        const Index coarser_block_count = divup(size, coarser_block_size);
-        eigen_assert(coarser_block_count < prev_block_count);
-        prev_block_count = coarser_block_count;
-        const double coarser_efficiency =
-            static_cast<double>(coarser_block_count) /
-            (divup<int>(coarser_block_count, numThreads()) * numThreads());
-        if (coarser_efficiency + 0.01 >= max_efficiency) {
-          // Taking it.
-          block_size = coarser_block_size;
-          block_count = coarser_block_count;
-          if (max_efficiency < coarser_efficiency) {
-            max_efficiency = coarser_efficiency;
-          }
+    // Calculate block size based on (1) the iteration cost and (2) parallel
+    // efficiency. We want blocks to be not too small to mitigate
+    // parallelization overheads; not too large to mitigate tail
+    // effect and potential load imbalance and we also want number
+    // of blocks to be evenly dividable across threads.
+
+    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
+    const Index max_block_size =
+        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+    Index block_count = divup(n, block_size);
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) /
+        (divup<int>(block_count, numThreads()) * numThreads());
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = divup(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
+      const Index coarser_block_count = divup(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency =
+          static_cast<double>(coarser_block_count) /
+          (divup<int>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
       }
     }
   }
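The loop above trades larger blocks against parallel efficiency, where efficiency is the fraction of scheduled thread slots doing useful work. Here is the same heuristic extracted into a standalone function (a sketch: divup and the 1% tolerance mirror the diff, everything else is illustrative):

```cpp
#include <cstdio>

// Ceiling division, as Eigen's divup.
long divup(long x, long y) { return (x + y - 1) / y; }

// Mirror of the efficiency loop above: grow the block size toward
// max_block_size as long as the block count still divides evenly enough
// across threads (within the same 1% tolerance).
long pick_block_size(long n, long block_size, long max_block_size,
                     int threads) {
  long block_count = divup(n, block_size);
  double max_eff =
      double(block_count) / (divup(block_count, threads) * threads);
  for (long prev = block_count; prev > 1;) {
    const long coarser = divup(n, prev - 1);  // next size with fewer blocks
    if (coarser > max_block_size) break;      // reached the upper bound
    const long count = divup(n, coarser);
    prev = count;
    const double eff = double(count) / (divup(count, threads) * threads);
    if (eff + 0.01 >= max_eff) {              // accept near-equal efficiency
      block_size = coarser;
      if (eff > max_eff) max_eff = eff;
    }
  }
  return block_size;
}

int main() {
  // 1000 items, initial block 16 (63 blocks), cap 32, 8 threads: settles on
  // 32, giving 32 blocks that split 4-per-thread with no idle slots.
  std::printf("%ld\n", pick_block_size(1000, 16, 32, 8));
}
```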
@@ -251,26 +253,20 @@ struct ThreadPoolDevice {
       }
       // Split into halves and submit to the pool.
       Index mid = first + divup((last - first) / 2, block_size) * block_size;
-      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+      enqueue_func([=, &handleRange]() { handleRange(mid, last); });
+      enqueue_func([=, &handleRange]() { handleRange(first, mid); });
     };
-    handleRange(0, size);
+    handleRange(0, n);
     barrier.Wait();
   }
 
- private:
-  static Index alignBlockSize(Index size, Index hard_align, Index soft_align) {
-    if (soft_align > hard_align && size >= 4 * soft_align) {
-      // Align to soft_align, if it won't increase size by more than 25%.
-      return (size + soft_align - 1) & ~(soft_align - 1);
-    }
-    if (hard_align > 0) {
-      return (size + hard_align - 1) & ~(hard_align - 1);
-    }
-    return size;
+  // Convinience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
   }
 
+ private:
   ThreadPoolInterface* pool_;
   size_t num_threads_;
 };
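A sketch of how a caller might drive the new interface through the convenience overload. Names and constructor signatures (Eigen::ThreadPool, ThreadPoolDevice taking a pool pointer and core count, TensorOpCost taking bytes loaded, bytes stored, and compute cycles per iteration) reflect this era of the codebase and should be treated as assumptions, not a definitive API reference:

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  // Device setup: a thread pool plus a core-count hint.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);

  std::vector<float> data(1 << 20, 1.0f);
  // Rough per-iteration cost: 4 bytes loaded, 4 stored, 1 compute cycle.
  const Eigen::TensorOpCost cost(4, 4, 1);

  // Convenience overload: no block_align callback; f gets [first, last).
  device.parallelFor(static_cast<Eigen::Index>(data.size()), cost,
                     [&data](Eigen::Index first, Eigen::Index last) {
                       for (Eigen::Index i = first; i < last; ++i)
                         data[i] *= 2.0f;
                     });
}
```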
@@ -137,6 +137,13 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
     {
       const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
       const Index size = array_prod(evaluator.dimensions());
+#if defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL)
+      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+                         EvalRange::alignBlockSize,
+                         [&evaluator](Index first, Index last) {
+                           EvalRange::run(&evaluator, first, last);
+                         });
+#else
       size_t num_threads = device.numThreads();
 #ifdef EIGEN_USE_COST_MODEL
       if (num_threads > 1) {
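Here EvalRange::alignBlockSize is passed as the block_align callback. A callback of that shape rounds the block size up to a multiple of the packet width so vectorized inner loops never straddle a block boundary; rounding up also satisfies the eigen_assert(new_block_size >= block_size) in parallelFor. A sketch, with PacketSize = 8 as an illustrative constant:

```cpp
#include <Eigen/Core>

// Sketch of a block_align callback in the spirit of EvalRange::alignBlockSize:
// round the requested block size up to a multiple of the packet width.
// kPacketSize = 8 (e.g. floats per AVX register) is assumed for illustration.
Eigen::Index alignToPacket(Eigen::Index block_size) {
  const Eigen::Index kPacketSize = 8;
  return ((block_size + kPacketSize - 1) / kPacketSize) * kPacketSize;
}
```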
@@ -163,11 +170,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
         }
         barrier.Wait();
       }
+#endif  // EIGEN_USE_NONBLOCKING_THREAD_POOL
     }
     evaluator.cleanup();
   }
 };
-#endif
+#endif // EIGEN_USE_THREADS
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
@@ -13,77 +13,6 @@
 namespace Eigen {
 namespace internal {
 
-
-/** \internal
- * \brief Template functor to compute the modulo between an array and a scalar.
- */
-template <typename Scalar>
-struct scalar_mod_op {
-  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
-  const Scalar m_divisor;
-};
-template <typename Scalar>
-struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
-
-
-/** \internal
- * \brief Template functor to compute the modulo between 2 arrays.
- */
-template <typename Scalar>
-struct scalar_mod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
-};
-template <typename Scalar>
-struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
-
-template <typename Scalar>
-struct scalar_fmod_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
-  operator()(const Scalar& a, const Scalar& b) const {
-    return numext::fmod(a, b);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_fmod_op<Scalar> > {
-  enum { Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
-         PacketAccess = false };
-};
-
-
-/** \internal
- * \brief Template functor to compute the sigmoid of a scalar
- * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
- */
-template <typename T>
-struct scalar_sigmoid_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
-    const T one = T(1);
-    return one / (one + numext::exp(-x));
-  }
-
-  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet packetOp(const Packet& x) const {
-    const Packet one = pset1<Packet>(T(1));
-    return pdiv(one, padd(one, pexp(pnegate(x))));
-  }
-};
-
-template <typename T>
-struct functor_traits<scalar_sigmoid_op<T> > {
-  enum {
-    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
-    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
-                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
-  };
-};
-
-
 // Standard reduction functors
 template <typename T> struct SumReducer
 {