Merged eigen/eigen into default
commit cb3c059fa4
@@ -406,18 +406,39 @@ typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_trai
 predux_half_dowto4(const Packet& a)
 { return a; }
 
-/** \internal \returns the product of the elements of \a a*/
+/** \internal \returns the product of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }
 
-/** \internal \returns the min of the elements of \a a*/
+/** \internal \returns the min of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }
 
-/** \internal \returns the max of the elements of \a a*/
+/** \internal \returns the max of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }
 
+/** \internal \returns true if all coeffs of \a a mean "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeffs of \a a mean "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non-zero and all the same.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or the first bit equals 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  return bool(predux(a));
+}
+
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
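The generic predux_any above composes with the pcmp_* predicates: callers build a lane mask and branch on the reduction. A minimal usage sketch, not part of the patch (the threshold pattern mirrors the psincos_float change further below; the internal:: names are assumed in scope):

  template<typename Packet>
  bool any_lane_exceeds(const Packet& x, float threshold) {
    // lanes where |x| >= threshold become all-ones ("true"), others stay zero
    Packet mask = pcmp_le(pset1<Packet>(threshold), pabs(x));
    return predux_any(mask);  // true if at least one lane matched
  }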
@@ -576,6 +576,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
   return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+//   return _mm256_movemask_ps(x)==0xFF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+{
+  return _mm256_movemask_ps(x)!=0;
+}
+
 template<int Offset>
 struct palign_impl<Offset,Packet8f>
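The AVX overload works because _mm256_movemask_ps collects the sign bit of each of the eight lanes into the low byte of an int, and pcmp_* results have all bits (including the sign bit) set in "true" lanes. A standalone sketch of that property (assumes an AVX-capable target; not Eigen code):

  #include <immintrin.h>
  #include <cassert>

  int main() {
    // lane 2 "true" (all-ones), all other lanes "false" (zero)
    __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0, 0, -1, 0, 0, 0, 0, 0));
    int bits = _mm256_movemask_ps(mask);  // one bit per lane, taken from the sign bits
    assert(bits == 0x4);                  // only bit 2 is set
    assert(bits != 0);                    // predux_any(mask) would be true
  }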
@@ -47,6 +47,7 @@ plog<Packet16f>(const Packet16f& _x) {
   // The smallest non denormalized float number.
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
 
   // Polynomial coefficients.
@@ -116,10 +117,16 @@ plog<Packet16f>(const Packet16f& _x) {
   x = padd(x, y);
   x = padd(x, y2);
 
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return _mm512_mask_blend_ps(iszero_mask,
-                              _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
-                              p16f_minus_inf);
+  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN,
+  //  - 0 will be -INF,
+  //  - +INF will be +INF.
+  return _mm512_mask_blend_ps(iszero_mask,
+                              _mm512_mask_blend_ps(invalid_mask,
+                                                   _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+                                                   p16f_nan),
+                              p16f_minus_inf);
 }
 #endif
 
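_mm512_mask_blend_ps(k, a, b) picks b in lanes where the mask bit is set and a elsewhere, so the nested blends above apply the special cases with increasing priority: +inf passes through, negative arguments become NaN, and zero wins with -inf. A scalar model of that selection chain (a sketch under the same mask names; not Eigen code):

  #include <limits>

  float blend(bool k, float a, float b) { return k ? b : a; }  // one lane of _mm512_mask_blend_ps

  float plog_fixup(float x_log, bool iszero_mask, bool invalid_mask, bool pos_inf_mask) {
    const float inf_v = std::numeric_limits<float>::infinity();
    const float nan_v = std::numeric_limits<float>::quiet_NaN();
    return blend(iszero_mask,
                 blend(invalid_mask, blend(pos_inf_mask, x_log, inf_v), nan_v),
                 -inf_v);
  }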
@@ -283,9 +283,27 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
 }
 #endif
 
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b));
+  return cat256(lo, hi);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b));
+  return cat256(lo, hi);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b));
+  return cat256(lo, hi);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
-  __m256 lo = _mm256_cmp_ps(extract256<0>(a), extract256<0>(b), _CMP_NGE_UQ);
-  __m256 hi = _mm256_cmp_ps(extract256<1>(a), extract256<1>(b), _CMP_NGE_UQ);
+  __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b));
+  __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b));
   return cat256(lo, hi);
 }
 
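The rewritten pcmp_lt_or_nan keeps the semantics of the removed _CMP_NGE_UQ predicate: true when a is not greater than or equal to b, which covers both a < b and unordered (NaN) operands. A scalar model with a few checks (a sketch; not Eigen code):

  #include <cassert>
  #include <cmath>

  // scalar equivalent of _CMP_NGE_UQ: !(a >= b); NaN compares unordered
  bool lt_or_nan(float a, float b) { return !(a >= b); }

  int main() {
    assert( lt_or_nan(1.0f, 2.0f));          // a < b
    assert(!lt_or_nan(2.0f, 1.0f));          // a > b
    assert(!lt_or_nan(1.0f, 1.0f));          // equal
    assert( lt_or_nan(std::nanf(""), 1.0f)); // unordered operand -> true
  }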
@@ -964,6 +982,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
   return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }
 
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+  Packet16i xi = _mm512_castps_si512(x);
+  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+  return !_mm512_kortestz(tmp,tmp);
+}
+
 template <int Offset>
 struct palign_impl<Offset, Packet16f> {
   static EIGEN_STRONG_INLINE void run(Packet16f& first,
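On AVX-512 the reduction goes through a mask register instead of a movemask: _mm512_test_epi32_mask(xi, xi) sets mask bit i when lane i is nonzero, and _mm512_kortestz(tmp, tmp) returns 1 only when the OR of its two arguments is all zeros. A scalar model of those two intrinsics (a sketch; not Eigen code):

  #include <cstdint>

  // per-lane nonzero test, then "is the whole mask empty?"
  bool predux_any_model(const std::uint32_t lanes[16]) {
    std::uint16_t mask = 0;
    for (int i = 0; i < 16; ++i)
      if (lanes[i] & lanes[i]) mask |= std::uint16_t(1u << i);  // _mm512_test_epi32_mask
    bool kortestz = (mask | mask) == 0;                         // _mm512_kortestz
    return !kortestz;
  }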
@@ -720,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
   return pfirst(res);
 }
 
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return vec_any_ne(x, pzero(x));
+}
+
 template<int Offset>
 struct palign_impl<Offset,Packet4f>
 {
@@ -269,88 +269,121 @@ EIGEN_UNUSED
 Packet psincos_float(const Packet& _x)
 {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_1 = pset1<Packet>(1.0f);
-  const Packet cst_half = pset1<Packet>(0.5f);
-
-  const PacketI csti_1 = pset1<PacketI>(1);
-  const PacketI csti_not1 = pset1<PacketI>(~1);
-  const PacketI csti_2 = pset1<PacketI>(2);
-  const PacketI csti_3 = pset1<PacketI>(3);
-
-  const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
-
-  const Packet cst_minus_cephes_DP1 = pset1<Packet>(-0.78515625f);
-  const Packet cst_minus_cephes_DP2 = pset1<Packet>(-2.4187564849853515625e-4f);
-  const Packet cst_minus_cephes_DP3 = pset1<Packet>(-3.77489497744594108e-8f);
-  const Packet cst_sincof_p0 = pset1<Packet>(-1.9515295891E-4f);
-  const Packet cst_sincof_p1 = pset1<Packet>( 8.3321608736E-3f);
-  const Packet cst_sincof_p2 = pset1<Packet>(-1.6666654611E-1f);
-  const Packet cst_coscof_p0 = pset1<Packet>( 2.443315711809948E-005f);
-  const Packet cst_coscof_p1 = pset1<Packet>(-1.388731625493765E-003f);
-  const Packet cst_coscof_p2 = pset1<Packet>( 4.166664568298827E-002f);
-  const Packet cst_cephes_FOPI = pset1<Packet>( 1.27323954473516f); // 4 / M_PI
+  const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+  const Packet cst_rounding_magic = pset1<Packet>(12582912); // 1.5*2^23 for rounding
+  const PacketI csti_1 = pset1<PacketI>(1);
+  const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
 
   Packet x = pabs(_x);
 
-  // Scale x by 4/Pi to find x's octant.
-  Packet y = pmul(x, cst_cephes_FOPI);
+  // Scale x by 2/Pi to find x's octant.
+  Packet y = pmul(x, cst_2oPI);
 
-  // Get the octant. We'll reduce x by this number of octants or by one more than it.
-  PacketI y_int = pcast<Packet,PacketI>(y);
-  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
-  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
-  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
-  PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...)
-  y = pcast<PacketI,Packet>(y_int1);
+  // Rounding trick:
+  Packet y_round = padd(y, cst_rounding_magic);
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic); // nearest integer to x*2/pi
 
   // Compute the sign to apply to the polynomial.
-  // sign = third_bit(y_int1) xor signbit(_x)
-  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<29>(y_int1)))
-                                : preinterpret<Packet>(pshiftleft<29>(padd(y_int1,csti_3)));
+  // sin: sign = second_bit(y_int) xor signbit(_x)
+  // cos: sign = second_bit(y_int+1)
+  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
+                                : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
   sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
 
-  // Get the polynomial selection mask from the second bit of y_int1
+  // Get the polynomial selection mask from the first bit of y_int
   // We'll calculate both (sin and cos) polynomials and then select from the two.
-  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1)));
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
 
-  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
-  // The magic pass: "Extended precision modular arithmetic"
-  // x = ((x - y * DP1) - y * DP2) - y * DP3
-  x = pmadd(y, cst_minus_cephes_DP1, x);
-  x = pmadd(y, cst_minus_cephes_DP2, x);
-  x = pmadd(y, cst_minus_cephes_DP3, x);
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
+  // using "Extended precision modular arithmetic"
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+  // This version requires true FMA for high accuracy.
+  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
+  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+#else
+  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively,
+  // and 2 ULP up to:
+  const float huge_th = ComputeSine ? 25966.f : 18838.f;
+  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+  // For the record, the following set of coefficients maintain 2ULP up
+  // to a slightly larger range:
+  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+  // but it slightly fails to maintain 1ULP for two values of sin below pi.
+  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+  // For the record, with only 3 iterations it is possible to maintain
+  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+#endif
+
+  // We use huge_vals as a temporary for abs(_x) to ensure huge_vals
+  // is fully initialized for the last pselect(). (prevent compiler warning)
+  Packet huge_vals = pabs(_x);
+  Packet huge_mask = pcmp_le(pset1<Packet>(huge_th),huge_vals);
+
+  if(predux_any(huge_mask))
+  {
+    const int PacketSize = unpacket_traits<Packet>::size;
+#if EIGEN_HAS_CXX11
+    alignas(Packet) float vals[PacketSize];
+#else
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+#endif
+    pstoreu(vals, _x);
+    for(int k=0; k<PacketSize;++k) {
+      float val = vals[k];
+      if(numext::abs(val)>=huge_th) {
+        vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
+      }
+    }
+    huge_vals = ploadu<Packet>(vals);
+  }
 
   Packet x2 = pmul(x,x);
 
-  // Evaluate the cos(x) polynomial. (0 <= x <= Pi/4)
-  Packet y1 = cst_coscof_p0;
-  y1 = pmadd(y1, x2, cst_coscof_p1);
-  y1 = pmadd(y1, x2, cst_coscof_p2);
-  y1 = pmul(y1, x2);
-  y1 = pmul(y1, x2);
-  y1 = psub(y1, pmul(x2, cst_half));
-  y1 = padd(y1, cst_1);
+  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
 
-  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0)
-  Packet y2 = cst_sincof_p0;
-  y2 = pmadd(y2, x2, cst_sincof_p1);
-  y2 = pmadd(y2, x2, cst_sincof_p2);
+  // Evaluate the sin(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  // octave/matlab code to compute those coefficients:
+  //    x = (0:0.0001:pi/4)';
+  //    A = [x.^3 x.^5 x.^7];
+  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
+  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+  //
+  Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
   y2 = pmul(y2, x2);
   y2 = pmadd(y2, x, x);
 
-  // Select the correct result from the two polynoms.
+  // Select the correct result from the two polynomials.
   y = ComputeSine ? pselect(poly_mask,y2,y1)
                   : pselect(poly_mask,y1,y2);
 
-  // For very large arguments the the reduction to the [-Pi/4,+Pi/4] range
-  // does not work thus leading to sine/cosine out of the [-1:1] range.
-  // Since computing the sine/cosine for very large entry entries makes little
-  // sense in term of accuracy, we simply clamp to [-1,1]:
-  y = pmin(y,pset1<Packet>( 1.f));
-  y = pmax(y,pset1<Packet>(-1.f));
-
-  // Update the sign
-  return pxor(y, sign_bit);
+  // Update the sign and filter huge inputs
+  return pselect(huge_mask, huge_vals, pxor(y, sign_bit));
 }
 
 template<typename Packet>
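The rounding trick above deserves a closer look: in round-to-nearest mode, adding 1.5*2^23 to a small float leaves round(y) in the low mantissa bits, so the octant index is obtained with a bit reinterpret instead of a float-to-int cast, and subtracting the magic constant back yields the rounded value as a float. A standalone sketch (assumes IEEE-754 binary32 and round-to-nearest-even; not Eigen code):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    const float rounding_magic = 12582912.0f;   // 1.5 * 2^23, as in cst_rounding_magic
    float y = 3.7f;
    float y_round = y + rounding_magic;         // float spacing here is 1.0, so this rounds
    std::int32_t bits;
    std::memcpy(&bits, &y_round, sizeof bits);  // the preinterpret step
    assert((bits & 0x3fffff) == 4);             // low mantissa bits hold round(3.7) = 4
    assert(y_round - rounding_magic == 4.0f);   // the psub step recovers it as a float
  }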
@@ -551,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
   return vget_lane_s32(max, 0);
 }
 
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+                            vget_high_u32(vreinterpretq_u32_f32(x)));
+  return vget_lane_u32(vpmax_u32(tmp,tmp),0);
+}
+
 // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
 // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
 #define PALIGN_NEON(Offset,Type,Command) \
@@ -813,6 +813,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 #endif // EIGEN_VECTORIZE_SSE4_1
 }
 
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+// {
+//   return _mm_movemask_ps(x) == 0xF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return _mm_movemask_ps(x) != 0x0;
+}
+
 #if EIGEN_COMP_GNUC
 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
 // {
@@ -104,7 +104,8 @@
       STORAGE_INDEX_MUST_MATCH=1,
       CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
       SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
-      INVALID_TEMPLATE_PARAMETER=1
+      INVALID_TEMPLATE_PARAMETER=1,
+      GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
     };
   };
 
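The new enum name feeds the pre-C++11 EIGEN_STATIC_ASSERT machinery used in the TensorContractionGpu hunk below: the macro instantiates a helper template whose true specialization defines the enum, so a failed condition produces a compile error that spells out the identifier. A simplified model of that pattern (a sketch, not Eigen's exact macro):

  // only the <true> specialization defines the named enum
  template<bool Condition> struct static_assertion;
  template<> struct static_assertion<true> {
    enum { GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS = 1 };
  };

  // compiles: the condition is true
  enum { ok = static_assertion<true>::GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS };
  // a false condition would fail with the enum name visible in the error:
  // enum { bad = static_assertion<false>::GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS };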
@@ -577,17 +577,22 @@ template<typename Scalar,typename Packet> void packetmath_real()
   if(PacketTraits::HasCos)
   {
     packet_helper<PacketTraits::HasCos,Packet> h;
-    for(Scalar k = 1; k<Scalar(1000)/std::numeric_limits<Scalar>::epsilon(); k*=2) {
-      data1[0] = k*Scalar(EIGEN_PI) * internal::random<Scalar>(0.8,1.2);
-      data1[1] = (k+1)*Scalar(EIGEN_PI) * internal::random<Scalar>(0.8,1.2);
-      h.store(data2, internal::pcos(h.load(data1)));
-      VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
-      VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
-      data1[0] = (2*k+1)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
-      data1[1] = (2*k+3)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
-      h.store(data2, internal::psin(h.load(data1)));
-      VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
-      VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
+    for(Scalar k = 1; k<Scalar(10000)/std::numeric_limits<Scalar>::epsilon(); k*=2)
+    {
+      for(int k1=0;k1<=1; ++k1)
+      {
+        data1[0] = (2*k+k1  )*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
+        data1[1] = (2*k+2+k1)*Scalar(EIGEN_PI)/2 * internal::random<Scalar>(0.8,1.2);
+        h.store(data2,            internal::pcos(h.load(data1)));
+        h.store(data2+PacketSize, internal::psin(h.load(data1)));
+        VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.));
+        VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.));
+        VERIFY(data2[PacketSize+0]<=Scalar(1.) && data2[PacketSize+0]>=Scalar(-1.));
+        VERIFY(data2[PacketSize+1]<=Scalar(1.) && data2[PacketSize+1]>=Scalar(-1.));
+
+        VERIFY_IS_APPROX(numext::abs2(data2[0])+numext::abs2(data2[PacketSize+0]), Scalar(1));
+        VERIFY_IS_APPROX(numext::abs2(data2[1])+numext::abs2(data2[PacketSize+1]), Scalar(1));
+      }
     }
 
     data1[0] = std::numeric_limits<Scalar>::infinity();
@@ -605,6 +610,12 @@ template<typename Scalar,typename Packet> void packetmath_real()
     VERIFY((numext::isnan)(data2[0]));
     h.store(data2, internal::pcos(h.load(data1)));
     VERIFY((numext::isnan)(data2[0]));
+
+    data1[0] = -Scalar(0.);
+    h.store(data2, internal::psin(h.load(data1)));
+    VERIFY( internal::biteq(data2[0], data1[0]) );
+    h.store(data2, internal::pcos(h.load(data1)));
+    VERIFY_IS_EQUAL(data2[0], Scalar(1));
   }
 }
 }
@@ -642,6 +653,29 @@ template<typename Scalar,typename Packet> void packetmath_notcomplex()
   ref[i] = data1[0]+Scalar(i);
   internal::pstore(data2, internal::plset<Packet>(data1[0]));
   VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset");
+
+  {
+    unsigned char* data1_bits = reinterpret_cast<unsigned char*>(data1);
+    // predux_all - not needed yet
+    // for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
+    // VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
+    // for(int k=0; k<PacketSize; ++k)
+    // {
+    //   for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
+    //   VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
+    //   for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
+    // }
+
+    // predux_any
+    for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0x0;
+    VERIFY( (!internal::predux_any(internal::pload<Packet>(data1))) && "internal::predux_any(0000)");
+    for(int k=0; k<PacketSize; ++k)
+    {
+      for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
+      VERIFY( internal::predux_any(internal::pload<Packet>(data1)) && "internal::predux_any(0101)");
+      for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x00;
+    }
+  }
 }
 
 template<typename Scalar,typename Packet,bool ConjLhs,bool ConjRhs> void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval)
@@ -1219,9 +1219,6 @@ template<typename Indices, typename LeftArgType, typename RightArgType, typename
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
 
-  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
-                "GPU tensor contraction does not support output kernels.");
-
   typedef GpuDevice Device;
 
   typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
@@ -1274,7 +1271,11 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   typedef typename RightEvaluator::Dimensions RightDimensions;
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      Base(op, device) {}
+      Base(op, device)
+  {
+    EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
+                         GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
+  }
 
   // We need to redefine this method to make nvcc happy
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
@@ -756,6 +756,36 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     }
   }
 
+  template <int Alignment>
+  EIGEN_STRONG_INLINE void addAllToBuffer(size_t n, const Scalar* src_buf0,
+                                          const Scalar* src_buf1,
+                                          const Scalar* src_buf2,
+                                          Scalar* dst_buf) const {
+    using ::Eigen::internal::padd;
+    using ::Eigen::internal::pload;
+    using ::Eigen::internal::ploadt;
+    using ::Eigen::internal::pstoret;
+
+    const int output_packet_size =
+        internal::unpacket_traits<PacketReturnType>::size;
+
+    size_t i = 0;
+    const size_t num_packets = n / output_packet_size;
+    for (; i < output_packet_size * num_packets; i += output_packet_size) {
+      const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
+      const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
+      const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
+
+      const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
+      const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
+
+      pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
+    }
+    for (; i < n; ++i) {
+      dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
+    }
+  }
+
   // Decide whether we want to shard m x k x n contraction over the inner
   // (contraction) dimension (k).
   static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
|
||||
const Index m = this->m_i_size;
|
||||
const Index n = this->m_j_size;
|
||||
const Index k = this->m_k_size;
|
||||
const Index packet_size = internal::packet_traits<RhsScalar>::size;
|
||||
const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
|
||||
|
||||
// We will compute partial results into the buffers of this size.
|
||||
const Index buffer_size_bytes = m * n * sizeof(Scalar);
|
||||
|
||||
// The underlying GEMM kernel assumes that k is a multiple of
|
||||
// the packet size and subtle breakage occurs if this is violated.
|
||||
Index block_size = kmultiple * divup<Index>(k, kmultiple * num_threads);
|
||||
Index num_blocks = divup<Index>(k, block_size);
|
||||
// we use 'result' for the first block's partial result.
|
||||
MaxSizeVector<Scalar*> block_buffers(num_blocks - 1);
|
||||
Barrier barrier(internal::convert_index<int>(num_blocks));
|
||||
auto process_block = [=, &barrier](Scalar* buf, Index begin, Index end) {
|
||||
::memset(buf, 0, m * n * sizeof(Scalar));
|
||||
const Index packet_size = internal::packet_traits<RhsScalar>::size;
|
||||
|
||||
const auto round_up = [=](Index index) -> Index {
|
||||
const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
|
||||
return divup<Index>(index, kmultiple) * kmultiple;
|
||||
};
|
||||
|
||||
// Cost model doesn't capture well the cost associated with constructing
|
||||
// tensor contraction mappers and computing loop bounds in gemm_pack_lhs and
|
||||
// gemm_pack_rhs, so we specify minimum desired block size.
|
||||
const Index target_block_size = round_up(divup<Index>(k, num_threads));
|
||||
const Index desired_min_block_size = 12 * packet_size;
|
||||
|
||||
const Index block_size = numext::mini<Index>(
|
||||
k, numext::maxi<Index>(desired_min_block_size, target_block_size));
|
||||
const Index num_blocks = divup<Index>(k, block_size);
|
||||
|
||||
// Compute block size with accounting for potentially incomplete last block.
|
||||
const auto actual_block_size = [=](Index block_idx) -> Index {
|
||||
return block_idx + 1 < num_blocks
|
||||
? block_size
|
||||
: k + block_size - block_size * num_blocks;
|
||||
};
|
||||
|
||||
// We compute partial gemm results in parallel, and to get the final result
|
||||
// we need to add them all together. For the large number of threads (>= 48)
|
||||
// this adds a very expensive sequential step at the end.
|
||||
//
|
||||
// We split the [0, num_blocks) into small ranges, and when a task for the
|
||||
// block finishes its partial gemm computation, it checks if it was the last
|
||||
// gemm in the range, and if so, it will add all blocks of the range.
|
||||
//
|
||||
// After all tasks finihes, we need to add only these pre-aggregated blocks.
|
||||
|
||||
// Compute range size with accounting for potentially incomplete last range.
|
||||
const auto actual_range_size = [=](Index num_ranges, Index range_size,
|
||||
Index range_idx) -> Index {
|
||||
eigen_assert(range_idx < num_ranges);
|
||||
return range_idx + 1 < num_ranges
|
||||
? range_size
|
||||
: num_blocks + range_size - range_size * num_ranges;
|
||||
};
|
||||
|
||||
// For now we use just a single level of ranges to compute pre-aggregated
|
||||
// partial sums, but in general we can use more layers to compute tree
|
||||
// aggregation in parallel and reduce the size of the sequential step.
|
||||
//
|
||||
// TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
|
||||
// sense only if number of threads >= ~128?
|
||||
static const Index l0_size = 4;
|
||||
const Index l0_ranges = divup<Index>(num_blocks, l0_size);
|
||||
|
||||
// Keep count of pending gemm tasks for each l0 range.
|
||||
MaxSizeVector<std::atomic<int>> l0_state(l0_ranges);
|
||||
for (int i = 0; i < l0_ranges; ++i) {
|
||||
l0_state.emplace_back(actual_range_size(l0_ranges, l0_size, i));
|
||||
}
|
||||
|
||||
MaxSizeVector<Scalar*> block_buffers(num_blocks);
|
||||
|
||||
auto process_block = [&, this](Index block_idx, Index begin, Index end) {
|
||||
Scalar* buf = block_buffers[block_idx];
|
||||
::memset(buf, 0, buffer_size_bytes);
|
||||
|
||||
TENSOR_CONTRACTION_DISPATCH(
|
||||
this->template evalGemmPartialWithoutOutputKernel, Alignment,
|
||||
(buf, begin, end, this->m_device.numThreads()));
|
||||
barrier.Notify();
|
||||
(buf, begin, end, /*num_threads=*/num_blocks));
|
||||
|
||||
// Check if it was the last task in l0 range.
|
||||
const Index l0_index = block_idx / l0_size;
|
||||
const int v = l0_state[l0_index].fetch_sub(1);
|
||||
eigen_assert(v >= 1);
|
||||
|
||||
// If we processed the last block of the range, we can aggregate all
|
||||
// partial results into the first block of the range.
|
||||
if (v == 1) {
|
||||
const Index rng_size = actual_range_size(l0_ranges, l0_size, l0_index);
|
||||
const Index dst_block_idx = l0_index * l0_size;
|
||||
|
||||
if (rng_size == l0_size) {
|
||||
addAllToBuffer<Alignment>(
|
||||
m * n,
|
||||
/*src_buf0=*/block_buffers[dst_block_idx + 1],
|
||||
/*src_buf1=*/block_buffers[dst_block_idx + 2],
|
||||
/*src_buf2=*/block_buffers[dst_block_idx + 3],
|
||||
/*dst_buf= */ block_buffers[dst_block_idx]);
|
||||
} else {
|
||||
// Aggregate blocks of potentially incomplete last range.
|
||||
for (int i = 1; i < rng_size; ++i) {
|
||||
addToBuffer<Alignment>(m * n,
|
||||
/*src_buf=*/block_buffers[dst_block_idx + i],
|
||||
/*dst_buf=*/block_buffers[dst_block_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
Index start = 0;
|
||||
for (Index blocks_left = num_blocks; blocks_left > 0; --blocks_left) {
|
||||
// The underlying GEMM kernel assumes that k is a multiple of packet size
|
||||
// (currently largest packet size is 16) and subtle breakage occurs if
|
||||
// this is violated.
|
||||
block_size = kmultiple * divup<Index>(k - start, kmultiple * blocks_left);
|
||||
Scalar* buf;
|
||||
if (start == 0) {
|
||||
buf = result;
|
||||
} else {
|
||||
buf = static_cast<Scalar*>(
|
||||
this->m_device.allocate(m * n * sizeof(Scalar)));
|
||||
block_buffers.push_back(buf);
|
||||
}
|
||||
Index end = start + block_size;
|
||||
if (end > k) {
|
||||
end = k;
|
||||
}
|
||||
this->m_device.enqueueNoNotification(
|
||||
[=, &process_block]() { process_block(buf, start, end); });
|
||||
start = end;
|
||||
|
||||
Barrier barrier(internal::convert_index<int>(num_blocks));
|
||||
for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
|
||||
Scalar* buf = block_idx == 0
|
||||
? result
|
||||
: static_cast<Scalar*>(
|
||||
this->m_device.allocate(buffer_size_bytes));
|
||||
block_buffers.push_back(buf);
|
||||
|
||||
Index block_start = block_idx * block_size;
|
||||
Index block_end = block_start + actual_block_size(block_idx);
|
||||
|
||||
this->m_device.enqueueNoNotification([=, &barrier, &process_block]() {
|
||||
process_block(block_idx, block_start, block_end);
|
||||
barrier.Notify();
|
||||
});
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Add other partial results into first partial result.
|
||||
for (const auto& buf : block_buffers) {
|
||||
addToBuffer<Alignment>(m * n, buf, result);
|
||||
this->m_device.deallocate(buf);
|
||||
// Aggregate partial sums from l0 ranges.
|
||||
Index l0_index = 1;
|
||||
for (; l0_index + 2 < l0_ranges; l0_index += 3) {
|
||||
addAllToBuffer<Alignment>(
|
||||
m * n,
|
||||
/*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
|
||||
/*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
|
||||
/*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
|
||||
/*dst_buf= */block_buffers[0]);
|
||||
}
|
||||
for (; l0_index < l0_ranges; ++l0_index) {
|
||||
addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
|
||||
block_buffers[0]);
|
||||
}
|
||||
|
||||
// Don't forget to deallocate ALL temporary buffers.
|
||||
for (Index i = 1; i < num_blocks; ++i) {
|
||||
this->m_device.deallocate(block_buffers[i]);
|
||||
}
|
||||
|
||||
// Finally call output kernel with finalized output buffer.
|
||||
|
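The countdown at the heart of the new scheme is easy to isolate: each l0 range starts with a pending-task counter equal to its size, every finished block decrements it, and the task whose fetch_sub returns 1 (the last one) aggregates that range. A self-contained sketch of the same pattern (plain threads standing in for the device's task queue; not Eigen code):

  #include <atomic>
  #include <thread>
  #include <vector>

  int main() {
    const int range_size = 4;              // l0_size in the patch
    std::atomic<int> pending(range_size);  // counterpart of one l0_state entry
    std::vector<std::thread> workers;
    for (int i = 0; i < range_size; ++i) {
      workers.emplace_back([&pending] {
        // ... compute this block's partial gemm result ...
        if (pending.fetch_sub(1) == 1) {
          // last finisher: fold the range's partial buffers into its first block
        }
      });
    }
    for (auto& t : workers) t.join();
  }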
@@ -258,7 +258,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
   endif()
 
-  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
+  if(( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) AND EIGEN_TEST_CXX11)
     set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
   else()
     # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
@@ -17,6 +17,8 @@
 
 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 
+#define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11
+
 using Eigen::Tensor;
 
 void test_gpu_nullary() {
@@ -617,6 +619,7 @@ void test_gpu_convolution_3d()
 }
 
 
+#if EIGEN_GPU_TEST_C99_MATH
 template <typename Scalar>
 void test_gpu_lgamma(const Scalar stddev)
 {
@@ -655,6 +658,7 @@ void test_gpu_lgamma(const Scalar stddev)
   gpuFree(d_in);
   gpuFree(d_out);
 }
+#endif
 
 template <typename Scalar>
 void test_gpu_digamma()
@@ -986,6 +990,7 @@ void test_gpu_igammac()
   gpuFree(d_out);
 }
 
+#if EIGEN_GPU_TEST_C99_MATH
 template <typename Scalar>
 void test_gpu_erf(const Scalar stddev)
 {
@@ -1063,6 +1068,7 @@ void test_gpu_erfc(const Scalar stddev)
   gpuFree(d_in);
   gpuFree(d_out);
 }
+#endif
 
 template <typename Scalar>
 void test_gpu_betainc()
@@ -1494,7 +1500,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
   CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
 #endif
 
-#if __cplusplus > 199711L
+#if EIGEN_GPU_TEST_C99_MATH
   // std::erf, std::erfc, and so on were only added in c++11. We use them
   // as a golden reference to validate the results produced by Eigen. Therefore
   // we can only run these tests if we use a c++11 compiler.