mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
Vectorize acos, asin, and atan for float.
This commit is contained in:
parent
e5af9f87f2
commit
bd393e15c3
@ -32,6 +32,24 @@ pcos<Packet8f>(const Packet8f& _x) {
|
||||
return pcos_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
|
||||
pasin<Packet8f>(const Packet8f& _x) {
|
||||
return pasin_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
|
||||
pacos<Packet8f>(const Packet8f& _x) {
|
||||
return pacos_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
|
||||
patan<Packet8f>(const Packet8f& _x) {
|
||||
return patan_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
|
||||
plog<Packet8f>(const Packet8f& _x) {
|
||||
|
@ -73,6 +73,9 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasReciprocal = EIGEN_FAST_MATH,
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasACos = 1,
|
||||
HasASin = 1,
|
||||
HasATan = 1,
|
||||
HasLog = 1,
|
||||
HasLog1p = 1,
|
||||
HasExpm1 = 1,
|
||||
|
@ -257,6 +257,24 @@ pcos<Packet16f>(const Packet16f& _x) {
|
||||
return pcos_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
|
||||
pacos<Packet16f>(const Packet16f& _x) {
|
||||
return pacos_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
|
||||
pasin<Packet16f>(const Packet16f& _x) {
|
||||
return pasin_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
|
||||
patan<Packet16f>(const Packet16f& _x) {
|
||||
return patan_float(_x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
|
||||
ptanh<Packet16f>(const Packet16f& _x) {
|
||||
|
@ -122,6 +122,9 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasBlend = 0,
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasACos = 1,
|
||||
HasASin = 1,
|
||||
HasATan = 1,
|
||||
#if EIGEN_HAS_AVX512_MATH
|
||||
HasLog = 1,
|
||||
HasLog1p = 1,
|
||||
|
@ -42,6 +42,24 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
|
||||
return pcos_float(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet4f pacos<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return pacos_float(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet4f pasin<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return pasin_float(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet4f patan<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return patan_float(_x);
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
#ifndef EIGEN_COMP_CLANG
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
|
@ -168,6 +168,9 @@ struct packet_traits<float> : default_packet_traits {
|
||||
HasAbs = 1,
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasACos = 1,
|
||||
HasASin = 1,
|
||||
HasATan = 1,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
#ifdef __VSX__
|
||||
|
@ -719,6 +719,151 @@ Packet pcos_float(const Packet& x)
|
||||
return psincos_float<false>(x);
|
||||
}
|
||||
|
||||
// Generic implementation of acos(x).
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pacos_float(const Packet& x_in) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
||||
|
||||
const Packet cst_one = pset1<Packet>(Scalar(1));
|
||||
const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
|
||||
const Packet p6 = pset1<Packet>(Scalar(2.26911413483321666717529296875e-3));
|
||||
const Packet p5 = pset1<Packet>(Scalar(-1.1063250713050365447998046875e-2));
|
||||
const Packet p4 = pset1<Packet>(Scalar(2.680264413356781005859375e-2));
|
||||
const Packet p3 = pset1<Packet>(Scalar(-4.87488098442554473876953125e-2));
|
||||
const Packet p2 = pset1<Packet>(Scalar(8.874166011810302734375e-2));
|
||||
const Packet p1 = pset1<Packet>(Scalar(-0.2145837843418121337890625));
|
||||
const Packet p0 = pset1<Packet>(Scalar(1.57079613208770751953125));
|
||||
|
||||
// For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
|
||||
// function, by a 6'th order polynomial.
|
||||
// For x in [-1:0) we use that acos(-x) = pi - acos(x).
|
||||
const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
|
||||
Packet x = pabs(x_in);
|
||||
const Packet invalid_mask = pcmp_lt(pset1<Packet>(1.0f), x);
|
||||
|
||||
// Evaluate the polynomial using Horner's rule:
|
||||
// P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
|
||||
// We evaluate even and odd terms independently to increase
|
||||
// instruction level parallelism.
|
||||
Packet x2 = pmul(x_in,x_in);
|
||||
Packet p_even = pmadd(p6, x2, p4);
|
||||
Packet p_odd = pmadd(p5, x2, p3);
|
||||
p_even = pmadd(p_even, x2, p2);
|
||||
p_odd = pmadd(p_odd, x2, p1);
|
||||
p_even = pmadd(p_even, x2, p0);
|
||||
Packet p = pmadd(p_odd, x, p_even);
|
||||
|
||||
// The polynomial approximates acos(x)/sqrt(1-x), so
|
||||
// multiply by sqrt(1-x) to get acos(x).
|
||||
Packet denom = psqrt(psub(cst_one, x));
|
||||
Packet result = pmul(denom, p);
|
||||
|
||||
// Undo mapping for negative arguments.
|
||||
result = pselect(neg_mask, psub(cst_pi, result), result);
|
||||
// Return NaN for arguments outside [-1:1].
|
||||
return pselect(invalid_mask,
|
||||
pset1<Packet>(std::numeric_limits<float>::quiet_NaN()),
|
||||
result);
|
||||
}
|
||||
|
||||
// Generic implementation of asin(x).
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pasin_float(const Packet& x_in) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
||||
|
||||
// For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
|
||||
// even terms only.
|
||||
const Packet p9 = pset1<Packet>(Scalar(5.08838854730129241943359375e-2f));
|
||||
const Packet p7 = pset1<Packet>(Scalar(3.95139865577220916748046875e-2f));
|
||||
const Packet p5 = pset1<Packet>(Scalar(7.550220191478729248046875e-2f));
|
||||
const Packet p3 = pset1<Packet>(Scalar(0.16664917767047882080078125f));
|
||||
const Packet p1 = pset1<Packet>(Scalar(1.00000011920928955078125f));
|
||||
|
||||
const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
|
||||
Packet x = pabs(x_in);
|
||||
const Packet invalid_mask = pcmp_lt(pset1<Packet>(1.0f), x);
|
||||
// For arguments |x| > 0.5, we map x back to [0:0.5] using
|
||||
// the transformation x_large = sqrt(0.5*(1-x)), and use the
|
||||
// identity
|
||||
// asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
|
||||
const Packet cst_half = pset1<Packet>(Scalar(0.5f));
|
||||
const Packet cst_two = pset1<Packet>(Scalar(2));
|
||||
Packet x_large = psqrt(pnmadd(cst_half, x, cst_half));
|
||||
const Packet large_mask = pcmp_lt(cst_half, x);
|
||||
x = pselect(large_mask, x_large, x);
|
||||
|
||||
// Compute polynomial.
|
||||
// x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9))))
|
||||
Packet x2 = pmul(x, x);
|
||||
Packet p = pmadd(p9, x2, p7);
|
||||
p = pmadd(p, x2, p5);
|
||||
p = pmadd(p, x2, p3);
|
||||
p = pmadd(p, x2, p1);
|
||||
p = pmul(p, x);
|
||||
|
||||
constexpr float kPiOverTwo = static_cast<float>(0.5*EIGEN_PI);
|
||||
Packet p_large = pnmadd(cst_two, p, pset1<Packet>(kPiOverTwo));
|
||||
p = pselect(large_mask, p_large, p);
|
||||
// Flip the sign for negative arguments.
|
||||
p = pselect(neg_mask, pnegate(p), p);
|
||||
|
||||
// Return NaN for arguments outside [-1:1].
|
||||
return pselect(invalid_mask, pset1<Packet>(std::numeric_limits<float>::quiet_NaN()), p);
|
||||
}
|
||||
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet patan_float(const Packet& x_in) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
||||
|
||||
const Packet cst_one = pset1<Packet>(1.0f);
|
||||
constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI/2);
|
||||
const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
|
||||
constexpr float kPiOverFour = static_cast<float>(EIGEN_PI/4);
|
||||
const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
|
||||
const Packet cst_large = pset1<Packet>(2.4142135623730950488016887f); // tan(3*pi/8);
|
||||
const Packet cst_medium = pset1<Packet>(0.4142135623730950488016887f); // tan(pi/8);
|
||||
const Packet q0 = pset1<Packet>(-0.333329379558563232421875f);
|
||||
const Packet q2 = pset1<Packet>(0.19977366924285888671875f);
|
||||
const Packet q4 = pset1<Packet>(-0.13874518871307373046875f);
|
||||
const Packet q6 = pset1<Packet>(8.044691383838653564453125e-2f);
|
||||
|
||||
const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
|
||||
Packet x = pabs(x_in);
|
||||
|
||||
// Use the same range reduction strategy (to [0:tan(pi/8)]) as the
|
||||
// Cephes library:
|
||||
// "Large": For x >= tan(3*pi/8), use atan(1/x) = pi/2 - atan(x).
|
||||
// "Medium": For x in [tan(pi/8) : tan(3*pi/8)),
|
||||
// use atan(x) = pi/4 + atan((x-1)/(x+1)).
|
||||
// "Small": For x < pi/8, approximate atan(x) directly by a polynomial
|
||||
// calculated using Sollya.
|
||||
const Packet large_mask = pcmp_lt(cst_large, x);
|
||||
x = pselect(large_mask, preciprocal(x), x);
|
||||
const Packet medium_mask = pandnot(pcmp_lt(cst_medium, x), large_mask);
|
||||
x = pselect(medium_mask, pdiv(psub(x, cst_one), padd(x, cst_one)), x);
|
||||
|
||||
// Approximate atan(x) on [0:tan(pi/8)] by a polynomial of the form
|
||||
// P(x) = x + x^3 * Q(x^2),
|
||||
// where Q(x^2) is a cubic polynomial in x^2.
|
||||
const Packet x2 = pmul(x, x);
|
||||
const Packet x4 = pmul(x2, x2);
|
||||
Packet q_odd = pmadd(q6, x4, q2);
|
||||
Packet q_even = pmadd(q4, x4, q0);
|
||||
const Packet q = pmadd(q_odd, x2, q_even);
|
||||
Packet p = pmadd(q, pmul(x, x2), x);
|
||||
|
||||
// Apply transformations according to the range reduction masks.
|
||||
p = pselect(large_mask, psub(cst_pi_over_two, p), p);
|
||||
p = pselect(medium_mask, padd(cst_pi_over_four, p), p);
|
||||
return pselect(neg_mask, pnegate(p), p);
|
||||
}
|
||||
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pdiv_complex(const Packet& x, const Packet& y) {
|
||||
|
@ -89,6 +89,21 @@ template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pcos_float(const Packet& x);
|
||||
|
||||
/** \internal \returns asin(x) for single precision float */
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pasin_float(const Packet& x);
|
||||
|
||||
/** \internal \returns acos(x) for single precision float */
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pacos_float(const Packet& x);
|
||||
|
||||
/** \internal \returns atan(x) for single precision float */
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet patan_float(const Packet& x);
|
||||
|
||||
/** \internal \returns sqrt(x) for complex types */
|
||||
template<typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
|
@ -34,6 +34,21 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pcos<Pac
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos<Packet4f>(const Packet4f& x)
|
||||
{ return pcos_float(x); }
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pacos<Packet2f>(const Packet2f& x)
|
||||
{ return pacos_float(x); }
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pacos<Packet4f>(const Packet4f& x)
|
||||
{ return pacos_float(x); }
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pasin<Packet2f>(const Packet2f& x)
|
||||
{ return pasin_float(x); }
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pasin<Packet4f>(const Packet4f& x)
|
||||
{ return pasin_float(x); }
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f patan<Packet2f>(const Packet2f& x)
|
||||
{ return patan_float(x); }
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f patan<Packet4f>(const Packet4f& x)
|
||||
{ return patan_float(x); }
|
||||
|
||||
// Hyperbolic Tangent function.
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f ptanh<Packet2f>(const Packet2f& x)
|
||||
{ return internal::generic_fast_tanh_float(x); }
|
||||
|
@ -198,6 +198,9 @@ struct packet_traits<float> : default_packet_traits
|
||||
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasACos = 1,
|
||||
HasASin = 1,
|
||||
HasATan = 1,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
|
@ -75,6 +75,24 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
|
||||
return pcos_float(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet4f pacos<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return pacos_float(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet4f pasin<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return pasin_float(_x);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet4f patan<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
return patan_float(_x);
|
||||
}
|
||||
|
||||
// Notice that for newer processors, it is counterproductive to use Newton
|
||||
// iteration for square root. In particular, Skylake and Zen2 processors
|
||||
// have approximately doubled throughput of the _mm_sqrt_ps instruction
|
||||
|
@ -139,6 +139,9 @@ struct packet_traits<float> : default_packet_traits {
|
||||
HasReciprocal = EIGEN_FAST_MATH,
|
||||
HasSin = EIGEN_FAST_MATH,
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasACos = 1,
|
||||
HasASin = 1,
|
||||
HasATan = 1,
|
||||
HasLog = 1,
|
||||
HasLog1p = 1,
|
||||
HasExpm1 = 1,
|
||||
|
Loading…
x
Reference in New Issue
Block a user