mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-19 16:19:37 +08:00
Use ppolevl for polynomial evaluation in more places.
This commit is contained in:
parent
a097f728fe
commit
74dcfbbd0f
@ -42,6 +42,134 @@ struct make_integer<bfloat16> {
|
||||
typedef numext::int16_t type;
|
||||
};
|
||||
|
||||
/* polevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate polynomial
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N+1];
|
||||
*
|
||||
* y = polevl<decltype(x), N>( x, coef);
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates polynomial of degree N:
|
||||
*
|
||||
* 2 N
|
||||
* y = C + C x + C x +...+ C x
|
||||
* 0 1 2 N
|
||||
*
|
||||
* Coefficients are stored in reverse order:
|
||||
*
|
||||
* coef[0] = C , ..., coef[N] = C .
|
||||
* N 0
|
||||
*
|
||||
* The function p1evl() assumes that coef[N] = 1.0 and is
|
||||
* omitted from the array. Its calling arguments are
|
||||
* otherwise the same as polevl().
|
||||
*
|
||||
*
|
||||
* The Eigen implementation is templatized. For best speed, store
|
||||
* coef as a const array (constexpr), e.g.
|
||||
*
|
||||
* const double coef[] = {1.0, 2.0, 3.0, ...};
|
||||
*
|
||||
*/
|
||||
template <typename Packet, int N>
|
||||
struct ppolevl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
|
||||
const typename unpacket_traits<Packet>::type coeff[]) {
|
||||
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Packet>
|
||||
struct ppolevl<Packet, 0> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
|
||||
const typename unpacket_traits<Packet>::type coeff[]) {
|
||||
EIGEN_UNUSED_VARIABLE(x);
|
||||
return pset1<Packet>(coeff[0]);
|
||||
}
|
||||
};
|
||||
|
||||
/* chbevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate Chebyshev series
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N], chebevl();
|
||||
*
|
||||
* y = chbevl( x, coef, N );
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates the series
|
||||
*
|
||||
* N-1
|
||||
* - '
|
||||
* y = > coef[i] T (x/2)
|
||||
* - i
|
||||
* i=0
|
||||
*
|
||||
* of Chebyshev polynomials Ti at argument x/2.
|
||||
*
|
||||
* Coefficients are stored in reverse order, i.e. the zero
|
||||
* order term is last in the array. Note N is the number of
|
||||
* coefficients, not the order.
|
||||
*
|
||||
* If coefficients are for the interval a to b, x must
|
||||
* have been transformed to x -> 2(2x - b - a)/(b-a) before
|
||||
* entering the routine. This maps x from (a, b) to (-1, 1),
|
||||
* over which the Chebyshev polynomials are defined.
|
||||
*
|
||||
* If the coefficients are for the inverted interval, in
|
||||
* which (a, b) is mapped to (1/b, 1/a), the transformation
|
||||
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
|
||||
* this becomes x -> 4a/x - 1.
|
||||
*
|
||||
*
|
||||
*
|
||||
* SPEED:
|
||||
*
|
||||
* Taking advantage of the recurrence properties of the
|
||||
* Chebyshev polynomials, the routine requires one more
|
||||
* addition per loop than evaluating a nested polynomial of
|
||||
* the same degree.
|
||||
*
|
||||
*/
|
||||
|
||||
template <typename Packet, int N>
|
||||
struct pchebevl {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
|
||||
const typename unpacket_traits<Packet>::type coef[]) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
Packet b0 = pset1<Packet>(coef[0]);
|
||||
Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
|
||||
Packet b2;
|
||||
|
||||
for (int i = 1; i < N; i++) {
|
||||
b2 = b1;
|
||||
b1 = b0;
|
||||
b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
|
||||
}
|
||||
|
||||
return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
@ -193,21 +321,14 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const
|
||||
e = psub(e, pand(cst_1, mask));
|
||||
x = padd(x, tmp);
|
||||
|
||||
// Polynomial coefficients for rational (3,3) r(x) = p(x)/q(x)
|
||||
// Polynomial coefficients for rational r(x) = p(x)/q(x)
|
||||
// approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
|
||||
const Packet cst_p1 = pset1<Packet>(1.0000000190281136f);
|
||||
const Packet cst_p2 = pset1<Packet>(1.0000000190281063f);
|
||||
const Packet cst_p3 = pset1<Packet>(0.18256296349849254f);
|
||||
const Packet cst_q1 = pset1<Packet>(1.4999999999999927f);
|
||||
const Packet cst_q2 = pset1<Packet>(0.59923249590823520f);
|
||||
const Packet cst_q3 = pset1<Packet>(0.049616247954120038f);
|
||||
constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
|
||||
constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
|
||||
|
||||
Packet p = pmadd(x, cst_p3, cst_p2);
|
||||
p = pmadd(x, p, cst_p1);
|
||||
Packet p = ppolevl<Packet, 2>::run(x, alpha);
|
||||
p = pmul(x, p);
|
||||
Packet q = pmadd(x, cst_q3, cst_q2);
|
||||
q = pmadd(x, q, cst_q1);
|
||||
q = pmadd(x, q, cst_1);
|
||||
Packet q = ppolevl<Packet, 3>::run(x, beta);
|
||||
x = pdiv(p, q);
|
||||
|
||||
// Add the logarithm of the exponent back to the result of the interpolation.
|
||||
@ -919,13 +1040,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Pac
|
||||
const Packet cst_one = pset1<Packet>(1.0f);
|
||||
const Packet cst_two = pset1<Packet>(2.0f);
|
||||
const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
|
||||
// For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
|
||||
// even terms only.
|
||||
const Packet p9 = pset1<Packet>(5.08838854730129241943359375e-2f);
|
||||
const Packet p7 = pset1<Packet>(3.95139865577220916748046875e-2f);
|
||||
const Packet p5 = pset1<Packet>(7.550220191478729248046875e-2f);
|
||||
const Packet p3 = pset1<Packet>(0.16664917767047882080078125f);
|
||||
const Packet p1 = pset1<Packet>(1.00000011920928955078125f);
|
||||
|
||||
const Packet abs_x = pabs(x_in);
|
||||
const Packet sign_mask = pandnot(x_in, abs_x);
|
||||
@ -941,13 +1055,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Pac
|
||||
const Packet x = pselect(large_mask, x_large, abs_x);
|
||||
const Packet x2 = pmul(x, x);
|
||||
|
||||
// Compute polynomial.
|
||||
// x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9))))
|
||||
|
||||
Packet p = pmadd(p9, x2, p7);
|
||||
p = pmadd(p, x2, p5);
|
||||
p = pmadd(p, x2, p3);
|
||||
p = pmadd(p, x2, p1);
|
||||
// For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
|
||||
// even terms only.
|
||||
constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
|
||||
7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
|
||||
Packet p = ppolevl<Packet, 4>::run(x2, alpha);
|
||||
p = pmul(p, x);
|
||||
|
||||
const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
|
||||
@ -967,34 +1079,17 @@ struct patan_reduced {
|
||||
template <>
|
||||
template <typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
|
||||
const Packet p1 = pset1<Packet>(3.3004361289279920e-01);
|
||||
const Packet p3 = pset1<Packet>(8.2704055405494614e-01);
|
||||
const Packet p5 = pset1<Packet>(7.5365702534987022e-01);
|
||||
const Packet p7 = pset1<Packet>(3.0409318473444424e-01);
|
||||
const Packet p9 = pset1<Packet>(5.2574296781008604e-02);
|
||||
const Packet p11 = pset1<Packet>(3.0917513112462781e-03);
|
||||
const Packet p13 = pset1<Packet>(2.6667153866462208e-05);
|
||||
const Packet q0 = p1;
|
||||
const Packet q2 = pset1<Packet>(9.3705509168587852e-01);
|
||||
const Packet q4 = pset1<Packet>(1.0);
|
||||
const Packet q6 = pset1<Packet>(4.9716458728465573e-01);
|
||||
const Packet q8 = pset1<Packet>(1.1548932646420353e-01);
|
||||
const Packet q10 = pset1<Packet>(1.0899150928962708e-02);
|
||||
const Packet q12 = pset1<Packet>(2.7311202462436667e-04);
|
||||
constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
|
||||
3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
|
||||
3.3004361289279920e-01};
|
||||
|
||||
constexpr double beta[] = {
|
||||
2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
|
||||
9.3705509168587852e-01, 3.3004361289279920e-01};
|
||||
|
||||
Packet x2 = pmul(x, x);
|
||||
Packet p = pmadd(p13, x2, p11);
|
||||
Packet q = pmadd(q12, x2, q10);
|
||||
p = pmadd(p, x2, p9);
|
||||
q = pmadd(q, x2, q8);
|
||||
p = pmadd(p, x2, p7);
|
||||
q = pmadd(q, x2, q6);
|
||||
p = pmadd(p, x2, p5);
|
||||
q = pmadd(q, x2, q4);
|
||||
p = pmadd(p, x2, p3);
|
||||
q = pmadd(q, x2, q2);
|
||||
p = pmadd(p, x2, p1);
|
||||
q = pmadd(q, x2, q0);
|
||||
Packet p = ppolevl<Packet, 6>::run(x2, alpha);
|
||||
Packet q = ppolevl<Packet, 6>::run(x2, beta);
|
||||
return pmul(x, pdiv(p, q));
|
||||
}
|
||||
|
||||
@ -1002,20 +1097,14 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>
|
||||
template <>
|
||||
template <typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
|
||||
const Packet p1 = pset1<Packet>(8.109951019287109375e-01f);
|
||||
const Packet p3 = pset1<Packet>(7.296695709228515625e-01f);
|
||||
const Packet p5 = pset1<Packet>(1.12026982009410858154296875e-01f);
|
||||
const Packet q0 = p1;
|
||||
const Packet q2 = pset1<Packet>(1.0f);
|
||||
const Packet q4 = pset1<Packet>(2.8318560123443603515625e-01f);
|
||||
const Packet q6 = pset1<Packet>(1.00917108356952667236328125e-02f);
|
||||
constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
|
||||
|
||||
constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
|
||||
8.109951019287109375e-01f};
|
||||
|
||||
Packet x2 = pmul(x, x);
|
||||
Packet p = pmadd(p5, x2, p3);
|
||||
Packet q = pmadd(q6, x2, q4);
|
||||
p = pmadd(p, x2, p1);
|
||||
q = pmadd(q, x2, q2);
|
||||
q = pmadd(q, x2, q0);
|
||||
Packet p = ppolevl<Packet, 2>::run(x2, alpha);
|
||||
Packet q = ppolevl<Packet, 3>::run(x2, beta);
|
||||
return pmul(x, pdiv(p, q));
|
||||
}
|
||||
|
||||
@ -1076,34 +1165,20 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x)
|
||||
// --output=tanhf.sollya --dispCoeff="dec"
|
||||
|
||||
// The monomial coefficients of the numerator polynomial (odd).
|
||||
const T alpha_3 = pset1<T>(1.340216100e-1f);
|
||||
const T alpha_5 = pset1<T>(3.520756727e-3f);
|
||||
const T alpha_7 = pset1<T>(2.102733560e-5f);
|
||||
const T alpha_9 = pset1<T>(1.394553628e-8f);
|
||||
constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
|
||||
|
||||
// The monomial coefficients of the denominator polynomial (even).
|
||||
const T beta_0 = pset1<T>(1.0f);
|
||||
const T beta_2 = pset1<T>(4.673548340e-1f);
|
||||
const T beta_4 = pset1<T>(2.597254514e-2f);
|
||||
const T beta_6 = pset1<T>(3.326951409e-4f);
|
||||
const T beta_8 = pset1<T>(8.015776984e-7f);
|
||||
constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
|
||||
|
||||
// Since the polynomials are odd/even, we need x^2.
|
||||
const T x2 = pmul(x, x);
|
||||
const T x3 = pmul(x2, x);
|
||||
|
||||
// Interleave the evaluation of the numerator polynomial p and
|
||||
// denominator polynomial q.
|
||||
T p = pmadd(x2, alpha_9, alpha_7);
|
||||
T q = pmadd(x2, beta_8, beta_6);
|
||||
p = pmadd(x2, p, alpha_5);
|
||||
q = pmadd(x2, q, beta_4);
|
||||
p = pmadd(x2, p, alpha_3);
|
||||
q = pmadd(x2, q, beta_2);
|
||||
// Take advantage of the fact that alpha_1 = 1 to compute
|
||||
// x*(x^2*p + alpha_1) = x^3 * p + x.
|
||||
T p = ppolevl<T, 3>::run(x2, alpha);
|
||||
T q = ppolevl<T, 4>::run(x2, beta);
|
||||
// Take advantage of the fact that the constant term in p is 1 to compute
|
||||
// x*(x^2*p + 1) = x^3 * p + x.
|
||||
p = pmadd(x3, p, x);
|
||||
q = pmadd(x2, q, beta_0);
|
||||
|
||||
// Divide the numerator by the denominator.
|
||||
return pdiv(p, q);
|
||||
@ -1141,27 +1216,16 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x)
|
||||
// --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
|
||||
|
||||
// The monomial coefficients of the numerator polynomial (odd).
|
||||
const T alpha_3 = pset1<T>(1.5184719640284322e-01);
|
||||
const T alpha_5 = pset1<T>(5.9809711724441161e-03);
|
||||
const T alpha_7 = pset1<T>(9.3839087674268880e-05);
|
||||
const T alpha_9 = pset1<T>(6.8644367682497074e-07);
|
||||
const T alpha_11 = pset1<T>(2.4618379131293676e-09);
|
||||
const T alpha_13 = pset1<T>(4.2303918148209176e-12);
|
||||
const T alpha_15 = pset1<T>(3.1309488231386680e-15);
|
||||
const T alpha_17 = pset1<T>(7.6534862268749319e-19);
|
||||
const T alpha_19 = pset1<T>(2.6158007860482230e-23);
|
||||
constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
|
||||
4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
|
||||
9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
|
||||
|
||||
// The monomial coefficients of the denominator polynomial (even).
|
||||
const T beta_0 = pset1<T>(1.0);
|
||||
const T beta_2 = pset1<T>(4.851805297361760360e-01);
|
||||
const T beta_4 = pset1<T>(3.437448108450402717e-02);
|
||||
const T beta_6 = pset1<T>(8.295161192716231542e-04);
|
||||
const T beta_8 = pset1<T>(8.785185266237658698e-06);
|
||||
const T beta_10 = pset1<T>(4.492975677839633985e-08);
|
||||
const T beta_12 = pset1<T>(1.123643448069621992e-10);
|
||||
const T beta_14 = pset1<T>(1.293019623712687916e-13);
|
||||
const T beta_16 = pset1<T>(5.782506856739003571e-17);
|
||||
const T beta_18 = pset1<T>(6.463747022670968018e-21);
|
||||
constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
|
||||
1.293019623712687916e-13, 1.123643448069621992e-10,
|
||||
4.492975677839633985e-08, 8.785185266237658698e-06,
|
||||
8.295161192716231542e-04, 3.437448108450402717e-02,
|
||||
4.851805297361760360e-01, 1.0};
|
||||
|
||||
// Since the polynomials are odd/even, we need x^2.
|
||||
const T x2 = pmul(x, x);
|
||||
@ -1169,26 +1233,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x)
|
||||
|
||||
// Interleave the evaluation of the numerator polynomial p and
|
||||
// denominator polynomial q.
|
||||
T p = pmadd(x2, alpha_19, alpha_17);
|
||||
T q = pmadd(x2, beta_18, beta_16);
|
||||
p = pmadd(x2, p, alpha_15);
|
||||
q = pmadd(x2, q, beta_14);
|
||||
p = pmadd(x2, p, alpha_13);
|
||||
q = pmadd(x2, q, beta_12);
|
||||
p = pmadd(x2, p, alpha_11);
|
||||
q = pmadd(x2, q, beta_10);
|
||||
p = pmadd(x2, p, alpha_9);
|
||||
q = pmadd(x2, q, beta_8);
|
||||
p = pmadd(x2, p, alpha_7);
|
||||
q = pmadd(x2, q, beta_6);
|
||||
p = pmadd(x2, p, alpha_5);
|
||||
q = pmadd(x2, q, beta_4);
|
||||
p = pmadd(x2, p, alpha_3);
|
||||
q = pmadd(x2, q, beta_2);
|
||||
// Take advantage of the fact that alpha_1 = 1 to compute
|
||||
// x*(x^2*p + alpha_1) = x^3 * p + x.
|
||||
T p = ppolevl<T, 8>::run(x2, alpha);
|
||||
T q = ppolevl<T, 9>::run(x2, beta);
|
||||
// Take advantage of the fact that the constant term in p is 1 to compute
|
||||
// x*(x^2*p + 1) = x^3 * p + x.
|
||||
p = pmadd(x3, p, x);
|
||||
q = pmadd(x2, q, beta_0);
|
||||
|
||||
// Divide the numerator by the denominator.
|
||||
return pdiv(p, q);
|
||||
@ -1200,18 +1249,13 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Pa
|
||||
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
|
||||
|
||||
// For |x| in [0:0.5] we use a polynomial approximation of the form
|
||||
// P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )).
|
||||
const Packet C3 = pset1<Packet>(0.3333373963832855224609375f);
|
||||
const Packet C5 = pset1<Packet>(0.1997792422771453857421875f);
|
||||
const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
|
||||
const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
|
||||
const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
|
||||
// P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
|
||||
constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
|
||||
0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
|
||||
const Packet x2 = pmul(x, x);
|
||||
Packet p = pmadd(C11, x2, C9);
|
||||
p = pmadd(x2, p, C7);
|
||||
p = pmadd(x2, p, C5);
|
||||
p = pmadd(x2, p, C3);
|
||||
p = pmadd(pmul(x, x2), p, x);
|
||||
const Packet x3 = pmul(x, x2);
|
||||
Packet p = ppolevl<Packet, 4>::run(x2, alpha);
|
||||
p = pmadd(x3, p, x);
|
||||
|
||||
// For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
|
||||
const Packet half = pset1<Packet>(0.5f);
|
||||
@ -1234,30 +1278,16 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const P
|
||||
static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
|
||||
// For x in [-0.5:0.5] we use a rational approximation of the form
|
||||
// R(x) = x + x^3*P(x^2)/Q(x^2), where P is or order 4 and Q is of order 5.
|
||||
const Packet p0 = pset1<Packet>(1.2306328729812676e-01);
|
||||
const Packet p2 = pset1<Packet>(-2.5949536095445679e-01);
|
||||
const Packet p4 = pset1<Packet>(1.8185306179826699e-01);
|
||||
const Packet p6 = pset1<Packet>(-4.7129526768798737e-02);
|
||||
const Packet p8 = pset1<Packet>(3.3071338469301391e-03);
|
||||
constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
|
||||
-2.5949536095445679e-01, 1.2306328729812676e-01};
|
||||
|
||||
const Packet q0 = pset1<Packet>(3.6918986189438030e-01);
|
||||
const Packet q2 = pset1<Packet>(-1.0000000000000000e+00);
|
||||
const Packet q4 = pset1<Packet>(9.8733495886883648e-01);
|
||||
const Packet q6 = pset1<Packet>(-4.2828141436397615e-01);
|
||||
const Packet q8 = pset1<Packet>(7.6391885763341910e-02);
|
||||
const Packet q10 = pset1<Packet>(-3.8679974580640881e-03);
|
||||
constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02, -4.2828141436397615e-01,
|
||||
9.8733495886883648e-01, -1.0000000000000000e+00, 3.6918986189438030e-01};
|
||||
|
||||
const Packet x2 = pmul(x, x);
|
||||
const Packet x3 = pmul(x, x2);
|
||||
Packet q = pmadd(q10, x2, q8);
|
||||
Packet p = pmadd(p8, x2, p6);
|
||||
q = pmadd(x2, q, q6);
|
||||
p = pmadd(x2, p, p4);
|
||||
q = pmadd(x2, q, q4);
|
||||
p = pmadd(x2, p, p2);
|
||||
q = pmadd(x2, q, q2);
|
||||
p = pmadd(x2, p, p0);
|
||||
q = pmadd(x2, q, q0);
|
||||
Packet p = ppolevl<Packet, 4>::run(x2, alpha);
|
||||
Packet q = ppolevl<Packet, 5>::run(x2, beta);
|
||||
Packet y_small = pmadd(x3, pdiv(p, q), x);
|
||||
|
||||
// For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
|
||||
@ -2200,134 +2230,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Pac
|
||||
pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))));
|
||||
}
|
||||
|
||||
/* polevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate polynomial
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N+1];
|
||||
*
|
||||
* y = polevl<decltype(x), N>( x, coef);
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates polynomial of degree N:
|
||||
*
|
||||
* 2 N
|
||||
* y = C + C x + C x +...+ C x
|
||||
* 0 1 2 N
|
||||
*
|
||||
* Coefficients are stored in reverse order:
|
||||
*
|
||||
* coef[0] = C , ..., coef[N] = C .
|
||||
* N 0
|
||||
*
|
||||
* The function p1evl() assumes that coef[N] = 1.0 and is
|
||||
* omitted from the array. Its calling arguments are
|
||||
* otherwise the same as polevl().
|
||||
*
|
||||
*
|
||||
* The Eigen implementation is templatized. For best speed, store
|
||||
* coef as a const array (constexpr), e.g.
|
||||
*
|
||||
* const double coef[] = {1.0, 2.0, 3.0, ...};
|
||||
*
|
||||
*/
|
||||
template <typename Packet, int N>
|
||||
struct ppolevl {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
|
||||
const typename unpacket_traits<Packet>::type coeff[]) {
|
||||
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Packet>
|
||||
struct ppolevl<Packet, 0> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
|
||||
const typename unpacket_traits<Packet>::type coeff[]) {
|
||||
EIGEN_UNUSED_VARIABLE(x);
|
||||
return pset1<Packet>(coeff[0]);
|
||||
}
|
||||
};
|
||||
|
||||
/* chbevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate Chebyshev series
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N], chebevl();
|
||||
*
|
||||
* y = chbevl( x, coef, N );
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates the series
|
||||
*
|
||||
* N-1
|
||||
* - '
|
||||
* y = > coef[i] T (x/2)
|
||||
* - i
|
||||
* i=0
|
||||
*
|
||||
* of Chebyshev polynomials Ti at argument x/2.
|
||||
*
|
||||
* Coefficients are stored in reverse order, i.e. the zero
|
||||
* order term is last in the array. Note N is the number of
|
||||
* coefficients, not the order.
|
||||
*
|
||||
* If coefficients are for the interval a to b, x must
|
||||
* have been transformed to x -> 2(2x - b - a)/(b-a) before
|
||||
* entering the routine. This maps x from (a, b) to (-1, 1),
|
||||
* over which the Chebyshev polynomials are defined.
|
||||
*
|
||||
* If the coefficients are for the inverted interval, in
|
||||
* which (a, b) is mapped to (1/b, 1/a), the transformation
|
||||
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
|
||||
* this becomes x -> 4a/x - 1.
|
||||
*
|
||||
*
|
||||
*
|
||||
* SPEED:
|
||||
*
|
||||
* Taking advantage of the recurrence properties of the
|
||||
* Chebyshev polynomials, the routine requires one more
|
||||
* addition per loop than evaluating a nested polynomial of
|
||||
* the same degree.
|
||||
*
|
||||
*/
|
||||
|
||||
template <typename Packet, int N>
|
||||
struct pchebevl {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
|
||||
const typename unpacket_traits<Packet>::type coef[]) {
|
||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||
Packet b0 = pset1<Packet>(coef[0]);
|
||||
Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
|
||||
Packet b2;
|
||||
|
||||
for (int i = 1; i < N; i++) {
|
||||
b2 = b1;
|
||||
b1 = b0;
|
||||
b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
|
||||
}
|
||||
|
||||
return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
|
||||
}
|
||||
};
|
||||
|
||||
namespace unary_pow {
|
||||
|
||||
template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
|
||||
|
@ -131,8 +131,8 @@ struct digamma_impl_maybe_poly {
|
||||
template <>
|
||||
struct digamma_impl_maybe_poly<float> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float s) {
|
||||
const float A[] = {-4.16666666666666666667E-3f, 3.96825396825396825397E-3f, -8.33333333333333333333E-3f,
|
||||
8.33333333333333333333E-2f};
|
||||
constexpr float A[] = {-4.16666666666666666667E-3f, 3.96825396825396825397E-3f, -8.33333333333333333333E-3f,
|
||||
8.33333333333333333333E-2f};
|
||||
|
||||
float z;
|
||||
if (s < 1.0e8f) {
|
||||
@ -146,9 +146,9 @@ struct digamma_impl_maybe_poly<float> {
|
||||
template <>
|
||||
struct digamma_impl_maybe_poly<double> {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double s) {
|
||||
const double A[] = {8.33333333333333333333E-2, -2.10927960927960927961E-2, 7.57575757575757575758E-3,
|
||||
-4.16666666666666666667E-3, 3.96825396825396825397E-3, -8.33333333333333333333E-3,
|
||||
8.33333333333333333333E-2};
|
||||
constexpr double A[] = {8.33333333333333333333E-2, -2.10927960927960927961E-2, 7.57575757575757575758E-3,
|
||||
-4.16666666666666666667E-3, 3.96825396825396825397E-3, -8.33333333333333333333E-3,
|
||||
8.33333333333333333333E-2};
|
||||
|
||||
double z;
|
||||
if (s < 1.0e17) {
|
||||
@ -282,20 +282,14 @@ struct digamma_impl {
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
|
||||
// The monomial coefficients of the numerator polynomial (odd).
|
||||
const T alpha_1 = pset1<T>(1.128379106521606445312500000000000000e+00f);
|
||||
const T alpha_3 = pset1<T>(1.874160766601562500000000000000000000e-01f);
|
||||
const T alpha_5 = pset1<T>(5.243302136659622192382812500000000000e-02f);
|
||||
const T alpha_7 = pset1<T>(3.658048342913389205932617187500000000e-03f);
|
||||
const T alpha_9 = pset1<T>(2.861979592125862836837768554687500000e-04f);
|
||||
const T alpha_11 = pset1<T>(2.123732201653183437883853912353515625e-06f);
|
||||
constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f,
|
||||
3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f,
|
||||
1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f};
|
||||
|
||||
// The monomial coefficients of the denominator polynomial (even).
|
||||
const T beta_0 = pset1<T>(1.0f);
|
||||
const T beta_2 = pset1<T>(4.99425798654556274414062500000e-01f);
|
||||
const T beta_4 = pset1<T>(1.12945675849914550781250000000e-01f);
|
||||
const T beta_6 = pset1<T>(1.47520881146192550659179687500e-02f);
|
||||
const T beta_8 = pset1<T>(1.14329601638019084930419921875e-03f);
|
||||
const T beta_10 = pset1<T>(3.89185734093189239501953125000e-05f);
|
||||
constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f,
|
||||
1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f,
|
||||
4.99425798654556274414062500000e-01f, 1.0f};
|
||||
|
||||
// Since the polynomials are odd/even, we need x^2.
|
||||
// Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid
|
||||
@ -303,19 +297,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
|
||||
const T x2 = pmin(pset1<T>(16.0f), pmul(x, x));
|
||||
|
||||
// Evaluate the numerator polynomial p.
|
||||
T p = pmadd(x2, alpha_11, alpha_9);
|
||||
p = pmadd(x2, p, alpha_7);
|
||||
p = pmadd(x2, p, alpha_5);
|
||||
p = pmadd(x2, p, alpha_3);
|
||||
p = pmadd(x2, p, alpha_1);
|
||||
T p = ppolevl<T, 5>::run(x2, alpha);
|
||||
p = pmul(x, p);
|
||||
|
||||
// Evaluate the denominator polynomial p.
|
||||
T q = pmadd(x2, beta_10, beta_8);
|
||||
q = pmadd(x2, q, beta_6);
|
||||
q = pmadd(x2, q, beta_4);
|
||||
q = pmadd(x2, q, beta_2);
|
||||
q = pmadd(x2, q, beta_0);
|
||||
T q = ppolevl<T, 5>::run(x2, beta);
|
||||
const T r = pdiv(p, q);
|
||||
|
||||
// Clamp to [-1:1].
|
||||
@ -475,18 +461,18 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(const float& should_
|
||||
|
||||
template <typename T, typename ScalarType>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
|
||||
const ScalarType p0[] = {ScalarType(-5.99633501014107895267e1), ScalarType(9.80010754185999661536e1),
|
||||
ScalarType(-5.66762857469070293439e1), ScalarType(1.39312609387279679503e1),
|
||||
ScalarType(-1.23916583867381258016e0)};
|
||||
const ScalarType q0[] = {ScalarType(1.0),
|
||||
ScalarType(1.95448858338141759834e0),
|
||||
ScalarType(4.67627912898881538453e0),
|
||||
ScalarType(8.63602421390890590575e1),
|
||||
ScalarType(-2.25462687854119370527e2),
|
||||
ScalarType(2.00260212380060660359e2),
|
||||
ScalarType(-8.20372256168333339912e1),
|
||||
ScalarType(1.59056225126211695515e1),
|
||||
ScalarType(-1.18331621121330003142e0)};
|
||||
constexpr ScalarType p0[] = {ScalarType(-5.99633501014107895267e1), ScalarType(9.80010754185999661536e1),
|
||||
ScalarType(-5.66762857469070293439e1), ScalarType(1.39312609387279679503e1),
|
||||
ScalarType(-1.23916583867381258016e0)};
|
||||
constexpr ScalarType q0[] = {ScalarType(1.0),
|
||||
ScalarType(1.95448858338141759834e0),
|
||||
ScalarType(4.67627912898881538453e0),
|
||||
ScalarType(8.63602421390890590575e1),
|
||||
ScalarType(-2.25462687854119370527e2),
|
||||
ScalarType(2.00260212380060660359e2),
|
||||
ScalarType(-8.20372256168333339912e1),
|
||||
ScalarType(1.59056225126211695515e1),
|
||||
ScalarType(-1.18331621121330003142e0)};
|
||||
const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
|
||||
const T half = pset1<T>(ScalarType(0.5));
|
||||
T c, c2, ndtri_gt_exp_neg_two;
|
||||
@ -503,20 +489,20 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(const T& b,
|
||||
/* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
|
||||
* i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
|
||||
*/
|
||||
const ScalarType p1[] = {ScalarType(4.05544892305962419923e0), ScalarType(3.15251094599893866154e1),
|
||||
ScalarType(5.71628192246421288162e1), ScalarType(4.40805073893200834700e1),
|
||||
ScalarType(1.46849561928858024014e1), ScalarType(2.18663306850790267539e0),
|
||||
ScalarType(-1.40256079171354495875e-1), ScalarType(-3.50424626827848203418e-2),
|
||||
ScalarType(-8.57456785154685413611e-4)};
|
||||
const ScalarType q1[] = {ScalarType(1.0),
|
||||
ScalarType(1.57799883256466749731e1),
|
||||
ScalarType(4.53907635128879210584e1),
|
||||
ScalarType(4.13172038254672030440e1),
|
||||
ScalarType(1.50425385692907503408e1),
|
||||
ScalarType(2.50464946208309415979e0),
|
||||
ScalarType(-1.42182922854787788574e-1),
|
||||
ScalarType(-3.80806407691578277194e-2),
|
||||
ScalarType(-9.33259480895457427372e-4)};
|
||||
constexpr ScalarType p1[] = {ScalarType(4.05544892305962419923e0), ScalarType(3.15251094599893866154e1),
|
||||
ScalarType(5.71628192246421288162e1), ScalarType(4.40805073893200834700e1),
|
||||
ScalarType(1.46849561928858024014e1), ScalarType(2.18663306850790267539e0),
|
||||
ScalarType(-1.40256079171354495875e-1), ScalarType(-3.50424626827848203418e-2),
|
||||
ScalarType(-8.57456785154685413611e-4)};
|
||||
constexpr ScalarType q1[] = {ScalarType(1.0),
|
||||
ScalarType(1.57799883256466749731e1),
|
||||
ScalarType(4.53907635128879210584e1),
|
||||
ScalarType(4.13172038254672030440e1),
|
||||
ScalarType(1.50425385692907503408e1),
|
||||
ScalarType(2.50464946208309415979e0),
|
||||
ScalarType(-1.42182922854787788574e-1),
|
||||
ScalarType(-3.80806407691578277194e-2),
|
||||
ScalarType(-9.33259480895457427372e-4)};
|
||||
/* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
|
||||
* i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
|
||||
*/
|
||||
@ -1267,7 +1253,7 @@ struct zeta_impl {
|
||||
int i;
|
||||
Scalar p, r, a, b, k, s, t, w;
|
||||
|
||||
const Scalar A[] = {
|
||||
constexpr Scalar A[] = {
|
||||
Scalar(12.0),
|
||||
Scalar(-720.0),
|
||||
Scalar(30240.0),
|
||||
|
Loading…
x
Reference in New Issue
Block a user