mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 10:44:02 +08:00
Add packetized versions of i0e and i1e special functions.
- In particular refactor the i0e and i1e code so scalar and vectorized path share code. - Move chebevl to GenericPacketMathFunctions. A brief benchmark with building Eigen with FMA, AVX and AVX2 flags Before: CPU: Intel Haswell with HyperThreading (6 cores) Benchmark Time(ns) CPU(ns) Iterations ----------------------------------------------------------------- BM_eigen_i0e_double/1 57.3 57.3 10000000 BM_eigen_i0e_double/8 398 398 1748554 BM_eigen_i0e_double/64 3184 3184 218961 BM_eigen_i0e_double/512 25579 25579 27330 BM_eigen_i0e_double/4k 205043 205042 3418 BM_eigen_i0e_double/32k 1646038 1646176 422 BM_eigen_i0e_double/256k 13180959 13182613 53 BM_eigen_i0e_double/1M 52684617 52706132 10 BM_eigen_i0e_float/1 28.4 28.4 24636711 BM_eigen_i0e_float/8 75.7 75.7 9207634 BM_eigen_i0e_float/64 512 512 1000000 BM_eigen_i0e_float/512 4194 4194 166359 BM_eigen_i0e_float/4k 32756 32761 21373 BM_eigen_i0e_float/32k 261133 261153 2678 BM_eigen_i0e_float/256k 2087938 2088231 333 BM_eigen_i0e_float/1M 8380409 8381234 84 BM_eigen_i1e_double/1 56.3 56.3 10000000 BM_eigen_i1e_double/8 397 397 1772376 BM_eigen_i1e_double/64 3114 3115 223881 BM_eigen_i1e_double/512 25358 25361 27761 BM_eigen_i1e_double/4k 203543 203593 3462 BM_eigen_i1e_double/32k 1613649 1613803 428 BM_eigen_i1e_double/256k 12910625 12910374 54 BM_eigen_i1e_double/1M 51723824 51723991 10 BM_eigen_i1e_float/1 28.3 28.3 24683049 BM_eigen_i1e_float/8 74.8 74.9 9366216 BM_eigen_i1e_float/64 505 505 1000000 BM_eigen_i1e_float/512 4068 4068 171690 BM_eigen_i1e_float/4k 31803 31806 21948 BM_eigen_i1e_float/32k 253637 253692 2763 BM_eigen_i1e_float/256k 2019711 2019918 346 BM_eigen_i1e_float/1M 8238681 8238713 86 After: CPU: Intel Haswell with HyperThreading (6 cores) Benchmark Time(ns) CPU(ns) Iterations ----------------------------------------------------------------- BM_eigen_i0e_double/1 15.8 15.8 44097476 BM_eigen_i0e_double/8 99.3 99.3 7014884 BM_eigen_i0e_double/64 777 777 886612 BM_eigen_i0e_double/512 6180 6181 100000 BM_eigen_i0e_double/4k 48136 48140 14678 BM_eigen_i0e_double/32k 385936 385943 1801 BM_eigen_i0e_double/256k 3293324 3293551 228 BM_eigen_i0e_double/1M 12423600 12424458 57 BM_eigen_i0e_float/1 16.3 16.3 43038042 BM_eigen_i0e_float/8 30.1 30.1 23456931 BM_eigen_i0e_float/64 169 169 4132875 BM_eigen_i0e_float/512 1338 1339 516860 BM_eigen_i0e_float/4k 10191 10191 68513 BM_eigen_i0e_float/32k 81338 81337 8531 BM_eigen_i0e_float/256k 651807 651984 1000 BM_eigen_i0e_float/1M 2633821 2634187 268 BM_eigen_i1e_double/1 16.2 16.2 42352499 BM_eigen_i1e_double/8 110 110 6316524 BM_eigen_i1e_double/64 822 822 851065 BM_eigen_i1e_double/512 6480 6481 100000 BM_eigen_i1e_double/4k 51843 51843 10000 BM_eigen_i1e_double/32k 414854 414852 1680 BM_eigen_i1e_double/256k 3320001 3320568 212 BM_eigen_i1e_double/1M 13442795 13442391 53 BM_eigen_i1e_float/1 17.6 17.6 41025735 BM_eigen_i1e_float/8 35.5 35.5 19597891 BM_eigen_i1e_float/64 240 240 2924237 BM_eigen_i1e_float/512 1424 1424 485953 BM_eigen_i1e_float/4k 10722 10723 65162 BM_eigen_i1e_float/32k 86286 86297 8048 BM_eigen_i1e_float/256k 691821 691868 1000 BM_eigen_i1e_float/1M 2777336 2777747 256 This shows anywhere from a 50% to 75% improvement on these operations. I've also benchmarked without any of these flags turned on, and got similar performance to before (if not better). Also tested packetmath.cpp + special_functions to ensure no regressions.
This commit is contained in:
parent
b052ec6992
commit
facdec5aa7
@ -73,6 +73,8 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasExpm1 = 1,
|
||||
HasExp = 1,
|
||||
HasNdtri = 1,
|
||||
HasI0e = 1,
|
||||
HasI1e = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasTanh = EIGEN_FAST_MATH,
|
||||
|
@ -99,6 +99,8 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasExpm1 = 1,
|
||||
HasNdtri = 1,
|
||||
#endif
|
||||
HasI0e = 1,
|
||||
HasI1e = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = EIGEN_FAST_MATH,
|
||||
HasRsqrt = EIGEN_FAST_MATH,
|
||||
|
@ -570,6 +570,97 @@ struct ppolevl<Packet, 0> {
|
||||
}
|
||||
};
|
||||
|
||||
/* chbevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate Chebyshev series
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N], chebevl();
|
||||
*
|
||||
* y = chbevl( x, coef, N );
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates the series
|
||||
*
|
||||
* N-1
|
||||
* - '
|
||||
* y = > coef[i] T (x/2)
|
||||
* - i
|
||||
* i=0
|
||||
*
|
||||
* of Chebyshev polynomials Ti at argument x/2.
|
||||
*
|
||||
* Coefficients are stored in reverse order, i.e. the zero
|
||||
* order term is last in the array. Note N is the number of
|
||||
* coefficients, not the order.
|
||||
*
|
||||
* If coefficients are for the interval a to b, x must
|
||||
* have been transformed to x -> 2(2x - b - a)/(b-a) before
|
||||
* entering the routine. This maps x from (a, b) to (-1, 1),
|
||||
* over which the Chebyshev polynomials are defined.
|
||||
*
|
||||
* If the coefficients are for the inverted interval, in
|
||||
* which (a, b) is mapped to (1/b, 1/a), the transformation
|
||||
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
|
||||
* this becomes x -> 4a/x - 1.
|
||||
*
|
||||
*
|
||||
*
|
||||
* SPEED:
|
||||
*
|
||||
* Taking advantage of the recurrence properties of the
|
||||
* Chebyshev polynomials, the routine requires one more
|
||||
* addition per loop than evaluating a nested polynomial of
|
||||
* the same degree.
|
||||
*
|
||||
*/
|
||||
template <typename Packet, int N>
|
||||
struct generic_cheb_recurrence {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
|
||||
EIGEN_STATIC_ASSERT((N > 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return pmadd(
|
||||
generic_cheb_recurrence<Packet, N - 1>::run(x, coef), x,
|
||||
psub(pset1<Packet>(coef[N - 1]), generic_cheb_recurrence<Packet, N -
|
||||
2>::run(x, coef)));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Packet>
|
||||
struct generic_cheb_recurrence<Packet, 2> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
|
||||
return pmadd(pset1<Packet>(coef[0]), x, pset1<Packet>(coef[1]));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Packet>
|
||||
struct generic_cheb_recurrence<Packet, 1> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
|
||||
EIGEN_UNUSED_VARIABLE(x);
|
||||
return pset1<Packet>(coef[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Packet, int N>
|
||||
struct pchebevl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
|
||||
const Packet half = pset1<Packet>(0.5);
|
||||
return pmul(half, psub(
|
||||
generic_cheb_recurrence<Packet, N>::run(x, coef),
|
||||
generic_cheb_recurrence<Packet, N - 2>::run(x, coef)));
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
|
@ -114,7 +114,8 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasExpm1 = 1,
|
||||
HasNdtri = 1,
|
||||
HasExp = 1,
|
||||
HasNdtri = 1,
|
||||
HasI0e = 1,
|
||||
HasI1e = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasTanh = EIGEN_FAST_MATH,
|
||||
|
@ -215,6 +215,8 @@ template<typename Scalar> struct scalar_digamma_op;
|
||||
template<typename Scalar> struct scalar_erf_op;
|
||||
template<typename Scalar> struct scalar_erfc_op;
|
||||
template<typename Scalar> struct scalar_ndtri_op;
|
||||
template<typename Scalar> struct scalar_i0e_op;
|
||||
template<typename Scalar> struct scalar_i1e_op;
|
||||
template<typename Scalar> struct scalar_igamma_op;
|
||||
template<typename Scalar> struct scalar_igammac_op;
|
||||
template<typename Scalar> struct scalar_zeta_op;
|
||||
|
@ -609,6 +609,8 @@ template<typename Scalar,typename Packet> void packetmath_real()
|
||||
CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasI0e, numext::i0e, internal::pi0e);
|
||||
CHECK_CWISE1_IF(PacketTraits::HasI1e, numext::i1e, internal::pi1e);
|
||||
#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
|
||||
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
|
||||
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
|
||||
|
@ -36,78 +36,6 @@ namespace internal {
|
||||
// Good luck with your project,
|
||||
// Steve
|
||||
|
||||
namespace cephes {
|
||||
|
||||
/* chbevl (modified for Eigen)
|
||||
*
|
||||
* Evaluate Chebyshev series
|
||||
*
|
||||
*
|
||||
*
|
||||
* SYNOPSIS:
|
||||
*
|
||||
* int N;
|
||||
* Scalar x, y, coef[N], chebevl();
|
||||
*
|
||||
* y = chbevl( x, coef, N );
|
||||
*
|
||||
*
|
||||
*
|
||||
* DESCRIPTION:
|
||||
*
|
||||
* Evaluates the series
|
||||
*
|
||||
* N-1
|
||||
* - '
|
||||
* y = > coef[i] T (x/2)
|
||||
* - i
|
||||
* i=0
|
||||
*
|
||||
* of Chebyshev polynomials Ti at argument x/2.
|
||||
*
|
||||
* Coefficients are stored in reverse order, i.e. the zero
|
||||
* order term is last in the array. Note N is the number of
|
||||
* coefficients, not the order.
|
||||
*
|
||||
* If coefficients are for the interval a to b, x must
|
||||
* have been transformed to x -> 2(2x - b - a)/(b-a) before
|
||||
* entering the routine. This maps x from (a, b) to (-1, 1),
|
||||
* over which the Chebyshev polynomials are defined.
|
||||
*
|
||||
* If the coefficients are for the inverted interval, in
|
||||
* which (a, b) is mapped to (1/b, 1/a), the transformation
|
||||
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
|
||||
* this becomes x -> 4a/x - 1.
|
||||
*
|
||||
*
|
||||
*
|
||||
* SPEED:
|
||||
*
|
||||
* Taking advantage of the recurrence properties of the
|
||||
* Chebyshev polynomials, the routine requires one more
|
||||
* addition per loop than evaluating a nested polynomial of
|
||||
* the same degree.
|
||||
*
|
||||
*/
|
||||
template <typename Scalar, int N>
|
||||
struct chebevl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(Scalar x, const Scalar coef[]) {
|
||||
Scalar b0 = coef[0];
|
||||
Scalar b1 = 0;
|
||||
Scalar b2;
|
||||
|
||||
for (int i = 1; i < N; i++) {
|
||||
b2 = b1;
|
||||
b1 = b0;
|
||||
b0 = x * b1 - b2 + coef[i];
|
||||
}
|
||||
|
||||
return Scalar(0.5) * (b0 - b2);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace cephes
|
||||
|
||||
/****************************************************************************
|
||||
* Implementation of lgamma, requires C++11/C99 *
|
||||
@ -1945,20 +1873,20 @@ struct i0e_retval {
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
struct i0e_impl {
|
||||
template <typename T, typename ScalarType>
|
||||
struct generic_i0e {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
|
||||
static EIGEN_STRONG_INLINE T run(const T&) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
|
||||
THIS_TYPE_IS_NOT_SUPPORTED);
|
||||
return Scalar(0);
|
||||
return ScalarType(0);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct i0e_impl<float> {
|
||||
template <typename T>
|
||||
struct generic_i0e<T, float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE float run(float x) {
|
||||
static EIGEN_STRONG_INLINE T run(const T& x) {
|
||||
/* i0ef.c
|
||||
*
|
||||
* Modified Bessel function of order zero,
|
||||
@ -1991,6 +1919,7 @@ struct i0e_impl<float> {
|
||||
* See i0f().
|
||||
*
|
||||
*/
|
||||
|
||||
const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
|
||||
-2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
|
||||
-4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
|
||||
@ -2005,23 +1934,24 @@ struct i0e_impl<float> {
|
||||
2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
|
||||
6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
|
||||
8.04490411014108831608E-1f};
|
||||
if (x < 0.0f) {
|
||||
x = -x;
|
||||
}
|
||||
|
||||
if (x <= 8.0f) {
|
||||
float y = 0.5f * x - 2.0f;
|
||||
return cephes::chebevl<float, 18>::run(y, A);
|
||||
}
|
||||
|
||||
return cephes::chebevl<float, 7>::run(32.0f / x - 2.0f, B) / numext::sqrt(x);
|
||||
T y = pabs(x);
|
||||
T y_le_eight = internal::pchebevl<T, 18>::run(
|
||||
pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
|
||||
T y_gt_eight = pdiv(
|
||||
internal::pchebevl<T, 7>::run(
|
||||
psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B),
|
||||
psqrt(y));
|
||||
// TODO: Perhaps instead check whether all packet elements are in
|
||||
// [-8, 8] and evaluate a branch based off of that. It's possible
|
||||
// in practice most elements are in this region.
|
||||
return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct i0e_impl<double> {
|
||||
template <typename T>
|
||||
struct generic_i0e<T, double> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(double x) {
|
||||
static EIGEN_STRONG_INLINE T run(const T& x) {
|
||||
/* i0e.c
|
||||
*
|
||||
* Modified Bessel function of order zero,
|
||||
@ -2054,6 +1984,7 @@ struct i0e_impl<double> {
|
||||
* See i0().
|
||||
*
|
||||
*/
|
||||
|
||||
const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
|
||||
-2.43127984654795469359E-16, 1.71539128555513303061E-15,
|
||||
-1.16853328779934516808E-14, 7.67618549860493561688E-14,
|
||||
@ -2083,40 +2014,48 @@ struct i0e_impl<double> {
|
||||
2.04891858946906374183E-7, 2.89137052083475648297E-6,
|
||||
6.88975834691682398426E-5, 3.36911647825569408990E-3,
|
||||
8.04490411014108831608E-1};
|
||||
|
||||
if (x < 0.0) {
|
||||
x = -x;
|
||||
}
|
||||
|
||||
if (x <= 8.0) {
|
||||
double y = (x / 2.0) - 2.0;
|
||||
return cephes::chebevl<double, 30>::run(y, A);
|
||||
}
|
||||
|
||||
return cephes::chebevl<double, 25>::run(32.0 / x - 2.0, B) /
|
||||
numext::sqrt(x);
|
||||
T y = pabs(x);
|
||||
T y_le_eight = internal::pchebevl<T, 30>::run(
|
||||
pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
|
||||
T y_gt_eight = pdiv(
|
||||
internal::pchebevl<T, 25>::run(
|
||||
psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B),
|
||||
psqrt(y));
|
||||
// TODO: Perhaps instead check whether all packet elements are in
|
||||
// [-8, 8] and evaluate a branch based off of that. It's possible
|
||||
// in practice most elements are in this region.
|
||||
return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
struct i0e_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
|
||||
return generic_i0e<Scalar, Scalar>::run(x);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename Scalar>
|
||||
struct i1e_retval {
|
||||
typedef Scalar type;
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
struct i1e_impl {
|
||||
template <typename T, typename ScalarType>
|
||||
struct generic_i1e {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
|
||||
static EIGEN_STRONG_INLINE T run(const T&) {
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
|
||||
THIS_TYPE_IS_NOT_SUPPORTED);
|
||||
return Scalar(0);
|
||||
return ScalarType(0);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct i1e_impl<float> {
|
||||
template <typename T>
|
||||
struct generic_i1e<T, float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE float run(float x) {
|
||||
static EIGEN_STRONG_INLINE T run(const T& x) {
|
||||
/* i1ef.c
|
||||
*
|
||||
* Modified Bessel function of order one,
|
||||
@ -2164,27 +2103,27 @@ struct i1e_impl<float> {
|
||||
-1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
|
||||
7.78576235018280120474E-1f};
|
||||
|
||||
float z = numext::abs(x);
|
||||
|
||||
if (z <= 8.0f) {
|
||||
float y = 0.5f * z - 2.0f;
|
||||
z = cephes::chebevl<float, 17>::run(y, A) * z;
|
||||
} else {
|
||||
z = cephes::chebevl<float, 7>::run(32.0f / z - 2.0f, B) / numext::sqrt(z);
|
||||
}
|
||||
|
||||
if (x < 0.0f) {
|
||||
z = -z;
|
||||
}
|
||||
|
||||
return z;
|
||||
T y = pabs(x);
|
||||
T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(
|
||||
pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
|
||||
T y_gt_eight = pdiv(
|
||||
internal::pchebevl<T, 7>::run(
|
||||
psub(pdiv(pset1<T>(32.0f), y),
|
||||
pset1<T>(2.0f)), B),
|
||||
psqrt(y));
|
||||
// TODO: Perhaps instead check whether all packet elements are in
|
||||
// [-8, 8] and evaluate a branch based off of that. It's possible
|
||||
// in practice most elements are in this region.
|
||||
y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
|
||||
return pselect(pcmp_lt(x, pset1<T>(0.0f)), -y, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct i1e_impl<double> {
|
||||
template <typename T>
|
||||
struct generic_i1e<T, double> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(double x) {
|
||||
static EIGEN_STRONG_INLINE T run(const T& x) {
|
||||
/* i1e.c
|
||||
*
|
||||
* Modified Bessel function of order one,
|
||||
@ -2246,21 +2185,27 @@ struct i1e_impl<double> {
|
||||
-2.51223623787020892529E-7, -3.88256480887769039346E-6,
|
||||
-1.10588938762623716291E-4, -9.76109749136146840777E-3,
|
||||
7.78576235018280120474E-1};
|
||||
T y = pabs(x);
|
||||
T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(
|
||||
pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
|
||||
T y_gt_eight = pdiv(
|
||||
internal::pchebevl<T, 25>::run(
|
||||
psub(pdiv(pset1<T>(32.0), y),
|
||||
pset1<T>(2.0)), B),
|
||||
psqrt(y));
|
||||
// TODO: Perhaps instead check whether all packet elements are in
|
||||
// [-8, 8] and evaluate a branch based off of that. It's possible
|
||||
// in practice most elements are in this region.
|
||||
y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
|
||||
return pselect(pcmp_lt(x, pset1<T>(0.0f)), -y, y);
|
||||
}
|
||||
};
|
||||
|
||||
double z = numext::abs(x);
|
||||
|
||||
if (z <= 8.0) {
|
||||
double y = (z / 2.0) - 2.0;
|
||||
z = cephes::chebevl<double, 29>::run(y, A) * z;
|
||||
} else {
|
||||
z = cephes::chebevl<double, 25>::run(32.0 / z - 2.0, B) / numext::sqrt(z);
|
||||
}
|
||||
|
||||
if (x < 0.0) {
|
||||
z = -z;
|
||||
}
|
||||
|
||||
return z;
|
||||
template <typename Scalar>
|
||||
struct i1e_impl {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
|
||||
return generic_i1e<Scalar, Scalar>::run(x);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -76,13 +76,19 @@ Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext
|
||||
* order zero i0e(\a a) (coeff-wise) */
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pi0e(const Packet& x) { using numext::i0e; return i0e(x); }
|
||||
Packet pi0e(const Packet& x) {
|
||||
typedef typename unpacket_traits<Packet>::type ScalarType;
|
||||
using internal::generic_i0e; return generic_i0e<Packet, ScalarType>::run(x);
|
||||
}
|
||||
|
||||
/** \internal \returns the exponentially scaled modified Bessel function of
|
||||
* order one i1e(\a a) (coeff-wise) */
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||
Packet pi1e(const Packet& x) { using numext::i1e; return i1e(x); }
|
||||
Packet pi1e(const Packet& x) {
|
||||
typedef typename unpacket_traits<Packet>::type ScalarType;
|
||||
using internal::generic_i1e; return generic_i1e<Packet, ScalarType>::run(x);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user