Add packetized versions of i0e and i1e special functions.

- In particular refactor the i0e and i1e code so scalar and vectorized path share code.
  - Move chebevl to GenericPacketMathFunctions.


A brief benchmark with building Eigen with FMA, AVX and AVX2 flags

Before:

CPU: Intel Haswell with HyperThreading (6 cores)
Benchmark                  Time(ns)        CPU(ns)     Iterations
-----------------------------------------------------------------
BM_eigen_i0e_double/1            57.3           57.3     10000000
BM_eigen_i0e_double/8           398            398        1748554
BM_eigen_i0e_double/64         3184           3184         218961
BM_eigen_i0e_double/512       25579          25579          27330
BM_eigen_i0e_double/4k       205043         205042           3418
BM_eigen_i0e_double/32k     1646038        1646176            422
BM_eigen_i0e_double/256k   13180959       13182613             53
BM_eigen_i0e_double/1M     52684617       52706132             10
BM_eigen_i0e_float/1             28.4           28.4     24636711
BM_eigen_i0e_float/8             75.7           75.7      9207634
BM_eigen_i0e_float/64           512            512        1000000
BM_eigen_i0e_float/512         4194           4194         166359
BM_eigen_i0e_float/4k         32756          32761          21373
BM_eigen_i0e_float/32k       261133         261153           2678
BM_eigen_i0e_float/256k     2087938        2088231            333
BM_eigen_i0e_float/1M       8380409        8381234             84
BM_eigen_i1e_double/1            56.3           56.3     10000000
BM_eigen_i1e_double/8           397            397        1772376
BM_eigen_i1e_double/64         3114           3115         223881
BM_eigen_i1e_double/512       25358          25361          27761
BM_eigen_i1e_double/4k       203543         203593           3462
BM_eigen_i1e_double/32k     1613649        1613803            428
BM_eigen_i1e_double/256k   12910625       12910374             54
BM_eigen_i1e_double/1M     51723824       51723991             10
BM_eigen_i1e_float/1             28.3           28.3     24683049
BM_eigen_i1e_float/8             74.8           74.9      9366216
BM_eigen_i1e_float/64           505            505        1000000
BM_eigen_i1e_float/512         4068           4068         171690
BM_eigen_i1e_float/4k         31803          31806          21948
BM_eigen_i1e_float/32k       253637         253692           2763
BM_eigen_i1e_float/256k     2019711        2019918            346
BM_eigen_i1e_float/1M       8238681        8238713             86


After:

CPU: Intel Haswell with HyperThreading (6 cores)
Benchmark                  Time(ns)        CPU(ns)     Iterations
-----------------------------------------------------------------
BM_eigen_i0e_double/1            15.8           15.8     44097476
BM_eigen_i0e_double/8            99.3           99.3      7014884
BM_eigen_i0e_double/64          777            777         886612
BM_eigen_i0e_double/512        6180           6181         100000
BM_eigen_i0e_double/4k        48136          48140          14678
BM_eigen_i0e_double/32k      385936         385943           1801
BM_eigen_i0e_double/256k    3293324        3293551            228
BM_eigen_i0e_double/1M     12423600       12424458             57
BM_eigen_i0e_float/1             16.3           16.3     43038042
BM_eigen_i0e_float/8             30.1           30.1     23456931
BM_eigen_i0e_float/64           169            169        4132875
BM_eigen_i0e_float/512         1338           1339         516860
BM_eigen_i0e_float/4k         10191          10191          68513
BM_eigen_i0e_float/32k        81338          81337           8531
BM_eigen_i0e_float/256k      651807         651984           1000
BM_eigen_i0e_float/1M       2633821        2634187            268
BM_eigen_i1e_double/1            16.2           16.2     42352499
BM_eigen_i1e_double/8           110            110        6316524
BM_eigen_i1e_double/64          822            822         851065
BM_eigen_i1e_double/512        6480           6481         100000
BM_eigen_i1e_double/4k        51843          51843          10000
BM_eigen_i1e_double/32k      414854         414852           1680
BM_eigen_i1e_double/256k    3320001        3320568            212
BM_eigen_i1e_double/1M     13442795       13442391             53
BM_eigen_i1e_float/1             17.6           17.6     41025735
BM_eigen_i1e_float/8             35.5           35.5     19597891
BM_eigen_i1e_float/64           240            240        2924237
BM_eigen_i1e_float/512         1424           1424         485953
BM_eigen_i1e_float/4k         10722          10723          65162
BM_eigen_i1e_float/32k        86286          86297           8048
BM_eigen_i1e_float/256k      691821         691868           1000
BM_eigen_i1e_float/1M       2777336        2777747            256


This shows anywhere from a 50% to 75% improvement on these operations.

I've also benchmarked without any of these flags turned on, and got similar
performance to before (if not better).

Also tested packetmath.cpp + special_functions to ensure no regressions.
This commit is contained in:
Srinivas Vasudevan 2019-09-11 18:34:02 -07:00
parent b052ec6992
commit facdec5aa7
8 changed files with 197 additions and 146 deletions

View File

@ -73,6 +73,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasExpm1 = 1,
HasExp = 1,
HasNdtri = 1,
HasI0e = 1,
HasI1e = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,

View File

@ -99,6 +99,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasExpm1 = 1,
HasNdtri = 1,
#endif
HasI0e = 1,
HasI1e = 1,
HasExp = 1,
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,

View File

@ -570,6 +570,97 @@ struct ppolevl<Packet, 0> {
}
};
/* chbevl (modified for Eigen)
*
* Evaluate Chebyshev series
*
*
*
* SYNOPSIS:
*
* int N;
* Scalar x, y, coef[N], chebevl();
*
* y = chbevl( x, coef, N );
*
*
*
* DESCRIPTION:
*
* Evaluates the series
*
* N-1
* - '
* y = > coef[i] T (x/2)
* - i
* i=0
*
* of Chebyshev polynomials Ti at argument x/2.
*
* Coefficients are stored in reverse order, i.e. the zero
* order term is last in the array. Note N is the number of
* coefficients, not the order.
*
* If coefficients are for the interval a to b, x must
* have been transformed to x -> 2(2x - b - a)/(b-a) before
* entering the routine. This maps x from (a, b) to (-1, 1),
* over which the Chebyshev polynomials are defined.
*
* If the coefficients are for the inverted interval, in
* which (a, b) is mapped to (1/b, 1/a), the transformation
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
* this becomes x -> 4a/x - 1.
*
*
*
* SPEED:
*
* Taking advantage of the recurrence properties of the
* Chebyshev polynomials, the routine requires one more
* addition per loop than evaluating a nested polynomial of
* the same degree.
*
*/
template <typename Packet, int N>
struct generic_cheb_recurrence {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
EIGEN_STATIC_ASSERT((N > 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
return pmadd(
generic_cheb_recurrence<Packet, N - 1>::run(x, coef), x,
psub(pset1<Packet>(coef[N - 1]), generic_cheb_recurrence<Packet, N -
2>::run(x, coef)));
}
};
template <typename Packet>
struct generic_cheb_recurrence<Packet, 2> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
return pmadd(pset1<Packet>(coef[0]), x, pset1<Packet>(coef[1]));
}
};
template <typename Packet>
struct generic_cheb_recurrence<Packet, 1> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
EIGEN_UNUSED_VARIABLE(x);
return pset1<Packet>(coef[0]);
}
};
template <typename Packet, int N>
struct pchebevl {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
const Packet half = pset1<Packet>(0.5);
return pmul(half, psub(
generic_cheb_recurrence<Packet, N>::run(x, coef),
generic_cheb_recurrence<Packet, N - 2>::run(x, coef)));
}
};
} // end namespace internal
} // end namespace Eigen

View File

@ -114,7 +114,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasExpm1 = 1,
HasNdtri = 1,
HasExp = 1,
HasNdtri = 1,
HasI0e = 1,
HasI1e = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,

View File

@ -215,6 +215,8 @@ template<typename Scalar> struct scalar_digamma_op;
template<typename Scalar> struct scalar_erf_op;
template<typename Scalar> struct scalar_erfc_op;
template<typename Scalar> struct scalar_ndtri_op;
template<typename Scalar> struct scalar_i0e_op;
template<typename Scalar> struct scalar_i1e_op;
template<typename Scalar> struct scalar_igamma_op;
template<typename Scalar> struct scalar_igammac_op;
template<typename Scalar> struct scalar_zeta_op;

View File

@ -609,6 +609,8 @@ template<typename Scalar,typename Packet> void packetmath_real()
CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt);
CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
CHECK_CWISE1_IF(PacketTraits::HasI0e, numext::i0e, internal::pi0e);
CHECK_CWISE1_IF(PacketTraits::HasI1e, numext::i1e, internal::pi1e);
#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);

View File

@ -36,78 +36,6 @@ namespace internal {
// Good luck with your project,
// Steve
namespace cephes {
/* chbevl (modified for Eigen)
*
* Evaluate Chebyshev series
*
*
*
* SYNOPSIS:
*
* int N;
* Scalar x, y, coef[N], chebevl();
*
* y = chbevl( x, coef, N );
*
*
*
* DESCRIPTION:
*
* Evaluates the series
*
* N-1
* - '
* y = > coef[i] T (x/2)
* - i
* i=0
*
* of Chebyshev polynomials Ti at argument x/2.
*
* Coefficients are stored in reverse order, i.e. the zero
* order term is last in the array. Note N is the number of
* coefficients, not the order.
*
* If coefficients are for the interval a to b, x must
* have been transformed to x -> 2(2x - b - a)/(b-a) before
* entering the routine. This maps x from (a, b) to (-1, 1),
* over which the Chebyshev polynomials are defined.
*
* If the coefficients are for the inverted interval, in
* which (a, b) is mapped to (1/b, 1/a), the transformation
* required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
* this becomes x -> 4a/x - 1.
*
*
*
* SPEED:
*
* Taking advantage of the recurrence properties of the
* Chebyshev polynomials, the routine requires one more
* addition per loop than evaluating a nested polynomial of
* the same degree.
*
*/
template <typename Scalar, int N>
struct chebevl {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(Scalar x, const Scalar coef[]) {
Scalar b0 = coef[0];
Scalar b1 = 0;
Scalar b2;
for (int i = 1; i < N; i++) {
b2 = b1;
b1 = b0;
b0 = x * b1 - b2 + coef[i];
}
return Scalar(0.5) * (b0 - b2);
}
};
} // end namespace cephes
/****************************************************************************
* Implementation of lgamma, requires C++11/C99 *
@ -1945,20 +1873,20 @@ struct i0e_retval {
typedef Scalar type;
};
template <typename Scalar>
struct i0e_impl {
template <typename T, typename ScalarType>
struct generic_i0e {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
static EIGEN_STRONG_INLINE T run(const T&) {
EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
return ScalarType(0);
}
};
template <>
struct i0e_impl<float> {
template <typename T>
struct generic_i0e<T, float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
static EIGEN_STRONG_INLINE T run(const T& x) {
/* i0ef.c
*
* Modified Bessel function of order zero,
@ -1991,6 +1919,7 @@ struct i0e_impl<float> {
* See i0f().
*
*/
const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
-2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
-4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
@ -2005,23 +1934,24 @@ struct i0e_impl<float> {
2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
8.04490411014108831608E-1f};
if (x < 0.0f) {
x = -x;
}
if (x <= 8.0f) {
float y = 0.5f * x - 2.0f;
return cephes::chebevl<float, 18>::run(y, A);
}
return cephes::chebevl<float, 7>::run(32.0f / x - 2.0f, B) / numext::sqrt(x);
T y = pabs(x);
T y_le_eight = internal::pchebevl<T, 18>::run(
pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
T y_gt_eight = pdiv(
internal::pchebevl<T, 7>::run(
psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B),
psqrt(y));
// TODO: Perhaps instead check whether all packet elements are in
// [-8, 8] and evaluate a branch based off of that. It's possible
// in practice most elements are in this region.
return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
}
};
template <>
struct i0e_impl<double> {
template <typename T>
struct generic_i0e<T, double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
static EIGEN_STRONG_INLINE T run(const T& x) {
/* i0e.c
*
* Modified Bessel function of order zero,
@ -2054,6 +1984,7 @@ struct i0e_impl<double> {
* See i0().
*
*/
const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
-2.43127984654795469359E-16, 1.71539128555513303061E-15,
-1.16853328779934516808E-14, 7.67618549860493561688E-14,
@ -2083,40 +2014,48 @@ struct i0e_impl<double> {
2.04891858946906374183E-7, 2.89137052083475648297E-6,
6.88975834691682398426E-5, 3.36911647825569408990E-3,
8.04490411014108831608E-1};
if (x < 0.0) {
x = -x;
}
if (x <= 8.0) {
double y = (x / 2.0) - 2.0;
return cephes::chebevl<double, 30>::run(y, A);
}
return cephes::chebevl<double, 25>::run(32.0 / x - 2.0, B) /
numext::sqrt(x);
T y = pabs(x);
T y_le_eight = internal::pchebevl<T, 30>::run(
pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
T y_gt_eight = pdiv(
internal::pchebevl<T, 25>::run(
psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B),
psqrt(y));
// TODO: Perhaps instead check whether all packet elements are in
// [-8, 8] and evaluate a branch based off of that. It's possible
// in practice most elements are in this region.
return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
}
};
template <typename Scalar>
struct i0e_impl {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
return generic_i0e<Scalar, Scalar>::run(x);
}
};
template <typename Scalar>
struct i1e_retval {
typedef Scalar type;
};
template <typename Scalar>
struct i1e_impl {
template <typename T, typename ScalarType>
struct generic_i1e {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
static EIGEN_STRONG_INLINE T run(const T&) {
EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
return ScalarType(0);
}
};
template <>
struct i1e_impl<float> {
template <typename T>
struct generic_i1e<T, float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
static EIGEN_STRONG_INLINE T run(const T& x) {
/* i1ef.c
*
* Modified Bessel function of order one,
@ -2164,27 +2103,27 @@ struct i1e_impl<float> {
-1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
7.78576235018280120474E-1f};
float z = numext::abs(x);
if (z <= 8.0f) {
float y = 0.5f * z - 2.0f;
z = cephes::chebevl<float, 17>::run(y, A) * z;
} else {
z = cephes::chebevl<float, 7>::run(32.0f / z - 2.0f, B) / numext::sqrt(z);
}
if (x < 0.0f) {
z = -z;
}
return z;
T y = pabs(x);
T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(
pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
T y_gt_eight = pdiv(
internal::pchebevl<T, 7>::run(
psub(pdiv(pset1<T>(32.0f), y),
pset1<T>(2.0f)), B),
psqrt(y));
// TODO: Perhaps instead check whether all packet elements are in
// [-8, 8] and evaluate a branch based off of that. It's possible
// in practice most elements are in this region.
y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
return pselect(pcmp_lt(x, pset1<T>(0.0f)), -y, y);
}
};
template <>
struct i1e_impl<double> {
template <typename T>
struct generic_i1e<T, double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
static EIGEN_STRONG_INLINE T run(const T& x) {
/* i1e.c
*
* Modified Bessel function of order one,
@ -2246,21 +2185,27 @@ struct i1e_impl<double> {
-2.51223623787020892529E-7, -3.88256480887769039346E-6,
-1.10588938762623716291E-4, -9.76109749136146840777E-3,
7.78576235018280120474E-1};
T y = pabs(x);
T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(
pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
T y_gt_eight = pdiv(
internal::pchebevl<T, 25>::run(
psub(pdiv(pset1<T>(32.0), y),
pset1<T>(2.0)), B),
psqrt(y));
// TODO: Perhaps instead check whether all packet elements are in
// [-8, 8] and evaluate a branch based off of that. It's possible
// in practice most elements are in this region.
y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
return pselect(pcmp_lt(x, pset1<T>(0.0f)), -y, y);
}
};
double z = numext::abs(x);
if (z <= 8.0) {
double y = (z / 2.0) - 2.0;
z = cephes::chebevl<double, 29>::run(y, A) * z;
} else {
z = cephes::chebevl<double, 25>::run(32.0 / z - 2.0, B) / numext::sqrt(z);
}
if (x < 0.0) {
z = -z;
}
return z;
template <typename Scalar>
struct i1e_impl {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
return generic_i1e<Scalar, Scalar>::run(x);
}
};

View File

@ -76,13 +76,19 @@ Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext
* order zero i0e(\a a) (coeff-wise) */
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pi0e(const Packet& x) { using numext::i0e; return i0e(x); }
Packet pi0e(const Packet& x) {
typedef typename unpacket_traits<Packet>::type ScalarType;
using internal::generic_i0e; return generic_i0e<Packet, ScalarType>::run(x);
}
/** \internal \returns the exponentially scaled modified Bessel function of
* order one i1e(\a a) (coeff-wise) */
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pi1e(const Packet& x) { using numext::i1e; return i1e(x); }
Packet pi1e(const Packet& x) {
typedef typename unpacket_traits<Packet>::type ScalarType;
using internal::generic_i1e; return generic_i1e<Packet, ScalarType>::run(x);
}
} // end namespace internal