From 74dcfbbd0f74922c954aa55cca673037a099e8df Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Mon, 7 Oct 2024 13:27:28 -0700
Subject: [PATCH] Use ppolevl for polynomial evaluation in more places.

---
 .../arch/Default/GenericPacketMathFunctions.h | 464 +++++++-----------
 .../SpecialFunctions/SpecialFunctionsImpl.h   |  94 ++--
 2 files changed, 223 insertions(+), 335 deletions(-)
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 5bce19464..c3270e132 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -42,6 +42,134 @@ struct make_integer<bfloat16> {
   typedef numext::int16_t type;
 };
 
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Packet, int N>
+struct ppolevl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+  }
+};
+
+template <typename Packet>
+struct ppolevl<Packet, 0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_UNUSED_VARIABLE(x);
+    return pset1<Packet>(coeff[0]);
+  }
+};
+
+/* chbevl (modified for Eigen)
+ *
+ *     Evaluate Chebyshev series
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N], chebevl();
+ *
+ * y = chbevl( x, coef, N );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates the series
+ *
+ *        N-1
+ *         - '
+ *  y  =   >   coef[i] T (x/2)
+ *         -            i
+ *        i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero
+ * order term is last in the array.  Note N is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine.  This maps x from (a, b) to (-1, 1),
+ * over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
+ * this becomes x -> 4a/x - 1.
+ *
+ *
+ *
+ * SPEED:
+ *
+ * Taking advantage of the recurrence properties of the
+ * Chebyshev polynomials, the routine requires one more
+ * addition per loop than evaluating a nested polynomial of
+ * the same degree.
+ *
+ */
+
+template <typename Packet, int N>
+struct pchebevl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
+                                                          const typename unpacket_traits<Packet>::type coef[]) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    Packet b0 = pset1<Packet>(coef[0]);
+    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
+    Packet b2;
+
+    for (int i = 1; i < N; i++) {
+      b2 = b1;
+      b1 = b0;
+      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
+    }
+
+    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
+  }
+};
+
 template <typename Packet>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
   typedef typename unpacket_traits<Packet>::type Scalar;
@@ -193,21 +321,14 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const
   e = psub(e, pand(cst_1, mask));
   x = padd(x, tmp);
 
-  // Polynomial coefficients for rational (3,3) r(x) = p(x)/q(x)
+  // Polynomial coefficients for rational r(x) = p(x)/q(x)
   // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
-  const Packet cst_p1 = pset1<Packet>(1.0000000190281136f);
-  const Packet cst_p2 = pset1<Packet>(1.0000000190281063f);
-  const Packet cst_p3 = pset1<Packet>(0.18256296349849254f);
-  const Packet cst_q1 = pset1<Packet>(1.4999999999999927f);
-  const Packet cst_q2 = pset1<Packet>(0.59923249590823520f);
-  const Packet cst_q3 = pset1<Packet>(0.049616247954120038f);
+  constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
+  constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
 
-  Packet p = pmadd(x, cst_p3, cst_p2);
-  p = pmadd(x, p, cst_p1);
+  Packet p = ppolevl<Packet, 2>::run(x, alpha);
   p = pmul(x, p);
-  Packet q = pmadd(x, cst_q3, cst_q2);
-  q = pmadd(x, q, cst_q1);
-  q = pmadd(x, q, cst_1);
+  Packet q = ppolevl<Packet, 3>::run(x, beta);
   x = pdiv(p, q);
 
   // Add the logarithm of the exponent back to the result of the interpolation.
@@ -919,13 +1040,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Pac
   const Packet cst_one = pset1<Packet>(1.0f);
   const Packet cst_two = pset1<Packet>(2.0f);
   const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
-  // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
-  // even terms only.
-  const Packet p9 = pset1<Packet>(5.08838854730129241943359375e-2f);
-  const Packet p7 = pset1<Packet>(3.95139865577220916748046875e-2f);
-  const Packet p5 = pset1<Packet>(7.550220191478729248046875e-2f);
-  const Packet p3 = pset1<Packet>(0.16664917767047882080078125f);
-  const Packet p1 = pset1<Packet>(1.00000011920928955078125f);
 
   const Packet abs_x = pabs(x_in);
   const Packet sign_mask = pandnot(x_in, abs_x);
@@ -941,13 +1055,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Pac
   const Packet x = pselect(large_mask, x_large, abs_x);
   const Packet x2 = pmul(x, x);
 
-  // Compute polynomial.
-  // x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9))))
-
-  Packet p = pmadd(p9, x2, p7);
-  p = pmadd(p, x2, p5);
-  p = pmadd(p, x2, p3);
-  p = pmadd(p, x2, p1);
+  // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
+  // even terms only.
+  constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
+                             7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
   p = pmul(p, x);
 
   const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
@@ -967,34 +1079,17 @@ struct patan_reduced {
 template <>
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
-  const Packet p1 = pset1<Packet>(3.3004361289279920e-01);
-  const Packet p3 = pset1<Packet>(8.2704055405494614e-01);
-  const Packet p5 = pset1<Packet>(7.5365702534987022e-01);
-  const Packet p7 = pset1<Packet>(3.0409318473444424e-01);
-  const Packet p9 = pset1<Packet>(5.2574296781008604e-02);
-  const Packet p11 = pset1<Packet>(3.0917513112462781e-03);
-  const Packet p13 = pset1<Packet>(2.6667153866462208e-05);
-  const Packet q0 = p1;
-  const Packet q2 = pset1<Packet>(9.3705509168587852e-01);
-  const Packet q4 = pset1<Packet>(1.0);
-  const Packet q6 = pset1<Packet>(4.9716458728465573e-01);
-  const Packet q8 = pset1<Packet>(1.1548932646420353e-01);
-  const Packet q10 = pset1<Packet>(1.0899150928962708e-02);
-  const Packet q12 = pset1<Packet>(2.7311202462436667e-04);
+  constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
+                              3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
+                              3.3004361289279920e-01};
+
+  constexpr double beta[] = {
+      2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
+      9.3705509168587852e-01, 3.3004361289279920e-01};
 
   Packet x2 = pmul(x, x);
-  Packet p = pmadd(p13, x2, p11);
-  Packet q = pmadd(q12, x2, q10);
-  p = pmadd(p, x2, p9);
-  q = pmadd(q, x2, q8);
-  p = pmadd(p, x2, p7);
-  q = pmadd(q, x2, q6);
-  p = pmadd(p, x2, p5);
-  q = pmadd(q, x2, q4);
-  p = pmadd(p, x2, p3);
-  q = pmadd(q, x2, q2);
-  p = pmadd(p, x2, p1);
-  q = pmadd(q, x2, q0);
+  Packet p = ppolevl<Packet, 6>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 6>::run(x2, beta);
   return pmul(x, pdiv(p, q));
 }
 
@@ -1002,20 +1097,14 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>
 template <>
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
-  const Packet p1 = pset1<Packet>(8.109951019287109375e-01f);
-  const Packet p3 = pset1<Packet>(7.296695709228515625e-01f);
-  const Packet p5 = pset1<Packet>(1.12026982009410858154296875e-01f);
-  const Packet q0 = p1;
-  const Packet q2 = pset1<Packet>(1.0f);
-  const Packet q4 = pset1<Packet>(2.8318560123443603515625e-01f);
-  const Packet q6 = pset1<Packet>(1.00917108356952667236328125e-02f);
+  constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
+
+  constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
+                            8.109951019287109375e-01f};
 
   Packet x2 = pmul(x, x);
-  Packet p = pmadd(p5, x2, p3);
-  Packet q = pmadd(q6, x2, q4);
-  p = pmadd(p, x2, p1);
-  q = pmadd(q, x2, q2);
-  q = pmadd(q, x2, q0);
+  Packet p = ppolevl<Packet, 2>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 3>::run(x2, beta);
   return pmul(x, pdiv(p, q));
 }
 
@@ -1076,34 +1165,20 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x)
   //   --output=tanhf.sollya --dispCoeff="dec"
 
   // The monomial coefficients of the numerator polynomial (odd).
-  const T alpha_3 = pset1<T>(1.340216100e-1f);
-  const T alpha_5 = pset1<T>(3.520756727e-3f);
-  const T alpha_7 = pset1<T>(2.102733560e-5f);
-  const T alpha_9 = pset1<T>(1.394553628e-8f);
+  constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
 
   // The monomial coefficients of the denominator polynomial (even).
-  const T beta_0 = pset1<T>(1.0f);
-  const T beta_2 = pset1<T>(4.673548340e-1f);
-  const T beta_4 = pset1<T>(2.597254514e-2f);
-  const T beta_6 = pset1<T>(3.326951409e-4f);
-  const T beta_8 = pset1<T>(8.015776984e-7f);
+  constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
 
   // Since the polynomials are odd/even, we need x^2.
   const T x2 = pmul(x, x);
   const T x3 = pmul(x2, x);
 
-  // Interleave the evaluation of the numerator polynomial p and
-  // denominator polynomial q.
-  T p = pmadd(x2, alpha_9, alpha_7);
-  T q = pmadd(x2, beta_8, beta_6);
-  p = pmadd(x2, p, alpha_5);
-  q = pmadd(x2, q, beta_4);
-  p = pmadd(x2, p, alpha_3);
-  q = pmadd(x2, q, beta_2);
-  // Take advantage of the fact that alpha_1 = 1 to compute
-  // x*(x^2*p + alpha_1) = x^3 * p + x.
+  T p = ppolevl<T, 3>::run(x2, alpha);
+  T q = ppolevl<T, 4>::run(x2, beta);
+  // Take advantage of the fact that the constant term in p is 1 to compute
+  // x*(x^2*p + 1) = x^3 * p + x.
   p = pmadd(x3, p, x);
-  q = pmadd(x2, q, beta_0);
 
   // Divide the numerator by the denominator.
   return pdiv(p, q);
@@ -1141,27 +1216,16 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x)
   //   --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
 
   // The monomial coefficients of the numerator polynomial (odd).
-  const T alpha_3 = pset1<T>(1.5184719640284322e-01);
-  const T alpha_5 = pset1<T>(5.9809711724441161e-03);
-  const T alpha_7 = pset1<T>(9.3839087674268880e-05);
-  const T alpha_9 = pset1<T>(6.8644367682497074e-07);
-  const T alpha_11 = pset1<T>(2.4618379131293676e-09);
-  const T alpha_13 = pset1<T>(4.2303918148209176e-12);
-  const T alpha_15 = pset1<T>(3.1309488231386680e-15);
-  const T alpha_17 = pset1<T>(7.6534862268749319e-19);
-  const T alpha_19 = pset1<T>(2.6158007860482230e-23);
+  constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
+                              4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
+                              9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
 
   // The monomial coefficients of the denominator polynomial (even).
-  const T beta_0 = pset1<T>(1.0);
-  const T beta_2 = pset1<T>(4.851805297361760360e-01);
-  const T beta_4 = pset1<T>(3.437448108450402717e-02);
-  const T beta_6 = pset1<T>(8.295161192716231542e-04);
-  const T beta_8 = pset1<T>(8.785185266237658698e-06);
-  const T beta_10 = pset1<T>(4.492975677839633985e-08);
-  const T beta_12 = pset1<T>(1.123643448069621992e-10);
-  const T beta_14 = pset1<T>(1.293019623712687916e-13);
-  const T beta_16 = pset1<T>(5.782506856739003571e-17);
-  const T beta_18 = pset1<T>(6.463747022670968018e-21);
+  constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
+                             1.293019623712687916e-13, 1.123643448069621992e-10,
+                             4.492975677839633985e-08, 8.785185266237658698e-06,
+                             8.295161192716231542e-04, 3.437448108450402717e-02,
+                             4.851805297361760360e-01, 1.0};
 
   // Since the polynomials are odd/even, we need x^2.
   const T x2 = pmul(x, x);
@@ -1169,26 +1233,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x)
 
   // Interleave the evaluation of the numerator polynomial p and
   // denominator polynomial q.
-  T p = pmadd(x2, alpha_19, alpha_17);
-  T q = pmadd(x2, beta_18, beta_16);
-  p = pmadd(x2, p, alpha_15);
-  q = pmadd(x2, q, beta_14);
-  p = pmadd(x2, p, alpha_13);
-  q = pmadd(x2, q, beta_12);
-  p = pmadd(x2, p, alpha_11);
-  q = pmadd(x2, q, beta_10);
-  p = pmadd(x2, p, alpha_9);
-  q = pmadd(x2, q, beta_8);
-  p = pmadd(x2, p, alpha_7);
-  q = pmadd(x2, q, beta_6);
-  p = pmadd(x2, p, alpha_5);
-  q = pmadd(x2, q, beta_4);
-  p = pmadd(x2, p, alpha_3);
-  q = pmadd(x2, q, beta_2);
-  // Take advantage of the fact that alpha_1 = 1 to compute
-  // x*(x^2*p + alpha_1) = x^3 * p + x.
+  T p = ppolevl<T, 8>::run(x2, alpha);
+  T q = ppolevl<T, 9>::run(x2, beta);
+  // Take advantage of the fact that the constant term in p is 1 to compute
+  // x*(x^2*p + 1) = x^3 * p + x.
   p = pmadd(x3, p, x);
-  q = pmadd(x2, q, beta_0);
 
   // Divide the numerator by the denominator.
   return pdiv(p, q);
@@ -1200,18 +1249,13 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Pa
   static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
 
   // For |x| in [0:0.5] we use a polynomial approximation of the form
-  // P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )).
-  const Packet C3 = pset1<Packet>(0.3333373963832855224609375f);
-  const Packet C5 = pset1<Packet>(0.1997792422771453857421875f);
-  const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
-  const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
-  const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
+  // P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
+  constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
+                             0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
   const Packet x2 = pmul(x, x);
-  Packet p = pmadd(C11, x2, C9);
-  p = pmadd(x2, p, C7);
-  p = pmadd(x2, p, C5);
-  p = pmadd(x2, p, C3);
-  p = pmadd(pmul(x, x2), p, x);
+  const Packet x3 = pmul(x, x2);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  p = pmadd(x3, p, x);
 
   // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
   const Packet half = pset1<Packet>(0.5f);
@@ -1234,30 +1278,16 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const P
   static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
   // For x in [-0.5:0.5] we use a rational approximation of the form
   // R(x) = x + x^3*P(x^2)/Q(x^2), where P is or order 4 and Q is of order 5.
-  const Packet p0 = pset1<Packet>(1.2306328729812676e-01);
-  const Packet p2 = pset1<Packet>(-2.5949536095445679e-01);
-  const Packet p4 = pset1<Packet>(1.8185306179826699e-01);
-  const Packet p6 = pset1<Packet>(-4.7129526768798737e-02);
-  const Packet p8 = pset1<Packet>(3.3071338469301391e-03);
+  constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
+                              -2.5949536095445679e-01, 1.2306328729812676e-01};
 
-  const Packet q0 = pset1<Packet>(3.6918986189438030e-01);
-  const Packet q2 = pset1<Packet>(-1.0000000000000000e+00);
-  const Packet q4 = pset1<Packet>(9.8733495886883648e-01);
-  const Packet q6 = pset1<Packet>(-4.2828141436397615e-01);
-  const Packet q8 = pset1<Packet>(7.6391885763341910e-02);
-  const Packet q10 = pset1<Packet>(-3.8679974580640881e-03);
+  constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02,  -4.2828141436397615e-01,
+                             9.8733495886883648e-01,  -1.0000000000000000e+00, 3.6918986189438030e-01};
 
   const Packet x2 = pmul(x, x);
   const Packet x3 = pmul(x, x2);
-  Packet q = pmadd(q10, x2, q8);
-  Packet p = pmadd(p8, x2, p6);
-  q = pmadd(x2, q, q6);
-  p = pmadd(x2, p, p4);
-  q = pmadd(x2, q, q4);
-  p = pmadd(x2, p, p2);
-  q = pmadd(x2, q, q2);
-  p = pmadd(x2, p, p0);
-  q = pmadd(x2, q, q0);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 5>::run(x2, beta);
   Packet y_small = pmadd(x3, pdiv(p, q), x);
 
   // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
@@ -2200,134 +2230,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Pac
                                                          pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))));
 }
 
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Packet, int N>
-struct ppolevl {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
-                                                          const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
-  }
-};
-
-template <typename Packet>
-struct ppolevl<Packet, 0> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
-                                                          const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_UNUSED_VARIABLE(x);
-    return pset1<Packet>(coeff[0]);
-  }
-};
-
-/* chbevl (modified for Eigen)
- *
- *     Evaluate Chebyshev series
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N], chebevl();
- *
- * y = chbevl( x, coef, N );
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates the series
- *
- *        N-1
- *         - '
- *  y  =   >   coef[i] T (x/2)
- *         -            i
- *        i=0
- *
- * of Chebyshev polynomials Ti at argument x/2.
- *
- * Coefficients are stored in reverse order, i.e. the zero
- * order term is last in the array.  Note N is the number of
- * coefficients, not the order.
- *
- * If coefficients are for the interval a to b, x must
- * have been transformed to x -> 2(2x - b - a)/(b-a) before
- * entering the routine.  This maps x from (a, b) to (-1, 1),
- * over which the Chebyshev polynomials are defined.
- *
- * If the coefficients are for the inverted interval, in
- * which (a, b) is mapped to (1/b, 1/a), the transformation
- * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
- * this becomes x -> 4a/x - 1.
- *
- *
- *
- * SPEED:
- *
- * Taking advantage of the recurrence properties of the
- * Chebyshev polynomials, the routine requires one more
- * addition per loop than evaluating a nested polynomial of
- * the same degree.
- *
- */
-
-template <typename Packet, int N>
-struct pchebevl {
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
-                                                          const typename unpacket_traits<Packet>::type coef[]) {
-    typedef typename unpacket_traits<Packet>::type Scalar;
-    Packet b0 = pset1<Packet>(coef[0]);
-    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
-    Packet b2;
-
-    for (int i = 1; i < N; i++) {
-      b2 = b1;
-      b1 = b0;
-      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
-    }
-
-    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
-  }
-};
-
 namespace unary_pow {
 
 template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 56615d367..9f12a3413 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -131,8 +131,8 @@ struct digamma_impl_maybe_poly {
 template <>
 struct digamma_impl_maybe_poly<float> {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float s) {
-    const float A[] = {-4.16666666666666666667E-3f, 3.96825396825396825397E-3f, -8.33333333333333333333E-3f,
-                       8.33333333333333333333E-2f};
+    constexpr float A[] = {-4.16666666666666666667E-3f, 3.96825396825396825397E-3f, -8.33333333333333333333E-3f,
+                           8.33333333333333333333E-2f};
 
     float z;
     if (s < 1.0e8f) {
@@ -146,9 +146,9 @@ struct digamma_impl_maybe_poly<float> {
 template <>
 struct digamma_impl_maybe_poly<double> {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double s) {
-    const double A[] = {8.33333333333333333333E-2,  -2.10927960927960927961E-2, 7.57575757575757575758E-3,
-                        -4.16666666666666666667E-3, 3.96825396825396825397E-3,  -8.33333333333333333333E-3,
-                        8.33333333333333333333E-2};
+    constexpr double A[] = {8.33333333333333333333E-2,  -2.10927960927960927961E-2, 7.57575757575757575758E-3,
+                            -4.16666666666666666667E-3, 3.96825396825396825397E-3,  -8.33333333333333333333E-3,
+                            8.33333333333333333333E-2};
 
     double z;
     if (s < 1.0e17) {
@@ -282,20 +282,14 @@ struct digamma_impl {
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
   // The monomial coefficients of the numerator polynomial (odd).
-  const T alpha_1 = pset1<T>(1.128379106521606445312500000000000000e+00f);
-  const T alpha_3 = pset1<T>(1.874160766601562500000000000000000000e-01f);
-  const T alpha_5 = pset1<T>(5.243302136659622192382812500000000000e-02f);
-  const T alpha_7 = pset1<T>(3.658048342913389205932617187500000000e-03f);
-  const T alpha_9 = pset1<T>(2.861979592125862836837768554687500000e-04f);
-  const T alpha_11 = pset1<T>(2.123732201653183437883853912353515625e-06f);
+  constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f,
+                             3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f,
+                             1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f};
 
   // The monomial coefficients of the denominator polynomial (even).
-  const T beta_0 = pset1<T>(1.0f);
-  const T beta_2 = pset1<T>(4.99425798654556274414062500000e-01f);
-  const T beta_4 = pset1<T>(1.12945675849914550781250000000e-01f);
-  const T beta_6 = pset1<T>(1.47520881146192550659179687500e-02f);
-  const T beta_8 = pset1<T>(1.14329601638019084930419921875e-03f);
-  const T beta_10 = pset1<T>(3.89185734093189239501953125000e-05f);
+  constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f,
+                            1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f,
+                            4.99425798654556274414062500000e-01f, 1.0f};
 
   // Since the polynomials are odd/even, we need x^2.
   // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid
@@ -303,19 +297,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
   const T x2 = pmin(pset1<T>(16.0f), pmul(x, x));
 
   // Evaluate the numerator polynomial p.
-  T p = pmadd(x2, alpha_11, alpha_9);
-  p = pmadd(x2, p, alpha_7);
-  p = pmadd(x2, p, alpha_5);
-  p = pmadd(x2, p, alpha_3);
-  p = pmadd(x2, p, alpha_1);
+  T p = ppolevl<T, 5>::run(x2, alpha);
   p = pmul(x, p);
 
   // Evaluate the denominator polynomial p.
-  T q = pmadd(x2, beta_10, beta_8);
-  q = pmadd(x2, q, beta_6);
-  q = pmadd(x2, q, beta_4);
-  q = pmadd(x2, q, beta_2);
-  q = pmadd(x2, q, beta_0);
+  T q = ppolevl<T, 5>::run(x2, beta);
   const T r = pdiv(p, q);
 
   // Clamp to [-1:1].
@@ -475,18 +461,18 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(const float& should_
 
 template <typename T, typename ScalarType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
-  const ScalarType p0[] = {ScalarType(-5.99633501014107895267e1), ScalarType(9.80010754185999661536e1),
-                           ScalarType(-5.66762857469070293439e1), ScalarType(1.39312609387279679503e1),
-                           ScalarType(-1.23916583867381258016e0)};
-  const ScalarType q0[] = {ScalarType(1.0),
-                           ScalarType(1.95448858338141759834e0),
-                           ScalarType(4.67627912898881538453e0),
-                           ScalarType(8.63602421390890590575e1),
-                           ScalarType(-2.25462687854119370527e2),
-                           ScalarType(2.00260212380060660359e2),
-                           ScalarType(-8.20372256168333339912e1),
-                           ScalarType(1.59056225126211695515e1),
-                           ScalarType(-1.18331621121330003142e0)};
+  constexpr ScalarType p0[] = {ScalarType(-5.99633501014107895267e1), ScalarType(9.80010754185999661536e1),
+                               ScalarType(-5.66762857469070293439e1), ScalarType(1.39312609387279679503e1),
+                               ScalarType(-1.23916583867381258016e0)};
+  constexpr ScalarType q0[] = {ScalarType(1.0),
+                               ScalarType(1.95448858338141759834e0),
+                               ScalarType(4.67627912898881538453e0),
+                               ScalarType(8.63602421390890590575e1),
+                               ScalarType(-2.25462687854119370527e2),
+                               ScalarType(2.00260212380060660359e2),
+                               ScalarType(-8.20372256168333339912e1),
+                               ScalarType(1.59056225126211695515e1),
+                               ScalarType(-1.18331621121330003142e0)};
   const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
   const T half = pset1<T>(ScalarType(0.5));
   T c, c2, ndtri_gt_exp_neg_two;
@@ -503,20 +489,20 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(const T& b,
   /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
    * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
    */
-  const ScalarType p1[] = {ScalarType(4.05544892305962419923e0),   ScalarType(3.15251094599893866154e1),
-                           ScalarType(5.71628192246421288162e1),   ScalarType(4.40805073893200834700e1),
-                           ScalarType(1.46849561928858024014e1),   ScalarType(2.18663306850790267539e0),
-                           ScalarType(-1.40256079171354495875e-1), ScalarType(-3.50424626827848203418e-2),
-                           ScalarType(-8.57456785154685413611e-4)};
-  const ScalarType q1[] = {ScalarType(1.0),
-                           ScalarType(1.57799883256466749731e1),
-                           ScalarType(4.53907635128879210584e1),
-                           ScalarType(4.13172038254672030440e1),
-                           ScalarType(1.50425385692907503408e1),
-                           ScalarType(2.50464946208309415979e0),
-                           ScalarType(-1.42182922854787788574e-1),
-                           ScalarType(-3.80806407691578277194e-2),
-                           ScalarType(-9.33259480895457427372e-4)};
+  constexpr ScalarType p1[] = {ScalarType(4.05544892305962419923e0),   ScalarType(3.15251094599893866154e1),
+                               ScalarType(5.71628192246421288162e1),   ScalarType(4.40805073893200834700e1),
+                               ScalarType(1.46849561928858024014e1),   ScalarType(2.18663306850790267539e0),
+                               ScalarType(-1.40256079171354495875e-1), ScalarType(-3.50424626827848203418e-2),
+                               ScalarType(-8.57456785154685413611e-4)};
+  constexpr ScalarType q1[] = {ScalarType(1.0),
+                               ScalarType(1.57799883256466749731e1),
+                               ScalarType(4.53907635128879210584e1),
+                               ScalarType(4.13172038254672030440e1),
+                               ScalarType(1.50425385692907503408e1),
+                               ScalarType(2.50464946208309415979e0),
+                               ScalarType(-1.42182922854787788574e-1),
+                               ScalarType(-3.80806407691578277194e-2),
+                               ScalarType(-9.33259480895457427372e-4)};
   /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
    * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
    */
@@ -1267,7 +1253,7 @@ struct zeta_impl {
     int i;
     Scalar p, r, a, b, k, s, t, w;
 
-    const Scalar A[] = {
+    constexpr Scalar A[] = {
         Scalar(12.0),
         Scalar(-720.0),
         Scalar(30240.0),