PR 681: Add ndtri function, the inverse of the normal distribution function.

2025-09-18 04:13:14 +08:00 · 2019-08-12 19:26:29 -04:00 · 2019-08-12 19:26:29 -04:00 · e38dd48a27
commit e38dd48a27
parent f59bed7a13
20 changed files with 498 additions and 70 deletions
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@ -83,6 +83,7 @@ struct default_packet_traits
    HasPolygamma = 0,
    HasErf = 0,
    HasErfc = 0,
    HasNdtri = 0,
    HasI0e = 0,
    HasI1e = 0,
    HasIGamma = 0,
@ -257,12 +258,32 @@ pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent);
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pzero(const Packet& a) { return pxor(a,a); }
 template<> EIGEN_DEVICE_FUNC inline float pzero<float>(const float& a) {
  EIGEN_UNUSED_VARIABLE(a);
  return 0.;
 }
 template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
  EIGEN_UNUSED_VARIABLE(a);
  return 0.;
 }
 /** \internal \returns bits of \a or \b according to the input bit mask \a mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pselect(const Packet& mask, const Packet& a, const Packet& b) {
  return por(pand(a,mask),pandnot(b,mask));
 }
 template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
    const float& mask, const float& a, const float&b) {
  return mask == 0 ? b : a;
 }
 template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
    const double& mask, const double& a, const double& b) {
  return mask == 0 ? b : a;
 }
 /** \internal \returns a <= b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@ -76,6 +76,7 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@ -72,6 +72,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasLog1p  = 1,
    HasExpm1  = 1,
    HasExp  = 1,
    HasNdtri = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh  = EIGEN_FAST_MATH,
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@ -97,6 +97,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasLog = 1,
    HasLog1p  = 1,
    HasExpm1  = 1,
    HasNdtri = 1,
 #endif
    HasExp = 1,
    HasSqrt = EIGEN_FAST_MATH,
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@ -512,5 +512,60 @@ Packet pcos_float(const Packet& x)
  return psincos_float<false>(x);
 }
 /* polevl (modified for Eigen)
 *
 *      Evaluate polynomial
 *
 *
 *
 * SYNOPSIS:
 *
 * int N;
 * Scalar x, y, coef[N+1];
 *
 * y = polevl<decltype(x), N>( x, coef);
 *
 *
 *
 * DESCRIPTION:
 *
 * Evaluates polynomial of degree N:
 *
 *                     2          N
 * y  =  C  + C x + C x  +...+ C x
 *        0    1     2          N
 *
 * Coefficients are stored in reverse order:
 *
 * coef[0] = C  , ..., coef[N] = C  .
 *            N                   0
 *
 *  The function p1evl() assumes that coef[N] = 1.0 and is
 * omitted from the array.  Its calling arguments are
 * otherwise the same as polevl().
 *
 *
 * The Eigen implementation is templatized.  For best speed, store
 * coef as a const array (constexpr), e.g.
 *
 * const double coef[] = {1.0, 2.0, 3.0, ...};
 *
 */
 template <typename Packet, int N>
 struct ppolevl {
  static EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
  }
 };
 template <typename Packet>
 struct ppolevl<Packet, 0> {
  static EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
    EIGEN_UNUSED_VARIABLE(x);
    return pset1<Packet>(coeff[0]);
  }
 };
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@ -44,6 +44,7 @@ template<> struct packet_traits<float> : default_packet_traits
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasNdtri = 1,
    HasI0e = 1,
    HasI1e = 1,
    HasIGamma = 1,
@ -78,6 +79,7 @@ template<> struct packet_traits<double> : default_packet_traits
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasNdtri = 1,
    HasI0e = 1,
    HasI1e = 1,
    HasIGamma = 1,
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -112,6 +112,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    HasLog  = 1,
    HasLog1p  = 1,
    HasExpm1  = 1,
    HasNdtri = 1,
    HasExp  = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@ -54,6 +54,7 @@ struct sycl_packet_traits : default_packet_traits {
    HasPolygamma = 0,
    HasErf = 0,
    HasErfc = 0,
    HasNdtri = 0,
    HasIGamma = 0,
    HasIGammac = 0,
    HasBetaInc = 0,
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@ -214,6 +214,7 @@ template<typename Scalar> struct scalar_lgamma_op;
 template<typename Scalar> struct scalar_digamma_op;
 template<typename Scalar> struct scalar_erf_op;
 template<typename Scalar> struct scalar_erfc_op;
 template<typename Scalar> struct scalar_ndtri_op;
 template<typename Scalar> struct scalar_igamma_op;
 template<typename Scalar> struct scalar_igammac_op;
 template<typename Scalar> struct scalar_zeta_op;
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@ -536,14 +536,12 @@ typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaRe
 typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
 typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
 typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
 typedef CwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> NdtriReturnType;
 /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
  *
  * \specialfunctions_module
  *
  * Example: \include Cwise_lgamma.cpp
  * Output: \verbinclude Cwise_lgamma.out
  *
  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
  * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
  * type T to be supported.
@ -579,9 +577,6 @@ digamma() const
  *
  * \specialfunctions_module
  *
  * Example: \include Cwise_erf.cpp
  * Output: \verbinclude Cwise_erf.out
  *
  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
  * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
  * type T to be supported.
@ -600,9 +595,6 @@ erf() const
  *
  * \specialfunctions_module
  *
  * Example: \include Cwise_erfc.cpp
  * Output: \verbinclude Cwise_erfc.out
  *
  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
  * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
  * type T to be supported.
@ -615,3 +607,21 @@ erfc() const
 {
  return ErfcReturnType(derived());
 }
 /** \cpp11 \returns an expression of the coefficient-wise Complementary error
  * function of *this.
  *
  * \specialfunctions_module
  *
  * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
  * or float/double in non c++11 mode, the user has to provide implementations of ndtri(T) for any scalar
  * type T to be supported.
  *
  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>, erf()
  */
 EIGEN_DEVICE_FUNC
 inline const NdtriReturnType
 ndtri() const
 {
  return NdtriReturnType(derived());
 }
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@ -590,6 +590,12 @@ template<typename Scalar,typename Packet> void packetmath_real()
    h.store(data2, internal::perfc(h.load(data1)));
    VERIFY((numext::isnan)(data2[0]));
  }
  {
    for (int i=0; i<size; ++i) {
      data1[i] = internal::random<Scalar>(0,1);
    }
    CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri);
  }
 #endif  // EIGEN_HAS_C99_MATH
  for (int i=0; i<size; ++i)
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@ -201,6 +201,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
      return unaryExpr(internal::scalar_erfc_op<Scalar>());
    }
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
    ndtri() const {
      return unaryExpr(internal::scalar_ndtri_op<Scalar>());
    }
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
    sigmoid() const {
--- a/unsupported/Eigen/SpecialFunctions
+++ b/unsupported/Eigen/SpecialFunctions
@ -33,6 +33,7 @@ namespace Eigen {
  * - gamma_sample_der_alpha
  * - igammac
  * - digamma
  * - ndtri
  * - polygamma
  * - zeta
  * - betainc
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@ -283,6 +283,31 @@ struct functor_traits<scalar_erfc_op<Scalar> >
  };
 };
 /** \internal
 * \brief Template functor to compute the Inverse of the normal distribution
 * function of a scalar
 * \sa class CwiseUnaryOp, Cwise::ndtri()
 */
 template<typename Scalar> struct scalar_ndtri_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
    using numext::ndtri; return ndtri(a);
  }
  typedef typename packet_traits<Scalar>::type Packet;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_ndtri_op<Scalar> >
 {
  enum {
    // On average, We are evaluating rational functions with degree N=9 in the
    // numerator and denominator. This results in 2*N additions and 2*N
    // multiplications.
    Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
    PacketAccess = packet_traits<Scalar>::HasNdtri
  };
 };
 /** \internal
 * \brief Template functor to compute the exponentially scaled modified Bessel
 * function of order zero
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@ -30,6 +30,9 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::ha
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
 }
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
  return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
 }
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
 }
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@ -38,63 +38,6 @@ namespace internal {
 namespace cephes {
 /* polevl (modified for Eigen)
 *
 *      Evaluate polynomial
 *
 *
 *
 * SYNOPSIS:
 *
 * int N;
 * Scalar x, y, coef[N+1];
 *
 * y = polevl<decltype(x), N>( x, coef);
 *
 *
 *
 * DESCRIPTION:
 *
 * Evaluates polynomial of degree N:
 *
 *                     2          N
 * y  =  C  + C x + C x  +...+ C x
 *        0    1     2          N
 *
 * Coefficients are stored in reverse order:
 *
 * coef[0] = C  , ..., coef[N] = C  .
 *            N                   0
 *
 *  The function p1evl() assumes that coef[N] = 1.0 and is
 * omitted from the array.  Its calling arguments are
 * otherwise the same as polevl().
 *
 *
 * The Eigen implementation is templatized.  For best speed, store
 * coef as a const array (constexpr), e.g.
 *
 * const double coef[] = {1.0, 2.0, 3.0, ...};
 *
 */
 template <typename Scalar, int N>
 struct polevl {
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
  }
 };
 template <typename Scalar>
 struct polevl<Scalar, 0> {
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
    return coef[0];
  }
 };
 /* chbevl (modified for Eigen)
 *
 *     Evaluate Chebyshev series
@ -264,7 +207,7 @@ struct digamma_impl_maybe_poly<float> {
    float z;
    if (s < 1.0e8f) {
      z = 1.0f / (s * s);
-      return z * cephes::polevl<float, 3>::run(z, A);
+      return z * internal::ppolevl<float, 3>::run(z, A);
    } else return 0.0f;
  }
 };
@ -286,7 +229,7 @@ struct digamma_impl_maybe_poly<double> {
    double z;
    if (s < 1.0e17) {
      z = 1.0 / (s * s);
-      return z * cephes::polevl<double, 6>::run(z, A);
+      return z * internal::ppolevl<double, 6>::run(z, A);
    }
    else return 0.0;
  }
@ -494,6 +437,246 @@ struct erfc_impl<double> {
 };
 #endif  // EIGEN_HAS_C99_MATH
 /***************************************************************************
 * Implementation of ndtri.                                                 *
 ****************************************************************************/
 /* Inverse of Normal distribution function (modified for Eigen).
 *
 *
 * SYNOPSIS:
 *
 * double x, y, ndtri();
 *
 * x = ndtri( y );
 *
 *
 *
 * DESCRIPTION:
 *
 * Returns the argument, x, for which the area under the
 * Gaussian probability density function (integrated from
 * minus infinity to x) is equal to y.
 *
 *
 * For small arguments 0 < y < exp(-2), the program computes
 * z = sqrt( -2.0 * log(y) );  then the approximation is
 * x = z - log(z)/z  - (1/z) P(1/z) / Q(1/z).
 * There are two rational functions P/Q, one for 0 < y < exp(-32)
 * and the other for y up to exp(-2).  For larger arguments,
 * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
 *
 *
 * ACCURACY:
 *
 *                      Relative error:
 * arithmetic   domain        # trials      peak         rms
 *    DEC      0.125, 1         5500       9.5e-17     2.1e-17
 *    DEC      6e-39, 0.135     3500       5.7e-17     1.3e-17
 *    IEEE     0.125, 1        20000       7.2e-16     1.3e-16
 *    IEEE     3e-308, 0.135   50000       4.6e-16     9.8e-17
 *
 *
 * ERROR MESSAGES:
 *
 *   message         condition    value returned
 * ndtri domain       x <= 0        -MAXNUM
 * ndtri domain       x >= 1         MAXNUM
 *
 */
 /*
   Cephes Math Library Release 2.2: June, 1992
   Copyright 1985, 1987, 1992 by Stephen L. Moshier
   Direct inquiries to 30 Frost Street, Cambridge, MA 02140
 */
 // TODO: Add a cheaper approximation for float.
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
    const T& should_flipsign, const T& x) {
  const T sign_mask = pset1<T>(-0.0);
  T sign_bit = pand<T>(should_flipsign, sign_mask);
  return pxor<T>(sign_bit, x);
 }
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(
    const double& should_flipsign, const double& x) {
  return should_flipsign == 0 ? x : -x;
 }
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(
    const float& should_flipsign, const float& x) {
  return should_flipsign == 0 ? x : -x;
 }
 // We split this computation in to two so that in the scalar path
 // only one branch is evaluated (due to our template specialization of pselect
 // being an if statement.)
 template <typename T, typename ScalarType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
  const ScalarType p0[] = {
    ScalarType(-5.99633501014107895267e1),
    ScalarType(9.80010754185999661536e1),
    ScalarType(-5.66762857469070293439e1),
    ScalarType(1.39312609387279679503e1),
    ScalarType(-1.23916583867381258016e0)
  };
  const ScalarType q0[] = {
    ScalarType(1.0),
    ScalarType(1.95448858338141759834e0),
    ScalarType(4.67627912898881538453e0),
    ScalarType(8.63602421390890590575e1),
    ScalarType(-2.25462687854119370527e2),
    ScalarType(2.00260212380060660359e2),
    ScalarType(-8.20372256168333339912e1),
    ScalarType(1.59056225126211695515e1),
    ScalarType(-1.18331621121330003142e0)
  };
  const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
  const T half = pset1<T>(ScalarType(0.5));
  T c, c2, ndtri_gt_exp_neg_two;
  c = psub(b, half);
  c2 = pmul(c, c);
  ndtri_gt_exp_neg_two = pmadd(c, pmul(
      c2, pdiv(
          internal::ppolevl<T, 4>::run(c2, p0),
          internal::ppolevl<T, 8>::run(c2, q0))), c);
  return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
 }
 template <typename T, typename ScalarType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(
    const T& b, const T& should_flipsign) {
  /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
   * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
   */
  const ScalarType p1[] = {
    ScalarType(4.05544892305962419923e0),
    ScalarType(3.15251094599893866154e1),
    ScalarType(5.71628192246421288162e1),
    ScalarType(4.40805073893200834700e1),
    ScalarType(1.46849561928858024014e1),
    ScalarType(2.18663306850790267539e0),
    ScalarType(-1.40256079171354495875e-1),
    ScalarType(-3.50424626827848203418e-2),
    ScalarType(-8.57456785154685413611e-4)
  };
  const ScalarType q1[] = {
    ScalarType(1.0),
    ScalarType(1.57799883256466749731e1),
    ScalarType(4.53907635128879210584e1),
    ScalarType(4.13172038254672030440e1),
    ScalarType(1.50425385692907503408e1),
    ScalarType(2.50464946208309415979e0),
    ScalarType(-1.42182922854787788574e-1),
    ScalarType(-3.80806407691578277194e-2),
    ScalarType(-9.33259480895457427372e-4)
  };
  /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
   * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
   */
  const ScalarType p2[] = {
    ScalarType(3.23774891776946035970e0),
    ScalarType(6.91522889068984211695e0),
    ScalarType(3.93881025292474443415e0),
    ScalarType(1.33303460815807542389e0),
    ScalarType(2.01485389549179081538e-1),
    ScalarType(1.23716634817820021358e-2),
    ScalarType(3.01581553508235416007e-4),
    ScalarType(2.65806974686737550832e-6),
    ScalarType(6.23974539184983293730e-9)
  };
  const ScalarType q2[] = {
    ScalarType(1.0),
    ScalarType(6.02427039364742014255e0),
    ScalarType(3.67983563856160859403e0),
    ScalarType(1.37702099489081330271e0),
    ScalarType(2.16236993594496635890e-1),
    ScalarType(1.34204006088543189037e-2),
    ScalarType(3.28014464682127739104e-4),
    ScalarType(2.89247864745380683936e-6),
    ScalarType(6.79019408009981274425e-9)
  };
  const T eight = pset1<T>(ScalarType(8.0));
  const T one = pset1<T>(ScalarType(1));
  const T neg_two = pset1<T>(ScalarType(-2));
  T x, x0, x1, z;
  x = psqrt(pmul(neg_two, plog(b)));
  x0 = psub(x, pdiv(plog(x), x));
  z = one / x;
  x1 = pmul(
      z, pselect(
          pcmp_lt(x, eight),
          pdiv(internal::ppolevl<T, 8>::run(z, p1),
               internal::ppolevl<T, 8>::run(z, q1)),
          pdiv(internal::ppolevl<T, 8>::run(z, p2),
               internal::ppolevl<T, 8>::run(z, q2))));
  return flipsign(should_flipsign, psub(x0, x1));
 }
 template <typename T, typename ScalarType>
 T generic_ndtri(const T& a) {
  const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
  const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
  const T zero = pset1<T>(ScalarType(0));
  const T one = pset1<T>(ScalarType(1));
  // exp(-2)
  const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
  T b, ndtri, should_flipsign;
  should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
  b = pselect(should_flipsign, a, psub(one, a));
  ndtri = pselect(
      pcmp_lt(exp_neg_two, b),
      generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
      generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
  return pselect(
      pcmp_le(a, zero), neg_maxnum,
      pselect(pcmp_le(one, a), maxnum, ndtri));
 }
 template <typename Scalar>
 struct ndtri_retval {
  typedef Scalar type;
 };
 #if !EIGEN_HAS_C99_MATH
 template <typename Scalar>
 struct ndtri_impl {
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
                        THIS_TYPE_IS_NOT_SUPPORTED);
    return Scalar(0);
  }
 };
 # else
 template <typename Scalar>
 struct ndtri_impl {
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
    return generic_ndtri<Scalar, Scalar>(x);
  }
 };
 #endif  // EIGEN_HAS_C99_MATH
 /**************************************************************************************************************
 * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
 **************************************************************************************************************/
@ -2120,6 +2303,12 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
  return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
 }
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar)
    ndtri(const Scalar& x) {
  return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
 }
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
    igamma(const Scalar& a, const Scalar& x) {
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@ -38,6 +38,13 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 /** \internal \returns the ndtri(\a a) (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pndtri(const Packet& a) {
  typedef typename unpacket_traits<Packet>::type ScalarType;
  using internal::generic_ndtri; return generic_ndtri<Packet, ScalarType>(a);
 }
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
--- a/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
@ -101,6 +101,19 @@ double2 perfc<double2>(const double2& a)
  return make_double2(erfc(a.x), erfc(a.y));
 }
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pndtri<float4>(const float4& a)
 {
  using numext::ndtri;
  return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
 }
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 double2 pndtri<double2>(const double2& a)
 {
  using numext::ndtri;
  return make_double2(ndtri(a.x), ndtri(a.y));
 }
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pigamma<float4>(const float4& a, const float4& x)
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@ -1069,6 +1069,66 @@ void test_gpu_erfc(const Scalar stddev)
  gpuFree(d_out);
 }
 #endif
 template <typename Scalar>
 void test_gpu_ndtri()
 {
  Tensor<Scalar, 1> in_x(8);
  Tensor<Scalar, 1> out(8);
  Tensor<Scalar, 1> expected_out(8);
  out.setZero();
  in_x(0) = Scalar(1);
  in_x(1) = Scalar(0.);
  in_x(2) = Scalar(0.5);
  in_x(3) = Scalar(0.2);
  in_x(4) = Scalar(0.8);
  in_x(5) = Scalar(0.9);
  in_x(6) = Scalar(0.1);
  in_x(7) = Scalar(0.99);
  in_x(8) = Scalar(0.01);
  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
  expected_out(2) = Scalar(0.0);
  expected_out(3) = Scalar(-0.8416212335729142);
  expected_out(4) = Scalar(0.8416212335729142);j
  expected_out(5) = Scalar(1.2815515655446004);
  expected_out(6) = Scalar(-1.2815515655446004);
  expected_out(7) = Scalar(2.3263478740408408);
  expected_out(8) = Scalar(-2.3263478740408408);
  std::size_t bytes = in_x.size() * sizeof(Scalar);
  Scalar* d_in_x;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_out), bytes);
  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
  gpu_out.device(gpu_device) = gpu_in_x.ndtri();
  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY((std::isnan)(out(3)));
  for (int i = 1; i < 6; ++i) {
    if (i != 3) {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }
  gpuFree(d_in_x);
  gpuFree(d_out);
 }
 template <typename Scalar>
 void test_gpu_betainc()
@ -1538,6 +1598,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 #if !defined(EIGEN_USE_HIP)
 // disable these tests on HIP for now.
  CALL_SUBTEST_5(test_gpu_ndtri<float>());
  CALL_SUBTEST_5(test_gpu_ndtri<double>());
  CALL_SUBTEST_5(test_gpu_digamma<float>());
  CALL_SUBTEST_5(test_gpu_digamma<double>());
--- a/unsupported/test/special_functions.cpp
+++ b/unsupported/test/special_functions.cpp
@ -133,6 +133,26 @@ template<typename ArrayType> void array_special_functions()
  }
 #endif  // EIGEN_HAS_C99_MATH
  // Check the ndtri function against scipy.special.ndtri
  {
    ArrayType x(7), res(7), ref(7);
    x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01;
    ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408;
    CALL_SUBTEST( verify_component_wise(ref, ref); );
    CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); );
    CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); );
    // ndtri(normal_cdf(x)) ~= x
    CALL_SUBTEST(
        ArrayType m1 = ArrayType::Random(32);
        using std::sqrt;
        ArrayType cdf_val = (m1 / sqrt(2.)).erf();
        cdf_val = (cdf_val + 1.) / 2.;
        verify_component_wise(cdf_val.ndtri(), m1););
  }
  // Check the zeta function against scipy.special.zeta
  {
    ArrayType x(7), q(7), res(7), ref(7);