From 7ea35bfa1c0b4950feae65d49c0e6f2cbf3691d9 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@gmail.com>
Date: Thu, 3 Mar 2016 19:39:41 -0800
Subject: [PATCH 1/4] Initial implementation of igamma and igammac.

---
 Eigen/src/Core/GenericPacketMath.h        |  10 +
 Eigen/src/Core/GlobalFunctions.h          |  30 +++
 Eigen/src/Core/NumTraits.h                |   5 +
 Eigen/src/Core/SpecialFunctions.h         | 291 +++++++++++++++++++++-
 Eigen/src/Core/arch/CUDA/MathFunctions.h  |  18 ++
 Eigen/src/Core/arch/CUDA/PacketMath.h     |   4 +
 Eigen/src/Core/functors/BinaryFunctors.h  |  49 ++++
 Eigen/src/Core/util/ForwardDeclarations.h |   2 +
 Eigen/src/Core/util/Meta.h                |   5 +
 test/array.cpp                            |  47 +++-
 10 files changed, 459 insertions(+), 2 deletions(-)
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 02882bdea..ead0253df 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -78,6 +78,8 @@ struct default_packet_traits
     HasDiGamma = 0,
     HasErf = 0,
     HasErfc = 0,
+    HasIGamma = 0,
+    HasIGammac = 0,
 
     HasRound  = 0,
     HasFloor  = 0,
@@ -457,6 +459,14 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 396da8e71..7df0fdda9 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -129,6 +129,36 @@ namespace Eigen
     );
   }
 
+  /** \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
+    *
+    * This function computes the coefficient-wise incomplete gamma function.
+    *
+    */
+  template<typename Derived,typename ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+  igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
+  {
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+      a.derived(),
+      x.derived()
+    );
+  }
+
+  /** \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
+    *
+    * This function computes the coefficient-wise complementary incomplete gamma function.
+    *
+    */
+  template<typename Derived,typename ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+  igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) 
+  {
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+      a.derived(),
+      x.derived()
+    );
+  }
+
   namespace internal
   {
     EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op)
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 6a596bb7d..7ddb4a867 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -95,6 +95,11 @@ template<typename T> struct GenericNumTraits
   static inline T infinity() {
     return numext::numeric_limits<T>::infinity();
   }
+
+  EIGEN_DEVICE_FUNC
+  static inline T quiet_NaN() {
+    return numext::numeric_limits<T>::quiet_NaN();
+  }
 };
 
 template<typename T> struct NumTraits : GenericNumTraits<T>
diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index b02ad9a1f..ff2146afc 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -283,7 +283,7 @@ struct digamma_impl {
     Scalar p, q, nz, s, w, y;
     bool negative;
 
-    const Scalar maxnum = numext::numeric_limits<Scalar>::infinity();
+    const Scalar maxnum = NumTraits<Scalar>::infinity();
     const Scalar m_pi = 3.14159265358979323846;
 
     negative = 0;
@@ -401,6 +401,282 @@ struct erfc_impl<double> {
 };
 #endif  // EIGEN_HAS_C99_MATH
 
+/****************************************************************************
+ * Implementation of igammac (complemented incomplete gamma integral)       *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct igammac_retval {
+  typedef Scalar type;
+};
+
+#ifndef EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igammac_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
+
+template <typename Scalar> struct igamma_impl;  // predeclare igamma_impl
+
+template <typename Scalar>
+struct igammac_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    /*							igamc()
+     *
+     *	Incomplete gamma integral (modified for Eigen)
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double a, x, y, igamc();
+     *
+     * y = igamc( a, x );
+     *
+     * DESCRIPTION:
+     *
+     * The function is defined by
+     *
+     *
+     *  igamc(a,x)   =   1 - igam(a,x)
+     *
+     *                            inf.
+     *                              -
+     *                     1       | |  -t  a-1
+     *               =   -----     |   e   t   dt.
+     *                    -      | |
+     *                   | (a)    -
+     *                             x
+     *
+     *
+     * In this implementation both arguments must be positive.
+     * The integral is evaluated by either a power series or
+     * continued fraction expansion, depending on the relative
+     * values of a and x.
+     *
+     * ACCURACY (float):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       7.8e-6      5.9e-7
+     *
+     *
+     * ACCURACY (double):
+     *
+     * Tested at random a, x.
+     *                a         x                      Relative error:
+     * arithmetic   domain   domain     # trials      peak         rms
+     *    IEEE     0.5,100   0,100      200000       1.9e-14     1.7e-15
+     *    IEEE     0.01,0.5  0,100      200000       1.4e-13     1.6e-15
+     *
+     */
+    /*
+      Cephes Math Library Release 2.2: June, 1992
+      Copyright 1985, 1987, 1992 by Stephen L. Moshier
+      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+    */
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+    const Scalar machep = NumTraits<Scalar>::epsilon();
+    const Scalar maxlog = ::log(NumTraits<Scalar>::highest());
+    const Scalar big = one / machep;
+
+    Scalar ans, ax, c, yc, r, t, y, z;
+    Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
+
+    if ((x <= zero) || ( a <= zero)) {
+      return one;
+    }
+
+    if ((x < one) || (x < a)) {
+      return (one - igamma_impl<Scalar>::run(a, x));
+    }
+
+    ax = a * ::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if( ax < -maxlog ) {  // underflow
+      return zero;
+    }
+    ax = ::exp(ax);
+
+    // continued fraction
+    y = one - a;
+    z = x + y + one;
+    c = zero;
+    pkm2 = one;
+    qkm2 = x;
+    pkm1 = x + one;
+    qkm1 = z * x;
+    ans = pkm1/qkm1;
+
+    do {
+      c += one;
+      y += one;
+      z += two;
+      yc = y * c;
+      pk = pkm1 * z  -  pkm2 * yc;
+      qk = qkm1 * z  -  qkm2 * yc;
+      if( qk != zero ) {
+        r = pk/qk;
+        t = ::abs( (ans - r)/r );
+        ans = r;
+      } else  {
+        t = one;
+      }
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+      if (::abs(pk) > big) {
+        pkm2 *= machep;
+        pkm1 *= machep;
+        qkm2 *= machep;
+        qkm1 *= machep;
+      }
+    } while( t > machep );
+
+    return ( ans * ax );
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+/****************************************************************************
+ * Implementation of igamma (incomplete gamma integral)                     *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct igamma_retval {
+  typedef Scalar type;
+};
+
+#ifndef EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igamma_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+#else
+
+template <typename Scalar>
+struct igamma_impl {
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    /*							igam()
+     *	Incomplete gamma integral
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double a, x, y, igam();
+     *
+     * y = igam( a, x );
+     *
+     * DESCRIPTION:
+     *
+     * The function is defined by
+     *
+     *                           x
+     *                            -
+     *                   1       | |  -t  a-1
+     *  igam(a,x)  =   -----     |   e   t   dt.
+     *                  -      | |
+     *                 | (a)    -
+     *                           0
+     *
+     *
+     * In this implementation both arguments must be positive.
+     * The integral is evaluated by either a power series or
+     * continued fraction expansion, depending on the relative
+     * values of a and x.
+     *
+     * ACCURACY (double):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30       200000       3.6e-14     2.9e-15
+     *    IEEE      0,100      300000       9.9e-14     1.5e-14
+     *
+     *
+     * ACCURACY (float):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        20000       7.8e-6      5.9e-7
+     *
+     */
+    /*
+      Cephes Math Library Release 2.2: June, 1992
+      Copyright 1985, 1987, 1992 by Stephen L. Moshier
+      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+    */
+
+
+    /* left tail of incomplete gamma function:
+     *
+     *          inf.      k
+     *   a  -x   -       x
+     *  x  e     >   ----------
+     *           -     -
+     *          k=0   | (a+k+1)
+     *
+     */
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar machep = NumTraits<Scalar>::epsilon();
+    const Scalar maxlog = ::log(NumTraits<Scalar>::highest());
+
+    double ans, ax, c, r;
+
+    if( (x <= zero) || ( a <= zero) ) {
+      return zero;
+    }
+
+    if( (x > one) && (x > a ) ) {
+      return (one - igammac_impl<Scalar>::run(a,x));
+    }
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    ax = a * ::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if( ax < -maxlog ) {
+      // underflow
+      return zero;
+    }
+    ax = ::exp(ax);
+
+    /* power series */
+    r = a;
+    c = one;
+    ans = one;
+
+    do {
+      r += one;
+      c *= x/r;
+      ans += c;
+    } while( c/ans > machep );
+
+    return( ans * ax/a );
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
 }  // end namespace internal
 
 namespace numext {
@@ -429,8 +705,21 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
   return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
+    igamma(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
+    igammac(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
+}
+
 }  // end namespace numext
 
+
 }  // end namespace Eigen
 
 #endif  // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
index a2c06a817..6e84d3af8 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -116,6 +116,24 @@ double2 perfc<double2>(const double2& a)
   return make_double2(erfc(a.x), erfc(a.y));
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::pigamma;
+  return make_float4(
+    pigamma(a.x, x.x),
+    pigamma(a.y, x.y),
+    pigamma(a.z, x.z),
+    pigamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double& x)
+{
+  using numext::pigammac;
+  return make_double2(pigammac(a.x, x.x), pigammac(a.y, x.y));
+}
+
 
 #endif
 
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index d3d9f910e..d2563030b 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -43,6 +43,8 @@ template<> struct packet_traits<float> : default_packet_traits
     HasDiGamma = 1,
     HasErf = 1,
     HasErfc = 1,
+    HasIgamma = 1,
+    HasIGammac = 1,
 
     HasBlend = 0,
   };
@@ -67,6 +69,8 @@ template<> struct packet_traits<double> : default_packet_traits
     HasDiGamma = 1,
     HasErf = 1,
     HasErfc = 1,
+    HasIGamma = 1,
+    HasIGammac = 1,
 
     HasBlend = 0,
   };
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 4962d625c..5cdfff845 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -337,6 +337,55 @@ template<> struct functor_traits<scalar_boolean_or_op> {
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the incomplete gamma function igamma(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igamma
+  */
+template<typename Scalar> struct scalar_igamma_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igamma; return igamma(a, x);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigammac(a, x);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igamma_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGamma
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igammac
+  */
+template<typename Scalar> struct scalar_igammac_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+    using numext::igammac; return igammac(a, x);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
+  {
+    return internal::pigammac(a, x);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igammac_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammac
+  };
+};
 
 
 //---------- binary functors bound to a constant, thus appearing as a unary functor ----------
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index f09632375..a102e5457 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -206,6 +206,8 @@ template<typename Scalar> struct scalar_add_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
 template<typename Scalar,bool iscpx> struct scalar_sign_op;
+template<typename Scalar> struct scalar_igamma_op;
+template<typename Scalar> struct scalar_igammac_op;
 
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
 template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 6b35179f2..24e8a6d8a 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -148,6 +148,7 @@ template<typename T> struct numeric_limits
   static T (max)() { assert(false && "Highest not supported for this type"); }
   static T (min)() { assert(false && "Lowest not supported for this type"); }
   static T infinity() { assert(false && "Infinity not supported for this type"); }
+  static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); }
 };
 template<> struct numeric_limits<float>
 {
@@ -159,6 +160,8 @@ template<> struct numeric_limits<float>
   static float (min)() { return FLT_MIN; }
   EIGEN_DEVICE_FUNC
   static float infinity() { return CUDART_INF_F; }
+  EIGEN_DEVICE_FUNC
+  static float quiet_NaN() { return CUDART_NAN_F; }
 };
 template<> struct numeric_limits<double>
 {
@@ -170,6 +173,8 @@ template<> struct numeric_limits<double>
   static double (min)() { return DBL_MIN; }
   EIGEN_DEVICE_FUNC
   static double infinity() { return CUDART_INF; }
+  EIGEN_DEVICE_FUNC
+  static double quiet_NaN() { return CUDART_NAN; }
 };
 template<> struct numeric_limits<int>
 {
diff --git a/test/array.cpp b/test/array.cpp
index 96aef31c7..a37874cc2 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -295,7 +295,6 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square());
   VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square());
   VERIFY_IS_APPROX(pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0)));
-  
 
   VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
@@ -305,6 +304,14 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 
   VERIFY_IS_APPROX(log10(m3), log(m3)/log(10));
 
+  // Smoke test to check any compilation issues
+  ArrayType m1_abs_p1 = m1.abs() + 1;
+  ArrayType m2_abs_p1 = m2.abs() + 1;
+  VERIFY_IS_APPROX(Eigen::igamma(m1_abs_p1, m2_abs_p1), Eigen::igamma(m1_abs_p1, m2_abs_p1));
+  VERIFY_IS_APPROX(Eigen::igammac(m1_abs_p1, m2_abs_p1), Eigen::igammac(m1_abs_p1, m2_abs_p1));
+  VERIFY_IS_APPROX(Eigen::igamma(m2_abs_p1, m1_abs_p1), Eigen::igamma(m2_abs_p1, m1_abs_p1));
+  VERIFY_IS_APPROX(Eigen::igammac(m2_abs_p1, m1_abs_p1), Eigen::igammac(m2_abs_p1, m1_abs_p1));
+
   // scalar by array division
   const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
   s1 += Scalar(tiny);
@@ -323,6 +330,44 @@ template<typename ArrayType> void array_real(const ArrayType& m)
                     std::numeric_limits<RealScalar>::infinity());
     VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)),
                     std::numeric_limits<RealScalar>::infinity());
+
+    Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)};
+    Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)};
+
+    // location i*6+j corresponds to a_s[i], x_s[j].
+    Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+    Scalar igamma_s[][6] = {
+      {0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
+      {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658,
+       9.999500016666262e-05, 1.0},
+      {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686,
+       7.522076445089201e-07, 1.0},
+      {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909,
+       4.166333347221828e-18, 1.0},
+      {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988,
+       0.9991370418689945, 1.0},
+      {0.0, 0.0, 0.0, 0.0, 0.0, 0.5013297751014064}};
+    Scalar igammac_s[][6] = {
+      {1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
+      {1.0, 0.36787944117144233, 0.22313016014842982,
+       0.018315638888734182, 0.9999000049998333, 0.0},
+      {1.0, 0.5724067044708798, 0.3916251762710878,
+       0.04601170568923136, 0.9999992477923555, 0.0},
+      {1.0, 0.9810118431238462, 0.9343575456215499,
+       0.4334701203667089, 1.0, 0.0},
+      {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+       3.7801620118431334e-07, 0.0008629581310054535, 0.0},
+      {1.0, 1.0, 1.0, 1.0, 1.0, 0.49867022490946517}};
+    for (int i = 0; i < 6; ++i) {
+      for (int j = 0; j < 6; ++j) {
+        //std::cout << numext::igamma(a_s[i], x_s[j]) << " vs. " << igamma_s[i][j] << std::endl;
+        //std::cout << numext::igammac(a_s[i], x_s[j]) << " c.vs. " <<
+        //igammac_s[i][j] << std::endl;
+        std::cout << a_s[i] << ", " << x_s[j] << std::endl;
+        VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
+        VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+      }
+    }
   }
 #endif  // EIGEN_HAS_C99_MATH
 

From 0b9e0abc96d5c0367ee6c443f71754637b0db7e4 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@gmail.com>
Date: Fri, 4 Mar 2016 21:12:10 -0800
Subject: [PATCH 2/4] Make igamma and igammac work correctly.

This required replacing ::abs with std::abs.
Modified some unit tests.
---
 Eigen/src/Core/SpecialFunctions.h | 119 +++++++++++++++++++++---------
 test/array.cpp                    |  64 ++++++++--------
 2 files changed, 119 insertions(+), 64 deletions(-)

diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index ff2146afc..4a61325d4 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -296,7 +296,8 @@ struct digamma_impl {
     if (x <= zero) {
       negative = one;
       q = x;
-      p = ::floor(q);
+      using std::floor;
+      p = floor(q);
       if (p == q) {
         return maxnum;
       }
@@ -309,7 +310,8 @@ struct digamma_impl {
           p += one;
           nz = q - p;
         }
-        nz = m_pi / ::tan(m_pi * nz);
+        using std::tan;
+        nz = m_pi / tan(m_pi * nz);
       }
       else {
         nz = zero;
@@ -327,7 +329,8 @@ struct digamma_impl {
 
     y = digamma_impl_maybe_poly<Scalar>::run(s);
 
-    y = ::log(s) - (half / s) - y - w;
+    using std::log;
+    y = log(s) - (half / s) - y - w;
 
     return (negative) ? y - nz : y;
   }
@@ -426,6 +429,39 @@ struct igammac_impl {
 
 template <typename Scalar> struct igamma_impl;  // predeclare igamma_impl
 
+template <typename Scalar>
+struct igamma_helper {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
+};
+
+template <>
+struct igamma_helper<float> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static float machep() {
+    return NumTraits<float>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static float big() {
+    // use epsneg (1.0 - epsneg == 1.0)
+    return 1.0 / (NumTraits<float>::epsilon() / 2);
+  }
+};
+
+template <>
+struct igamma_helper<double> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static double machep() {
+    return NumTraits<double>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static double big() {
+    return 1.0 / NumTraits<double>::epsilon();
+  }
+};
+
 template <typename Scalar>
 struct igammac_impl {
   EIGEN_DEVICE_FUNC
@@ -487,26 +523,35 @@ struct igammac_impl {
     const Scalar zero = 0;
     const Scalar one = 1;
     const Scalar two = 2;
-    const Scalar machep = NumTraits<Scalar>::epsilon();
+    const Scalar machep = igamma_helper<Scalar>::machep();
     const Scalar maxlog = ::log(NumTraits<Scalar>::highest());
-    const Scalar big = one / machep;
+    const Scalar big = igamma_helper<Scalar>::big();
+    const Scalar biginv = 1 / big;
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
     Scalar ans, ax, c, yc, r, t, y, z;
     Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
 
-    if ((x <= zero) || ( a <= zero)) {
-      return one;
+    if ((x < zero) || ( a <= zero)) {
+      // domain error
+      return nan;
     }
 
     if ((x < one) || (x < a)) {
       return (one - igamma_impl<Scalar>::run(a, x));
     }
 
-    ax = a * ::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if( ax < -maxlog ) {  // underflow
+    using std::isinf;
+    if ((isinf)(x)) return zero;
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    using std::log;
+    ax = a * log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (ax < -maxlog) {  // underflow
       return zero;
     }
-    ax = ::exp(ax);
+    using std::exp;
+    ax = exp(ax);
 
     // continued fraction
     y = one - a;
@@ -516,35 +561,36 @@ struct igammac_impl {
     qkm2 = x;
     pkm1 = x + one;
     qkm1 = z * x;
-    ans = pkm1/qkm1;
+    ans = pkm1 / qkm1;
 
+    using std::abs;
     do {
       c += one;
       y += one;
       z += two;
       yc = y * c;
-      pk = pkm1 * z  -  pkm2 * yc;
-      qk = qkm1 * z  -  qkm2 * yc;
-      if( qk != zero ) {
-        r = pk/qk;
-        t = ::abs( (ans - r)/r );
+      pk = pkm1 * z - pkm2 * yc;
+      qk = qkm1 * z - qkm2 * yc;
+      if (qk != zero) {
+        r = pk / qk;
+        t = abs((ans - r) / r);
         ans = r;
-      } else  {
+      } else {
         t = one;
       }
       pkm2 = pkm1;
       pkm1 = pk;
       qkm2 = qkm1;
       qkm1 = qk;
-      if (::abs(pk) > big) {
-        pkm2 *= machep;
-        pkm1 *= machep;
-        qkm2 *= machep;
-        qkm1 *= machep;
+      if (abs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
       }
-    } while( t > machep );
+    } while (t > machep);
 
-    return ( ans * ax );
+    return (ans * ax);
   }
 };
 
@@ -639,26 +685,31 @@ struct igamma_impl {
      */
     const Scalar zero = 0;
     const Scalar one = 1;
-    const Scalar machep = NumTraits<Scalar>::epsilon();
+    const Scalar machep = igamma_helper<Scalar>::machep();
     const Scalar maxlog = ::log(NumTraits<Scalar>::highest());
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
     double ans, ax, c, r;
 
-    if( (x <= zero) || ( a <= zero) ) {
-      return zero;
+    if (x == zero) return zero;
+
+    if ((x < zero) || ( a <= zero)) {  // domain error
+      return nan;
     }
 
-    if( (x > one) && (x > a ) ) {
-      return (one - igammac_impl<Scalar>::run(a,x));
+    if ((x > one) && (x > a)) {
+      return (one - igammac_impl<Scalar>::run(a, x));
     }
 
     /* Compute  x**a * exp(-x) / gamma(a)  */
-    ax = a * ::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if( ax < -maxlog ) {
+    using std::log;
+    ax = a * log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (ax < -maxlog) {
       // underflow
       return zero;
     }
-    ax = ::exp(ax);
+    using std::exp;
+    ax = exp(ax);
 
     /* power series */
     r = a;
@@ -669,9 +720,9 @@ struct igamma_impl {
       r += one;
       c *= x/r;
       ans += c;
-    } while( c/ans > machep );
+    } while (c/ans > machep);
 
-    return( ans * ax/a );
+    return (ans * ax / a);
   }
 };
 
diff --git a/test/array.cpp b/test/array.cpp
index a37874cc2..c61bfc8ed 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -331,41 +331,45 @@ template<typename ArrayType> void array_real(const ArrayType& m)
     VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)),
                     std::numeric_limits<RealScalar>::infinity());
 
-    Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)};
-    Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)};
+    Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+    Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
 
     // location i*6+j corresponds to a_s[i], x_s[j].
     Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-    Scalar igamma_s[][6] = {
-      {0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
-      {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658,
-       9.999500016666262e-05, 1.0},
-      {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686,
-       7.522076445089201e-07, 1.0},
-      {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909,
-       4.166333347221828e-18, 1.0},
-      {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988,
-       0.9991370418689945, 1.0},
-      {0.0, 0.0, 0.0, 0.0, 0.0, 0.5013297751014064}};
-    Scalar igammac_s[][6] = {
-      {1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
-      {1.0, 0.36787944117144233, 0.22313016014842982,
-       0.018315638888734182, 0.9999000049998333, 0.0},
-      {1.0, 0.5724067044708798, 0.3916251762710878,
-       0.04601170568923136, 0.9999992477923555, 0.0},
-      {1.0, 0.9810118431238462, 0.9343575456215499,
-       0.4334701203667089, 1.0, 0.0},
-      {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
-       3.7801620118431334e-07, 0.0008629581310054535, 0.0},
-      {1.0, 1.0, 1.0, 1.0, 1.0, 0.49867022490946517}};
+    Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                            {0.0, 0.6321205588285578, 0.7768698398515702,
+                             0.9816843611112658, 9.999500016666262e-05, 1.0},
+                            {0.0, 0.4275932955291202, 0.608374823728911,
+                             0.9539882943107686, 7.522076445089201e-07, 1.0},
+                            {0.0, 0.01898815687615381, 0.06564245437845008,
+                             0.5665298796332909, 4.166333347221828e-18, 1.0},
+                            {0.0, 0.9999780593618628, 0.9999899967080838,
+                             0.9999996219837988, 0.9991370418689945, 1.0},
+                            {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+    Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                             {1.0, 0.36787944117144233, 0.22313016014842982,
+                              0.018315638888734182, 0.9999000049998333, 0.0},
+                             {1.0, 0.5724067044708798, 0.3916251762710878,
+                              0.04601170568923136, 0.9999992477923555, 0.0},
+                             {1.0, 0.9810118431238462, 0.9343575456215499,
+                              0.4334701203667089, 1.0, 0.0},
+                             {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                              3.7801620118431334e-07, 0.0008629581310054535,
+                              0.0},
+                             {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
     for (int i = 0; i < 6; ++i) {
       for (int j = 0; j < 6; ++j) {
-        //std::cout << numext::igamma(a_s[i], x_s[j]) << " vs. " << igamma_s[i][j] << std::endl;
-        //std::cout << numext::igammac(a_s[i], x_s[j]) << " c.vs. " <<
-        //igammac_s[i][j] << std::endl;
-        std::cout << a_s[i] << ", " << x_s[j] << std::endl;
-        VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
-        VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+        if ((std::isnan)(igamma_s[i][j])) {
+          VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j])));
+        } else {
+          VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
+        }
+
+        if ((std::isnan)(igammac_s[i][j])) {
+          VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j])));
+        } else {
+          VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+        }
       }
     }
   }

From 5707004d6b947c202085c3ead889e277264ea36a Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@gmail.com>
Date: Mon, 7 Mar 2016 14:08:56 -0800
Subject: [PATCH 3/4] Fix Eigen's building of sharded tests that use CUDA &
 more igamma/igammac bugfixes.

0. Prior to this PR, not a single sharded CUDA test was actually being *run*.
Fixed that.

GPU tests are still failing for igamma/igammac.

1. Add calls for igamma/igammac to TensorBase
2. Fix up CUDA-specific calls of igamma/igammac
3. Add unit tests for digamma, igamma, igammac in CUDA.
---
 Eigen/src/Core/GenericPacketMath.h            |   4 +-
 Eigen/src/Core/arch/CUDA/MathFunctions.h      |  34 ++-
 Eigen/src/Core/arch/CUDA/PacketMath.h         |   1 -
 cmake/EigenTesting.cmake                      |   7 +-
 .../Eigen/CXX11/src/Tensor/TensorBase.h       |  15 ++
 unsupported/test/CMakeLists.txt               |  16 +-
 unsupported/test/cxx11_tensor_cuda.cu         | 205 ++++++++++++++++++
 7 files changed, 268 insertions(+), 14 deletions(-)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index ead0253df..802def51d 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -460,11 +460,11 @@ template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
 
 /** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
 
 /***************************************************************************
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
index 6e84d3af8..6822700f8 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -116,24 +116,42 @@ double2 perfc<double2>(const double2& a)
   return make_double2(erfc(a.x), erfc(a.y));
 }
 
+
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 pigamma<float4>(const float4& a, const float4& x)
 {
-  using numext::pigamma;
+  using numext::igamma;
   return make_float4(
-    pigamma(a.x, x.x),
-    pigamma(a.y, x.y),
-    pigamma(a.z, x.z),
-    pigamma(a.w, x.w));
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigammac<double2>(const double2& a, const double& x)
+double2 pigamma<double2>(const double2& a, const double2& x)
 {
-  using numext::pigammac;
-  return make_double2(pigammac(a.x, x.x), pigammac(a.y, x.y));
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
 
 #endif
 
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index d2563030b..25d964600 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -284,7 +284,6 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
   return make_double2(fabs(a.x), fabs(a.y));
 }
 
-
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 5022397a7..5ca800cfe 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -19,12 +19,15 @@ macro(ei_add_test_internal testname testname_with_suffix)
   endif()
   
   if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
-    cuda_add_executable(${targetname} ${filename})
+    if (${ARGC} GREATER 2)
+      cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+    else()
+      cuda_add_executable(${targetname} ${filename})
+    endif()
   else()
     add_executable(${targetname} ${filename})
   endif()
   
-  
   if (targetname MATCHES "^eigen2_")
     add_dependencies(eigen2_buildtests ${targetname})
   else()
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 4dea1d3a0..aa67b9811 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -315,12 +315,27 @@ class TensorBase<Derived, ReadOnlyAccessors>
     operator==(const OtherDerived& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
     }
+
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
     operator!=(const OtherDerived& other) const {
       return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
     }
 
+    // igamma(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
+    igamma(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
+    }
+
+    // igammac(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
+    igammac(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>());
+    }
+
     // comparisons and tests for Scalars
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index c202cf0e4..17f83915b 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -1,3 +1,17 @@
+# generate split test header file only if it does not yet exist
+# in order to prevent a rebuild everytime cmake is configured
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)  
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
+  foreach(i RANGE 1 999)
+    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
+      "#ifdef EIGEN_TEST_PART_${i}\n"
+      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
+      "#else\n"
+      "#define CALL_SUBTEST_${i}(FUNC)\n"
+      "#endif\n\n"
+    )
+  endforeach()
+endif()
 
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
 add_custom_target(BuildUnsupported)
@@ -158,7 +172,7 @@ endif()
 # These tests needs nvcc
 find_package(CUDA 7.0)
 if(CUDA_FOUND)
-  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+#  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
   endif()
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 58da21d3b..348271e4b 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -574,6 +574,195 @@ void test_cuda_lgamma(const Scalar stddev)
   cudaFree(d_out);
 }
 
+template <typename Scalar>
+void test_cuda_digamma()
+{
+  Tensor<Scalar, 1> in(7);
+  Tensor<Scalar, 1> out(7);
+  Tensor<Scalar, 1> expected_out(7);
+  out.setZero();
+
+  in(0) = Scalar(1);
+  in(1) = Scalar(1.5);
+  in(2) = Scalar(4);
+  in(3) = Scalar(-10.5);
+  in(4) = Scalar(10000.5);
+  in(5) = Scalar(0);
+  in(6) = Scalar(-1);
+
+  expected_out(0) = Scalar(-0.5772156649015329);
+  expected_out(1) = Scalar(0.03648997397857645);
+  expected_out(2) = Scalar(1.2561176684318);
+  expected_out(3) = Scalar(2.398239129535781);
+  expected_out(4) = Scalar(9.210340372392849);
+  expected_out(5) = std::numeric_limits<Scalar>::infinity();
+  expected_out(6) = std::numeric_limits<Scalar>::infinity();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+  gpu_out.device(gpu_device) = gpu_in.digamma();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+  for (int i = 5; i < 7; ++i) {
+    VERIFY_IS_EQUAL(out(i), expected_out(i));
+  }
+}
+
+template <typename Scalar>
+void test_cuda_igamma()
+{
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];
+      x(i, j) = x_s[i];
+    }
+  }
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                          {0.0, 0.6321205588285578, 0.7768698398515702,
+                           0.9816843611112658, 9.999500016666262e-05, 1.0},
+                          {0.0, 0.4275932955291202, 0.608374823728911,
+                           0.9539882943107686, 7.522076445089201e-07, 1.0},
+                          {0.0, 0.01898815687615381, 0.06564245437845008,
+                           0.5665298796332909, 4.166333347221828e-18, 1.0},
+                          {0.0, 0.9999780593618628, 0.9999899967080838,
+                           0.9999996219837988, 0.9991370418689945, 1.0},
+                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+
+
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_a), bytes);
+  cudaMalloc((void**)(&d_x), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igamma_s[i][j])) {
+        printf("got: %g\n", out(i, j));
+        //VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+      }
+    }
+  }
+}
+
+template <typename Scalar>
+void test_cuda_igammac()
+{
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];
+      x(i, j) = x_s[i];
+    }
+  }
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                           {1.0, 0.36787944117144233, 0.22313016014842982,
+                            0.018315638888734182, 0.9999000049998333, 0.0},
+                           {1.0, 0.5724067044708798, 0.3916251762710878,
+                            0.04601170568923136, 0.9999992477923555, 0.0},
+                           {1.0, 0.9810118431238462, 0.9343575456215499,
+                            0.4334701203667089, 1.0, 0.0},
+                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                            3.7801620118431334e-07, 0.0008629581310054535,
+                            0.0},
+                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_a), bytes);
+  cudaMalloc((void**)(&d_x), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
+
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igammac_s[i][j])) {
+        printf("got: %g\n", out(i, j));
+        //VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+      }
+    }
+  }
+}
+
 template <typename Scalar>
 void test_cuda_erf(const Scalar stddev)
 {
@@ -667,30 +856,46 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
   CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
   CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
+
   CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
   CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
   CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
   CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_digamma<float>());
+
   CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
   CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
   CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
   CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
+
   CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
   // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
   CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
   CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
   CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
+
   CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
   CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
   CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
   CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
+
+  CALL_SUBTEST_4(test_cuda_digamma<double>());
+
   CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
   CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
   CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
   CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
+
   CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
   // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
   CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
   CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
   CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
+
+  CALL_SUBTEST_5(test_cuda_igamma<float>());
+  CALL_SUBTEST_5(test_cuda_igammac<float>());
+
+  CALL_SUBTEST_5(test_cuda_igamma<double>());
+  CALL_SUBTEST_5(test_cuda_igammac<double>());
 }

From 0bb5de05a131e955190e9a408bdc0e00b1929745 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@gmail.com>
Date: Mon, 7 Mar 2016 15:35:09 -0800
Subject: [PATCH 4/4] Finishing touches on igamma/igammac for GPU.  Tests now
 pass.

---
 Eigen/src/Core/SpecialFunctions.h     | 22 ++++++++++++----------
 unsupported/test/cxx11_tensor_cuda.cu | 12 ++++--------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index 4a61325d4..567c02c61 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -520,14 +520,16 @@ struct igammac_impl {
       Copyright 1985, 1987, 1992 by Stephen L. Moshier
       Direct inquiries to 30 Frost Street, Cambridge, MA 02140
     */
+    using std::log;
     const Scalar zero = 0;
     const Scalar one = 1;
     const Scalar two = 2;
     const Scalar machep = igamma_helper<Scalar>::machep();
-    const Scalar maxlog = ::log(NumTraits<Scalar>::highest());
+    const Scalar maxlog = log(NumTraits<Scalar>::highest());
     const Scalar big = igamma_helper<Scalar>::big();
     const Scalar biginv = 1 / big;
     const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+    const Scalar inf = NumTraits<Scalar>::infinity();
 
     Scalar ans, ax, c, yc, r, t, y, z;
     Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
@@ -541,11 +543,9 @@ struct igammac_impl {
       return (one - igamma_impl<Scalar>::run(a, x));
     }
 
-    using std::isinf;
-    if ((isinf)(x)) return zero;
+    if (x == inf) return zero;  // std::isinf crashes on CUDA
 
     /* Compute  x**a * exp(-x) / gamma(a)  */
-    using std::log;
     ax = a * log(x) - x - lgamma_impl<Scalar>::run(a);
     if (ax < -maxlog) {  // underflow
       return zero;
@@ -564,7 +564,7 @@ struct igammac_impl {
     ans = pkm1 / qkm1;
 
     using std::abs;
-    do {
+    while (true) {
       c += one;
       y += one;
       z += two;
@@ -588,7 +588,8 @@ struct igammac_impl {
         qkm2 *= biginv;
         qkm1 *= biginv;
       }
-    } while (t > machep);
+      if (t <= machep) break;
+    }
 
     return (ans * ax);
   }
@@ -683,10 +684,11 @@ struct igamma_impl {
      *          k=0   | (a+k+1)
      *
      */
+    using std::log;
     const Scalar zero = 0;
     const Scalar one = 1;
     const Scalar machep = igamma_helper<Scalar>::machep();
-    const Scalar maxlog = ::log(NumTraits<Scalar>::highest());
+    const Scalar maxlog = log(NumTraits<Scalar>::highest());
     const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
     double ans, ax, c, r;
@@ -702,7 +704,6 @@ struct igamma_impl {
     }
 
     /* Compute  x**a * exp(-x) / gamma(a)  */
-    using std::log;
     ax = a * log(x) - x - lgamma_impl<Scalar>::run(a);
     if (ax < -maxlog) {
       // underflow
@@ -716,11 +717,12 @@ struct igamma_impl {
     c = one;
     ans = one;
 
-    do {
+    while (true) {
       r += one;
       c *= x/r;
       ans += c;
-    } while (c/ans > machep);
+      if (c/ans <= machep) break;
+    }
 
     return (ans * ax / a);
   }
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 348271e4b..1964d9e07 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -632,7 +632,6 @@ void test_cuda_igamma()
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
   Tensor<Scalar, 2> out(6, 6);
-  Tensor<Scalar, 2> expected_out(6, 6);
   out.setZero();
 
   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -641,7 +640,7 @@ void test_cuda_igamma()
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
       a(i, j) = a_s[i];
-      x(i, j) = x_s[i];
+      x(i, j) = x_s[j];
     }
   }
 
@@ -686,8 +685,7 @@ void test_cuda_igamma()
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
       if ((std::isnan)(igamma_s[i][j])) {
-        printf("got: %g\n", out(i, j));
-        //VERIFY((std::isnan)(out(i, j)));
+        VERIFY((std::isnan)(out(i, j)));
       } else {
         VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
       }
@@ -701,7 +699,6 @@ void test_cuda_igammac()
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
   Tensor<Scalar, 2> out(6, 6);
-  Tensor<Scalar, 2> expected_out(6, 6);
   out.setZero();
 
   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -710,7 +707,7 @@ void test_cuda_igammac()
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
       a(i, j) = a_s[i];
-      x(i, j) = x_s[i];
+      x(i, j) = x_s[j];
     }
   }
 
@@ -754,8 +751,7 @@ void test_cuda_igammac()
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
       if ((std::isnan)(igammac_s[i][j])) {
-        printf("got: %g\n", out(i, j));
-        //VERIFY((std::isnan)(out(i, j)));
+        VERIFY((std::isnan)(out(i, j)));
       } else {
         VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
       }