From 7ea35bfa1c0b4950feae65d49c0e6f2cbf3691d9 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 3 Mar 2016 19:39:41 -0800 Subject: [PATCH 1/4] Initial implementation of igamma and igammac. --- Eigen/src/Core/GenericPacketMath.h | 10 + Eigen/src/Core/GlobalFunctions.h | 30 +++ Eigen/src/Core/NumTraits.h | 5 + Eigen/src/Core/SpecialFunctions.h | 291 +++++++++++++++++++++- Eigen/src/Core/arch/CUDA/MathFunctions.h | 18 ++ Eigen/src/Core/arch/CUDA/PacketMath.h | 4 + Eigen/src/Core/functors/BinaryFunctors.h | 49 ++++ Eigen/src/Core/util/ForwardDeclarations.h | 2 + Eigen/src/Core/util/Meta.h | 5 + test/array.cpp | 47 +++- 10 files changed, 459 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 02882bdea..ead0253df 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -78,6 +78,8 @@ struct default_packet_traits HasDiGamma = 0, HasErf = 0, HasErfc = 0, + HasIGamma = 0, + HasIGammac = 0, HasRound = 0, HasFloor = 0, @@ -457,6 +459,14 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } +/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } + +/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 396da8e71..7df0fdda9 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -129,6 +129,36 @@ namespace Eigen ); } + /** \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + */ + template + inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> + igamma(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) + { + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + + /** \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. + * + */ + template + inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> + igammac(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) + { + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + namespace internal { EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 6a596bb7d..7ddb4a867 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -95,6 +95,11 @@ template struct GenericNumTraits static inline T infinity() { return numext::numeric_limits::infinity(); } + + EIGEN_DEVICE_FUNC + static inline T quiet_NaN() { + return numext::numeric_limits::quiet_NaN(); + } }; template struct NumTraits : GenericNumTraits diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index b02ad9a1f..ff2146afc 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -283,7 +283,7 @@ struct digamma_impl { Scalar p, q, nz, s, w, y; bool negative; - const Scalar maxnum = numext::numeric_limits::infinity(); + const Scalar maxnum = NumTraits::infinity(); const Scalar m_pi = 3.14159265358979323846; negative = 0; @@ -401,6 +401,282 @@ struct erfc_impl { }; #endif // EIGEN_HAS_C99_MATH +/**************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral) * + ****************************************************************************/ + +template +struct igammac_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template struct igamma_impl; // predeclare igamma_impl + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - igam(a,x) + * + * inf. + * - + * 1 | | -t a-1 + * = ----- | e t dt. + * - | | + * | (a) - + * x + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = NumTraits::epsilon(); + const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar big = one / machep; + + Scalar ans, ax, c, yc, r, t, y, z; + Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; + + if ((x <= zero) || ( a <= zero)) { + return one; + } + + if ((x < one) || (x < a)) { + return (one - igamma_impl::run(a, x)); + } + + ax = a * ::log(x) - x - lgamma_impl::run(a); + if( ax < -maxlog ) { // underflow + return zero; + } + ax = ::exp(ax); + + // continued fraction + y = one - a; + z = x + y + one; + c = zero; + pkm2 = one; + qkm2 = x; + pkm1 = x + one; + qkm1 = z * x; + ans = pkm1/qkm1; + + do { + c += one; + y += one; + z += two; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if( qk != zero ) { + r = pk/qk; + t = ::abs( (ans - r)/r ); + ans = r; + } else { + t = one; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::abs(pk) > big) { + pkm2 *= machep; + pkm1 *= machep; + qkm2 *= machep; + qkm1 *= machep; + } + } while( t > machep ); + + return ( ans * ax ); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of igamma (incomplete gamma integral) * + ****************************************************************************/ + +template +struct igamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igam() + * Incomplete gamma integral + * + * + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + + /* left tail of incomplete gamma function: + * + * inf. k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = NumTraits::epsilon(); + const Scalar maxlog = ::log(NumTraits::highest()); + + double ans, ax, c, r; + + if( (x <= zero) || ( a <= zero) ) { + return zero; + } + + if( (x > one) && (x > a ) ) { + return (one - igammac_impl::run(a,x)); + } + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * ::log(x) - x - lgamma_impl::run(a); + if( ax < -maxlog ) { + // underflow + return zero; + } + ax = ::exp(ax); + + /* power series */ + r = a; + c = one; + ans = one; + + do { + r += one; + c *= x/r; + ans += c; + } while( c/ans > machep ); + + return( ans * ax/a ); + } +}; + +#endif // EIGEN_HAS_C99_MATH + } // end namespace internal namespace numext { @@ -429,8 +705,21 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); } +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + } // end namespace numext + } // end namespace Eigen #endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index a2c06a817..6e84d3af8 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -116,6 +116,24 @@ double2 perfc(const double2& a) return make_double2(erfc(a.x), erfc(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma(const float4& a, const float4& x) +{ + using numext::pigamma; + return make_float4( + pigamma(a.x, x.x), + pigamma(a.y, x.y), + pigamma(a.z, x.z), + pigamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac(const double2& a, const double& x) +{ + using numext::pigammac; + return make_double2(pigammac(a.x, x.x), pigammac(a.y, x.y)); +} + #endif diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d3d9f910e..d2563030b 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -43,6 +43,8 @@ template<> struct packet_traits : default_packet_traits HasDiGamma = 1, HasErf = 1, HasErfc = 1, + HasIgamma = 1, + HasIGammac = 1, HasBlend = 0, }; @@ -67,6 +69,8 @@ template<> struct packet_traits : default_packet_traits HasDiGamma = 1, HasErf = 1, HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, HasBlend = 0, }; diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 4962d625c..5cdfff845 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -337,6 +337,55 @@ template<> struct functor_traits { }; }; +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template struct scalar_igamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigammac(a, x); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGamma + }; +}; + + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template struct scalar_igammac_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igammac; return igammac(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGammac + }; +}; //---------- binary functors bound to a constant, thus appearing as a unary functor ---------- diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index f09632375..a102e5457 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -206,6 +206,8 @@ template struct scalar_add_op; template struct scalar_constant_op; template struct scalar_identity_op; template struct scalar_sign_op; +template struct scalar_igamma_op; +template struct scalar_igammac_op; template struct scalar_product_op; template struct scalar_multiple2_op; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 6b35179f2..24e8a6d8a 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -148,6 +148,7 @@ template struct numeric_limits static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } static T infinity() { assert(false && "Infinity not supported for this type"); } + static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } }; template<> struct numeric_limits { @@ -159,6 +160,8 @@ template<> struct numeric_limits static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC static float infinity() { return CUDART_INF_F; } + EIGEN_DEVICE_FUNC + static float quiet_NaN() { return CUDART_NAN_F; } }; template<> struct numeric_limits { @@ -170,6 +173,8 @@ template<> struct numeric_limits static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC static double infinity() { return CUDART_INF; } + EIGEN_DEVICE_FUNC + static double quiet_NaN() { return CUDART_NAN; } }; template<> struct numeric_limits { diff --git a/test/array.cpp b/test/array.cpp index 96aef31c7..a37874cc2 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -295,7 +295,6 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(Eigen::pow(m1,2*exponents), m1.square().square()); VERIFY_IS_APPROX(m1.pow(2*exponents), m1.square().square()); VERIFY_IS_APPROX(pow(m1(0,0), exponents), ArrayType::Constant(rows,cols,m1(0,0)*m1(0,0))); - VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt()); VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt()); @@ -305,6 +304,14 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(log10(m3), log(m3)/log(10)); + // Smoke test to check any compilation issues + ArrayType m1_abs_p1 = m1.abs() + 1; + ArrayType m2_abs_p1 = m2.abs() + 1; + VERIFY_IS_APPROX(Eigen::igamma(m1_abs_p1, m2_abs_p1), Eigen::igamma(m1_abs_p1, m2_abs_p1)); + VERIFY_IS_APPROX(Eigen::igammac(m1_abs_p1, m2_abs_p1), Eigen::igammac(m1_abs_p1, m2_abs_p1)); + VERIFY_IS_APPROX(Eigen::igamma(m2_abs_p1, m1_abs_p1), Eigen::igamma(m2_abs_p1, m1_abs_p1)); + VERIFY_IS_APPROX(Eigen::igammac(m2_abs_p1, m1_abs_p1), Eigen::igammac(m2_abs_p1, m1_abs_p1)); + // scalar by array division const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); @@ -323,6 +330,44 @@ template void array_real(const ArrayType& m) std::numeric_limits::infinity()); VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; + + // location i*6+j corresponds to a_s[i], x_s[j]. + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igamma_s[][6] = { + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, + 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, + 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, + 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, + 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5013297751014064}}; + Scalar igammac_s[][6] = { + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49867022490946517}}; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + //std::cout << numext::igamma(a_s[i], x_s[j]) << " vs. " << igamma_s[i][j] << std::endl; + //std::cout << numext::igammac(a_s[i], x_s[j]) << " c.vs. " << + //igammac_s[i][j] << std::endl; + std::cout << a_s[i] << ", " << x_s[j] << std::endl; + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } + } } #endif // EIGEN_HAS_C99_MATH From 0b9e0abc96d5c0367ee6c443f71754637b0db7e4 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Fri, 4 Mar 2016 21:12:10 -0800 Subject: [PATCH 2/4] Make igamma and igammac work correctly. This required replacing ::abs with std::abs. Modified some unit tests. --- Eigen/src/Core/SpecialFunctions.h | 119 +++++++++++++++++++++--------- test/array.cpp | 64 ++++++++-------- 2 files changed, 119 insertions(+), 64 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index ff2146afc..4a61325d4 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -296,7 +296,8 @@ struct digamma_impl { if (x <= zero) { negative = one; q = x; - p = ::floor(q); + using std::floor; + p = floor(q); if (p == q) { return maxnum; } @@ -309,7 +310,8 @@ struct digamma_impl { p += one; nz = q - p; } - nz = m_pi / ::tan(m_pi * nz); + using std::tan; + nz = m_pi / tan(m_pi * nz); } else { nz = zero; @@ -327,7 +329,8 @@ struct digamma_impl { y = digamma_impl_maybe_poly::run(s); - y = ::log(s) - (half / s) - y - w; + using std::log; + y = log(s) - (half / s) - y - w; return (negative) ? y - nz : y; } @@ -426,6 +429,39 @@ struct igammac_impl { template struct igamma_impl; // predeclare igamma_impl +template +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static Scalar big() { assert(false && "big not supported for this type"); return 0.0; } +}; + +template <> +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static float machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static float big() { + // use epsneg (1.0 - epsneg == 1.0) + return 1.0 / (NumTraits::epsilon() / 2); + } +}; + +template <> +struct igamma_helper { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static double machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static double big() { + return 1.0 / NumTraits::epsilon(); + } +}; + template struct igammac_impl { EIGEN_DEVICE_FUNC @@ -487,26 +523,35 @@ struct igammac_impl { const Scalar zero = 0; const Scalar one = 1; const Scalar two = 2; - const Scalar machep = NumTraits::epsilon(); + const Scalar machep = igamma_helper::machep(); const Scalar maxlog = ::log(NumTraits::highest()); - const Scalar big = one / machep; + const Scalar big = igamma_helper::big(); + const Scalar biginv = 1 / big; + const Scalar nan = NumTraits::quiet_NaN(); Scalar ans, ax, c, yc, r, t, y, z; Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; - if ((x <= zero) || ( a <= zero)) { - return one; + if ((x < zero) || ( a <= zero)) { + // domain error + return nan; } if ((x < one) || (x < a)) { return (one - igamma_impl::run(a, x)); } - ax = a * ::log(x) - x - lgamma_impl::run(a); - if( ax < -maxlog ) { // underflow + using std::isinf; + if ((isinf)(x)) return zero; + + /* Compute x**a * exp(-x) / gamma(a) */ + using std::log; + ax = a * log(x) - x - lgamma_impl::run(a); + if (ax < -maxlog) { // underflow return zero; } - ax = ::exp(ax); + using std::exp; + ax = exp(ax); // continued fraction y = one - a; @@ -516,35 +561,36 @@ struct igammac_impl { qkm2 = x; pkm1 = x + one; qkm1 = z * x; - ans = pkm1/qkm1; + ans = pkm1 / qkm1; + using std::abs; do { c += one; y += one; z += two; yc = y * c; - pk = pkm1 * z - pkm2 * yc; - qk = qkm1 * z - qkm2 * yc; - if( qk != zero ) { - r = pk/qk; - t = ::abs( (ans - r)/r ); + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != zero) { + r = pk / qk; + t = abs((ans - r) / r); ans = r; - } else { + } else { t = one; } pkm2 = pkm1; pkm1 = pk; qkm2 = qkm1; qkm1 = qk; - if (::abs(pk) > big) { - pkm2 *= machep; - pkm1 *= machep; - qkm2 *= machep; - qkm1 *= machep; + if (abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; } - } while( t > machep ); + } while (t > machep); - return ( ans * ax ); + return (ans * ax); } }; @@ -639,26 +685,31 @@ struct igamma_impl { */ const Scalar zero = 0; const Scalar one = 1; - const Scalar machep = NumTraits::epsilon(); + const Scalar machep = igamma_helper::machep(); const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar nan = NumTraits::quiet_NaN(); double ans, ax, c, r; - if( (x <= zero) || ( a <= zero) ) { - return zero; + if (x == zero) return zero; + + if ((x < zero) || ( a <= zero)) { // domain error + return nan; } - if( (x > one) && (x > a ) ) { - return (one - igammac_impl::run(a,x)); + if ((x > one) && (x > a)) { + return (one - igammac_impl::run(a, x)); } /* Compute x**a * exp(-x) / gamma(a) */ - ax = a * ::log(x) - x - lgamma_impl::run(a); - if( ax < -maxlog ) { + using std::log; + ax = a * log(x) - x - lgamma_impl::run(a); + if (ax < -maxlog) { // underflow return zero; } - ax = ::exp(ax); + using std::exp; + ax = exp(ax); /* power series */ r = a; @@ -669,9 +720,9 @@ struct igamma_impl { r += one; c *= x/r; ans += c; - } while( c/ans > machep ); + } while (c/ans > machep); - return( ans * ax/a ); + return (ans * ax / a); } }; diff --git a/test/array.cpp b/test/array.cpp index a37874cc2..c61bfc8ed 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -331,41 +331,45 @@ template void array_real(const ArrayType& m) VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)), std::numeric_limits::infinity()); - Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; - Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(10000.5)}; + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; // location i*6+j corresponds to a_s[i], x_s[j]. Scalar nan = std::numeric_limits::quiet_NaN(); - Scalar igamma_s[][6] = { - {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, - {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, - 9.999500016666262e-05, 1.0}, - {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, - 7.522076445089201e-07, 1.0}, - {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, - 4.166333347221828e-18, 1.0}, - {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, - 0.9991370418689945, 1.0}, - {0.0, 0.0, 0.0, 0.0, 0.0, 0.5013297751014064}}; - Scalar igammac_s[][6] = { - {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, - {1.0, 0.36787944117144233, 0.22313016014842982, - 0.018315638888734182, 0.9999000049998333, 0.0}, - {1.0, 0.5724067044708798, 0.3916251762710878, - 0.04601170568923136, 0.9999992477923555, 0.0}, - {1.0, 0.9810118431238462, 0.9343575456215499, - 0.4334701203667089, 1.0, 0.0}, - {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, - 3.7801620118431334e-07, 0.0008629581310054535, 0.0}, - {1.0, 1.0, 1.0, 1.0, 1.0, 0.49867022490946517}}; + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { - //std::cout << numext::igamma(a_s[i], x_s[j]) << " vs. " << igamma_s[i][j] << std::endl; - //std::cout << numext::igammac(a_s[i], x_s[j]) << " c.vs. " << - //igammac_s[i][j] << std::endl; - std::cout << a_s[i] << ", " << x_s[j] << std::endl; - VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); - VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + } + + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } } } } From 5707004d6b947c202085c3ead889e277264ea36a Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 7 Mar 2016 14:08:56 -0800 Subject: [PATCH 3/4] Fix Eigen's building of sharded tests that use CUDA & more igamma/igammac bugfixes. 0. Prior to this PR, not a single sharded CUDA test was actually being *run*. Fixed that. GPU tests are still failing for igamma/igammac. 1. Add calls for igamma/igammac to TensorBase 2. Fix up CUDA-specific calls of igamma/igammac 3. Add unit tests for digamma, igamma, igammac in CUDA. --- Eigen/src/Core/GenericPacketMath.h | 4 +- Eigen/src/Core/arch/CUDA/MathFunctions.h | 34 ++- Eigen/src/Core/arch/CUDA/PacketMath.h | 1 - cmake/EigenTesting.cmake | 7 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 15 ++ unsupported/test/CMakeLists.txt | 16 +- unsupported/test/cxx11_tensor_cuda.cu | 205 ++++++++++++++++++ 7 files changed, 268 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index ead0253df..802def51d 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -460,11 +460,11 @@ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } /** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } /*************************************************************************** diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 6e84d3af8..6822700f8 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -116,24 +116,42 @@ double2 perfc(const double2& a) return make_double2(erfc(a.x), erfc(a.y)); } + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma(const float4& a, const float4& x) { - using numext::pigamma; + using numext::igamma; return make_float4( - pigamma(a.x, x.x), - pigamma(a.y, x.y), - pigamma(a.z, x.z), - pigamma(a.w, x.w)); + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double2 pigammac(const double2& a, const double& x) +double2 pigamma(const double2& a, const double2& x) { - using numext::pigammac; - return make_double2(pigammac(a.x, x.x), pigammac(a.y, x.y)); + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} #endif diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d2563030b..25d964600 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -284,7 +284,6 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { return make_double2(fabs(a.x), fabs(a.y)); } - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 5022397a7..5ca800cfe 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,12 +19,15 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - cuda_add_executable(${targetname} ${filename}) + if (${ARGC} GREATER 2) + cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + else() + cuda_add_executable(${targetname} ${filename}) + endif() else() add_executable(${targetname} ${filename}) endif() - if (targetname MATCHES "^eigen2_") add_dependencies(eigen2_buildtests ${targetname}) else() diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 4dea1d3a0..aa67b9811 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -315,12 +315,27 @@ class TensorBase operator==(const OtherDerived& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator!=(const OtherDerived& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + // comparisons and tests for Scalars EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index c202cf0e4..17f83915b 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -1,3 +1,17 @@ +# generate split test header file only if it does not yet exist +# in order to prevent a rebuild everytime cmake is configured +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "") + foreach(i RANGE 1 999) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h + "#ifdef EIGEN_TEST_PART_${i}\n" + "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n" + "#else\n" + "#define CALL_SUBTEST_${i}(FUNC)\n" + "#endif\n\n" + ) + endforeach() +endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported") add_custom_target(BuildUnsupported) @@ -158,7 +172,7 @@ endif() # These tests needs nvcc find_package(CUDA 7.0) if(CUDA_FOUND) - set(CUDA_PROPAGATE_HOST_FLAGS OFF) +# set(CUDA_PROPAGATE_HOST_FLAGS OFF) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) endif() diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 58da21d3b..348271e4b 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -574,6 +574,195 @@ void test_cuda_lgamma(const Scalar stddev) cudaFree(d_out); } +template +void test_cuda_digamma() +{ + Tensor in(7); + Tensor out(7); + Tensor expected_out(7); + out.setZero(); + + in(0) = Scalar(1); + in(1) = Scalar(1.5); + in(2) = Scalar(4); + in(3) = Scalar(-10.5); + in(4) = Scalar(10000.5); + in(5) = Scalar(0); + in(6) = Scalar(-1); + + expected_out(0) = Scalar(-0.5772156649015329); + expected_out(1) = Scalar(0.03648997397857645); + expected_out(2) = Scalar(1.2561176684318); + expected_out(3) = Scalar(2.398239129535781); + expected_out(4) = Scalar(9.210340372392849); + expected_out(5) = std::numeric_limits::infinity(); + expected_out(6) = std::numeric_limits::infinity(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 7); + Eigen::TensorMap > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in.digamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 5; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + for (int i = 5; i < 7; ++i) { + VERIFY_IS_EQUAL(out(i), expected_out(i)); + } +} + +template +void test_cuda_igamma() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + Tensor expected_out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[i]; + } + } + + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + + + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + printf("got: %g\n", out(i, j)); + //VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); + } + } + } +} + +template +void test_cuda_igammac() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + Tensor expected_out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[i]; + } + } + + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igammac_s[i][j])) { + printf("got: %g\n", out(i, j)); + //VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); + } + } + } +} + template void test_cuda_erf(const Scalar stddev) { @@ -667,30 +856,46 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_3(test_cuda_convolution_2d()); CALL_SUBTEST_3(test_cuda_convolution_3d()); CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_4(test_cuda_lgamma(1.0f)); CALL_SUBTEST_4(test_cuda_lgamma(100.0f)); CALL_SUBTEST_4(test_cuda_lgamma(0.01f)); CALL_SUBTEST_4(test_cuda_lgamma(0.001f)); + + CALL_SUBTEST_4(test_cuda_digamma()); + CALL_SUBTEST_4(test_cuda_erf(1.0f)); CALL_SUBTEST_4(test_cuda_erf(100.0f)); CALL_SUBTEST_4(test_cuda_erf(0.01f)); CALL_SUBTEST_4(test_cuda_erf(0.001f)); + CALL_SUBTEST_4(test_cuda_erfc(1.0f)); // CALL_SUBTEST(test_cuda_erfc(100.0f)); CALL_SUBTEST_4(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST_4(test_cuda_erfc(0.01f)); CALL_SUBTEST_4(test_cuda_erfc(0.001f)); + CALL_SUBTEST_4(test_cuda_lgamma(1.0)); CALL_SUBTEST_4(test_cuda_lgamma(100.0)); CALL_SUBTEST_4(test_cuda_lgamma(0.01)); CALL_SUBTEST_4(test_cuda_lgamma(0.001)); + + CALL_SUBTEST_4(test_cuda_digamma()); + CALL_SUBTEST_4(test_cuda_erf(1.0)); CALL_SUBTEST_4(test_cuda_erf(100.0)); CALL_SUBTEST_4(test_cuda_erf(0.01)); CALL_SUBTEST_4(test_cuda_erf(0.001)); + CALL_SUBTEST_4(test_cuda_erfc(1.0)); // CALL_SUBTEST(test_cuda_erfc(100.0)); CALL_SUBTEST_4(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs CALL_SUBTEST_4(test_cuda_erfc(0.01)); CALL_SUBTEST_4(test_cuda_erfc(0.001)); + + CALL_SUBTEST_5(test_cuda_igamma()); + CALL_SUBTEST_5(test_cuda_igammac()); + + CALL_SUBTEST_5(test_cuda_igamma()); + CALL_SUBTEST_5(test_cuda_igammac()); } From 0bb5de05a131e955190e9a408bdc0e00b1929745 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 7 Mar 2016 15:35:09 -0800 Subject: [PATCH 4/4] Finishing touches on igamma/igammac for GPU. Tests now pass. --- Eigen/src/Core/SpecialFunctions.h | 22 ++++++++++++---------- unsupported/test/cxx11_tensor_cuda.cu | 12 ++++-------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 4a61325d4..567c02c61 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -520,14 +520,16 @@ struct igammac_impl { Copyright 1985, 1987, 1992 by Stephen L. Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ + using std::log; const Scalar zero = 0; const Scalar one = 1; const Scalar two = 2; const Scalar machep = igamma_helper::machep(); - const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar maxlog = log(NumTraits::highest()); const Scalar big = igamma_helper::big(); const Scalar biginv = 1 / big; const Scalar nan = NumTraits::quiet_NaN(); + const Scalar inf = NumTraits::infinity(); Scalar ans, ax, c, yc, r, t, y, z; Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; @@ -541,11 +543,9 @@ struct igammac_impl { return (one - igamma_impl::run(a, x)); } - using std::isinf; - if ((isinf)(x)) return zero; + if (x == inf) return zero; // std::isinf crashes on CUDA /* Compute x**a * exp(-x) / gamma(a) */ - using std::log; ax = a * log(x) - x - lgamma_impl::run(a); if (ax < -maxlog) { // underflow return zero; @@ -564,7 +564,7 @@ struct igammac_impl { ans = pkm1 / qkm1; using std::abs; - do { + while (true) { c += one; y += one; z += two; @@ -588,7 +588,8 @@ struct igammac_impl { qkm2 *= biginv; qkm1 *= biginv; } - } while (t > machep); + if (t <= machep) break; + } return (ans * ax); } @@ -683,10 +684,11 @@ struct igamma_impl { * k=0 | (a+k+1) * */ + using std::log; const Scalar zero = 0; const Scalar one = 1; const Scalar machep = igamma_helper::machep(); - const Scalar maxlog = ::log(NumTraits::highest()); + const Scalar maxlog = log(NumTraits::highest()); const Scalar nan = NumTraits::quiet_NaN(); double ans, ax, c, r; @@ -702,7 +704,6 @@ struct igamma_impl { } /* Compute x**a * exp(-x) / gamma(a) */ - using std::log; ax = a * log(x) - x - lgamma_impl::run(a); if (ax < -maxlog) { // underflow @@ -716,11 +717,12 @@ struct igamma_impl { c = one; ans = one; - do { + while (true) { r += one; c *= x/r; ans += c; - } while (c/ans > machep); + if (c/ans <= machep) break; + } return (ans * ax / a); } diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 348271e4b..1964d9e07 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -632,7 +632,6 @@ void test_cuda_igamma() Tensor a(6, 6); Tensor x(6, 6); Tensor out(6, 6); - Tensor expected_out(6, 6); out.setZero(); Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; @@ -641,7 +640,7 @@ void test_cuda_igamma() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { a(i, j) = a_s[i]; - x(i, j) = x_s[i]; + x(i, j) = x_s[j]; } } @@ -686,8 +685,7 @@ void test_cuda_igamma() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { if ((std::isnan)(igamma_s[i][j])) { - printf("got: %g\n", out(i, j)); - //VERIFY((std::isnan)(out(i, j))); + VERIFY((std::isnan)(out(i, j))); } else { VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); } @@ -701,7 +699,6 @@ void test_cuda_igammac() Tensor a(6, 6); Tensor x(6, 6); Tensor out(6, 6); - Tensor expected_out(6, 6); out.setZero(); Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; @@ -710,7 +707,7 @@ void test_cuda_igammac() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { a(i, j) = a_s[i]; - x(i, j) = x_s[i]; + x(i, j) = x_s[j]; } } @@ -754,8 +751,7 @@ void test_cuda_igammac() for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { if ((std::isnan)(igammac_s[i][j])) { - printf("got: %g\n", out(i, j)); - //VERIFY((std::isnan)(out(i, j))); + VERIFY((std::isnan)(out(i, j))); } else { VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); }