diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 72234288e..4f1ff6b03 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -65,6 +65,7 @@ struct default_packet_traits HasCmp = 0, HasDiv = 0, + HasReciprocal = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -816,13 +817,6 @@ Packet plog2(const Packet& a) { template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt(const Packet& a) { return numext::sqrt(a); } -/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet prsqrt(const Packet& a) { - typedef typename internal::unpacket_traits::type Scalar; - return pdiv(pset1(Scalar(1)), psqrt(a)); -} - /** \internal \returns the rounded value of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pround(const Packet& a) { using numext::round; return round(a); } @@ -1035,6 +1029,19 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } +/** \internal \returns 1 / a (coeff-wise) */ +template +EIGEN_DEVICE_FUNC inline Packet preciprocal(const Packet& a) { + using Scalar = typename unpacket_traits::type; + return pdiv(pset1(Scalar(1)), a); +} + +/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet prsqrt(const Packet& a) { + return preciprocal(psqrt(a)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index 2c9bbb57c..182dd371e 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -17,6 +17,35 @@ namespace Eigen { namespace internal { +/** \internal Fast reciprocal using Newton-Raphson's method. + We assume that the starting guess provided in approx_a_recip has at least + half the leading mantissa bits in the correct result, such that a single + Newton-Raphson step is sufficient to get within 1-2 ulps of the currect result. +*/ +template +struct generic_reciprocal_newton_step { + static_assert(Steps > 0, "Steps must be at least 1."); + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet + run(const Packet& a, const Packet& approx_a_recip) { + using Scalar = typename unpacket_traits::type; + const Packet two = pset1(Scalar(2)); + const Packet neg_a = pnegate(a); + // Refine the approximation using one Newton-Raphson step: + // x_{i} = x_{i-1} * (2 - a * x_{i-1}) + const Packet x = + generic_reciprocal_newton_step::run(a, approx_a_recip); + return pmul(x, pmadd(neg_a, x, two)); + } +}; + +template +struct generic_reciprocal_newton_step { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet + run(const Packet& /*unused*/, const Packet& approx_a_recip) { + return approx_a_recip; + } +}; + /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 13/6-degree rational interpolant which is accurate up to a couple of ulps in the (approximate) range [-8, 8], diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index d517dffb7..9f61af634 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -166,19 +166,12 @@ Packet8f prsqrt(const Packet8f& _x) { return pselect(not_normal_finite_mask, y_approx, y_newton); } -#else -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& _x) { - EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x)); +template<> EIGEN_STRONG_INLINE Packet8f preciprocal(const Packet8f& a) { + return generic_reciprocal_newton_step::run(a, _mm256_rcp_ps(a)); } + #endif -template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d prsqrt(const Packet4d& _x) { - EIGEN_DECLARE_CONST_Packet4d(one, 1.0); - return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x)); -} F16_PACKET_FUNCTION(Packet8f, Packet8h, psin) F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos) @@ -190,6 +183,7 @@ F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp) F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh) F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt) F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt) +F16_PACKET_FUNCTION(Packet8f, Packet8h, preciprocal) template <> EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) { @@ -214,6 +208,7 @@ BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt) BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, preciprocal) template <> EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) { diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 6b20da6ae..bf832c9c2 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -78,6 +78,7 @@ template<> struct packet_traits : default_packet_traits HasCmp = 1, HasDiv = 1, + HasReciprocal = EIGEN_FAST_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 54be6cfc4..6caed2d76 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -253,13 +253,6 @@ prsqrt(const Packet16f& _x) { // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx); } -#else - -template <> -EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { - EIGEN_DECLARE_CONST_Packet16f(one, 1.0f); - return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x)); -} #endif F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt) @@ -304,12 +297,17 @@ prsqrt(const Packet8d& _x) { // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx); } + +template<> EIGEN_STRONG_INLINE Packet16f preciprocal(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512ER + return _mm512_rcp28_ps(a)); #else -template <> -EIGEN_STRONG_INLINE Packet8d prsqrt(const Packet8d& x) { - EIGEN_DECLARE_CONST_Packet8d(one, 1.0f); - return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x)); + return generic_reciprocal_newton_step::run(a, _mm512_rcp14_ps(a)); +#endif } + +F16_PACKET_FUNCTION(Packet16f, Packet16h, preciprocal) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, preciprocal) #endif template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index ff0dd29b4..8a00c626c 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -120,6 +120,7 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, + HasReciprocal = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, #endif diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 5a063d358..10bc107e0 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -148,21 +148,13 @@ Packet4f prsqrt(const Packet4f& _x) { return pselect(not_normal_finite_mask, y_approx, y_newton); } -#else - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f prsqrt(const Packet4f& x) { - // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation. - return _mm_div_ps(pset1(1.0f), _mm_sqrt_ps(x)); -} - #endif -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d prsqrt(const Packet2d& x) { - return _mm_div_pd(pset1(1.0), _mm_sqrt_pd(x)); +template<> EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a) { + return generic_reciprocal_newton_step::run(a, _mm_rcp_ps(a)); } + // Hyperbolic Tangent function. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a843226ac..4de3d4707 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -136,6 +136,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, + HasReciprocal = EIGEN_FAST_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index d56aae5c9..e7bccafe0 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -723,13 +723,18 @@ struct scalar_inverse_op { EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; } template EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const - { return internal::pdiv(pset1(Scalar(1)),a); } + { return internal::preciprocal(a); } }; template struct functor_traits > { enum { PacketAccess = packet_traits::HasDiv, - Cost = scalar_div_cost::value + // If packet_traits::HasReciprocal then the Estimated cost is that + // of computing an approximation plus a single Newton-Raphson step, which + // consists of 1 pmul + 1 pmadd. + Cost = (packet_traits::HasReciprocal + ? 4 * NumTraits::MulCost + : scalar_div_cost::value) }; }; @@ -1033,7 +1038,7 @@ struct scalar_logistic_op { } }; -// TODO(rmlarsen): Enable the following on host when integer_packet is defined +// TODO(rmlarsen): Enable the following on host when integer_packet is defined // for the relevant packet types. #ifdef EIGEN_GPU_CC diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 220da03d4..69f5e79e5 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -58,10 +58,10 @@ struct compute_inverse_size4(data); - Packet4f L2_ = ploadt(data + stride*4); - Packet4f L3_ = ploadt(data + stride*8); - Packet4f L4_ = ploadt(data + stride*12); + Packet4f L1 = ploadt(data); + Packet4f L2 = ploadt(data + stride*4); + Packet4f L3 = ploadt(data + stride*8); + Packet4f L4 = ploadt(data + stride*12); // Four 2x2 sub-matrices of the input matrix // input = [[A, B], @@ -70,17 +70,17 @@ struct compute_inverse_size4(1.0f), det); + Packet4f rd = preciprocal(det); // Four sub-matrices of the inverse Packet4f iA, iB, iC, iD; diff --git a/test/packetmath.cpp b/test/packetmath.cpp index fe0a9f67c..455ecab09 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -28,6 +28,10 @@ inline T REF_DIV(const T& a, const T& b) { return a / b; } template +inline T REF_RECIPROCAL(const T& a) { + return T(1) / a; +} +template inline T REF_ABS_DIFF(const T& a, const T& b) { return a > b ? a - b : b - a; } @@ -464,9 +468,11 @@ void packetmath() { CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul); CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); - if (PacketTraits::HasNegate) CHECK_CWISE1(internal::negate, internal::pnegate); + CHECK_CWISE1_IF(PacketTraits::HasNegate, internal::negate, internal::pnegate); + CHECK_CWISE1_IF(PacketTraits::HasReciprocal, REF_RECIPROCAL, internal::preciprocal); CHECK_CWISE1(numext::conj, internal::pconj); + for (int offset = 0; offset < 3; ++offset) { for (int i = 0; i < PacketSize; ++i) ref[i] = data1[offset]; internal::pstore(data2, internal::pset1(data1[offset])); diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp index 86f057118..3eb061dff 100644 --- a/test/prec_inverse_4x4.cpp +++ b/test/prec_inverse_4x4.cpp @@ -19,9 +19,7 @@ template void inverse_permutation_4x4() { MatrixType m = PermutationMatrix<4>(indices); MatrixType inv = m.inverse(); - double error = double( (m*inv-MatrixType::Identity()).norm() / NumTraits::epsilon() ); - EIGEN_DEBUG_VAR(error) - VERIFY(error == 0.0); + VERIFY_IS_APPROX(m*inv, MatrixType::Identity()); std::next_permutation(indices.data(),indices.data()+4); } } diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index e6180420e..c1609f1bd 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -601,13 +601,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two( ScalarType(6.79019408009981274425e-9) }; const T eight = pset1(ScalarType(8.0)); - const T one = pset1(ScalarType(1)); const T neg_two = pset1(ScalarType(-2)); T x, x0, x1, z; x = psqrt(pmul(neg_two, plog(b))); x0 = psub(x, pdiv(plog(x), x)); - z = pdiv(one, x); + z = preciprocal(x); x1 = pmul( z, pselect( pcmp_lt(x, eight), diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 27c284514..ddc813242 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -130,7 +130,7 @@ static void test_3d() Tensor mat4(2,3,7); mat4 = mat2 * 3.14f; Tensor mat5(2,3,7); - mat5 = mat1.inverse().log(); + mat5 = (mat1 + mat1.constant(1)).inverse().log(); Tensor mat6(2,3,7); mat6 = mat2.pow(0.5f) * 3.14f; Tensor mat7(2,3,7); @@ -150,7 +150,7 @@ static void test_3d() for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat3(i,j,k), val + val); VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f); - VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val)); + VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/(val + 1))); VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f))); VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);