From 217d984abc2dd619d9ba0c585f27065e40380fe3 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 11 May 2016 10:22:15 -0700
Subject: [PATCH] Fixed a typo in my previous commit

---
 Eigen/src/Core/functors/UnaryFunctors.h      | 48 ++++++++++++++++++-
 test/CMakeLists.txt                          |  2 +-
 .../CXX11/src/Tensor/TensorReductionCuda.h   |  2 +-
 unsupported/test/cxx11_float16.cpp           | 11 +++++
 .../test/cxx11_tensor_of_float16_cuda.cu     |  3 ++
 5 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 488ebf1d2..3f7a635be 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -616,7 +616,53 @@ template <typename Scalar> struct scalar_tanh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); }
   template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); }
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& _x) const {
+    /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
+        Doesn't do anything fancy, just a 13/6-degree rational interpolant which
+        is accurate up to a couple of ulp in the range [-9, 9], outside of which
+        fl(tanh(x)) = +/-1. */
+
+    // Clamp the inputs to the range [-9, 9] since anything outside
+    // this range is +/-1.0f in single-precision.
+    const Packet plus_9 = pset1<Packet>(9.0);
+    const Packet minus_9 = pset1<Packet>(-9.0);
+    const Packet x = pmax(minus_9, pmin(plus_9, _x));
+
+    // The monomial coefficients of the numerator polynomial (odd).
+    const Packet alpha_1 = pset1<Packet>(4.89352455891786e-03);
+    const Packet alpha_3 = pset1<Packet>(6.37261928875436e-04);
+    const Packet alpha_5 = pset1<Packet>(1.48572235717979e-05);
+    const Packet alpha_7 = pset1<Packet>(5.12229709037114e-08);
+    const Packet alpha_9 = pset1<Packet>(-8.60467152213735e-11);
+    const Packet alpha_11 = pset1<Packet>(2.00018790482477e-13);
+    const Packet alpha_13 = pset1<Packet>(-2.76076847742355e-16);
+
+    // The monomial coefficients of the denominator polynomial (even).
+    const Packet beta_0 = pset1<Packet>(4.89352518554385e-03);
+    const Packet beta_2 = pset1<Packet>(2.26843463243900e-03);
+    const Packet beta_4 = pset1<Packet>(1.18534705686654e-04);
+    const Packet beta_6 = pset1<Packet>(1.19825839466702e-06);
+
+    // Since the polynomials are odd/even, we need x^2.
+    const Packet x2 = pmul(x, x);
+
+    // Evaluate the numerator polynomial p.
+    Packet p = pmadd(x2, alpha_13, alpha_11);
+    p = pmadd(x2, p, alpha_9);
+    p = pmadd(x2, p, alpha_7);
+    p = pmadd(x2, p, alpha_5);
+    p = pmadd(x2, p, alpha_3);
+    p = pmadd(x2, p, alpha_1);
+    p = pmul(x, p);
+
+    // Evaluate the denominator polynomial q.
+    Packet q = pmadd(x2, beta_6, beta_4);
+    q = pmadd(x2, q, beta_2);
+    q = pmadd(x2, q, beta_0);
+
+    // Divide the numerator by the denominator.
+    return pdiv(p, q);
+  }
 };
 template <typename Scalar>
 struct functor_traits<scalar_tanh_op<Scalar> >
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7bed6a45c..3c0f9b685 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -147,7 +147,7 @@ ei_add_test(nomalloc)
 ei_add_test(first_aligned)
 ei_add_test(nullary)
 ei_add_test(mixingtypes)
-ei_add_test(packetmath)
+ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
 ei_add_test(unalignedassert)
 ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index bbac88192..b433a14c9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -318,7 +318,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
       output[i] = reducer.initialize();
     }
-    _syncthreads();
+    __syncthreads();
   }
 
   for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp
index 9141c4820..e39a7f83c 100644
--- a/unsupported/test/cxx11_float16.cpp
+++ b/unsupported/test/cxx11_float16.cpp
@@ -88,6 +88,16 @@ void test_conversion()
 #endif
 }
 
+void test_numtraits()
+{
+  std::cout << "epsilon = " << NumTraits<Eigen::half>::epsilon() << std::endl;
+  std::cout << "highest = " << NumTraits<Eigen::half>::highest() << std::endl;
+  std::cout << "lowest = " << NumTraits<Eigen::half>::lowest() << std::endl;
+  std::cout << "infinity = " << NumTraits<Eigen::half>::infinity() << std::endl;
+  std::cout << "nan = " << NumTraits<Eigen::half>::quiet_NaN() << std::endl;
+
+}
+
 void test_arithmetic()
 {
   VERIFY_IS_EQUAL(float(half(2) + half(2)), 4);
@@ -185,6 +195,7 @@ void test_trigonometric_functions()
 void test_cxx11_float16()
 {
   CALL_SUBTEST(test_conversion());
+  CALL_SUBTEST(test_numtraits());
   CALL_SUBTEST(test_arithmetic());
   CALL_SUBTEST(test_comparison());
   CALL_SUBTEST(test_basic_functions());
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index dceac793e..0d73318a9 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -388,6 +388,9 @@ void test_cuda_forced_evals() {
 #endif
 
 
+
+
+
 void test_cxx11_tensor_of_float16_cuda() {
 #ifdef EIGEN_HAS_CUDA_FP16
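
For reference, the UnaryFunctors.h hunk above evaluates a 13/6-degree rational approximation of tanh(x) with Eigen's packet primitives (pset1, pmadd, pmul, pdiv). The stand-alone scalar sketch below shows the same computation with plain floats, which may be easier to follow; it is illustrative only and not part of the patch. The coefficients are copied from the diff, while the function name fast_tanh_sketch and the main() driver are invented for this example.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Same clamp + odd/even rational fraction as the packetOp in the patch,
// written with scalar floats and explicit Horner evaluation in x^2.
static float fast_tanh_sketch(float x) {
  // Clamp to [-9, 9]; outside this range fl(tanh(x)) is already +/-1.
  x = std::max(-9.0f, std::min(9.0f, x));
  const float x2 = x * x;

  // Numerator: odd polynomial alpha_1*x + alpha_3*x^3 + ... + alpha_13*x^13.
  float p = -2.76076847742355e-16f;       // alpha_13
  p = p * x2 + 2.00018790482477e-13f;     // alpha_11
  p = p * x2 - 8.60467152213735e-11f;     // alpha_9 (negative)
  p = p * x2 + 5.12229709037114e-08f;     // alpha_7
  p = p * x2 + 1.48572235717979e-05f;     // alpha_5
  p = p * x2 + 6.37261928875436e-04f;     // alpha_3
  p = p * x2 + 4.89352455891786e-03f;     // alpha_1
  p = p * x;

  // Denominator: even polynomial beta_0 + beta_2*x^2 + beta_4*x^4 + beta_6*x^6.
  float q = 1.19825839466702e-06f;        // beta_6
  q = q * x2 + 1.18534705686654e-04f;     // beta_4
  q = q * x2 + 2.26843463243900e-03f;     // beta_2
  q = q * x2 + 4.89352518554385e-03f;     // beta_0

  return p / q;
}

int main() {
  // Compare the rational approximation against std::tanh on a few points.
  for (float x = -3.0f; x <= 3.0f; x += 0.5f) {
    std::printf("x=%+.2f  rational=%+.8f  std::tanh=%+.8f\n",
                x, fast_tanh_sketch(x), std::tanh(x));
  }
  return 0;
}

The initial clamp is what keeps the result bounded: past |x| = 9 the rational fraction keeps growing, while tanh has already saturated at +/-1 in single precision, as the comment in the patch notes.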