From e70ffef9678f86ef465e93b89351e812ab47311d Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 8 Jan 2019 16:26:31 -0800 Subject: [PATCH 1/7] Optimize evalShardedByInnerDim --- .../src/Tensor/TensorContractionThreadPool.h | 197 ++++++++++++++---- 1 file changed, 161 insertions(+), 36 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 3946e2fc4..9666bf167 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -756,6 +756,36 @@ struct TensorEvaluator + EIGEN_STRONG_INLINE void addAllToBuffer(size_t n, const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) const { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + // Decide whether we want to shard m x k x n contraction over the inner // (contraction) dimension (k). static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, @@ -788,50 +818,145 @@ struct TensorEvaluatorm_i_size; const Index n = this->m_j_size; const Index k = this->m_k_size; - const Index packet_size = internal::packet_traits::size; - const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + + // We will compute partial results into the buffers of this size. + const Index buffer_size_bytes = m * n * sizeof(Scalar); + // The underlying GEMM kernel assumes that k is a multiple of // the packet size and subtle breakage occurs if this is violated. - Index block_size = kmultiple * divup(k, kmultiple * num_threads); - Index num_blocks = divup(k, block_size); - // we use 'result' for the first block's partial result. - MaxSizeVector block_buffers(num_blocks - 1); - Barrier barrier(internal::convert_index(num_blocks)); - auto process_block = [=, &barrier](Scalar* buf, Index begin, Index end) { - ::memset(buf, 0, m * n * sizeof(Scalar)); + const Index packet_size = internal::packet_traits::size; + + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + return divup(index, kmultiple) * kmultiple; + }; + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs and + // gemm_pack_rhs, so we specify minimum desired block size. + const Index target_block_size = round_up(divup(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + const Index block_size = numext::mini( + k, numext::maxi(desired_min_block_size, target_block_size)); + const Index num_blocks = divup(k, block_size); + + // Compute block size with accounting for potentially incomplete last block. + const auto actual_block_size = [=](Index block_idx) -> Index { + return block_idx + 1 < num_blocks + ? block_size + : k + block_size - block_size * num_blocks; + }; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks finihes, we need to add only these pre-aggregated blocks. + + // Compute range size with accounting for potentially incomplete last range. + const auto actual_range_size = [=](Index num_ranges, Index range_size, + Index range_idx) -> Index { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + const Index l0_ranges = divup(num_blocks, l0_size); + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state(l0_ranges); + for (int i = 0; i < l0_ranges; ++i) { + l0_state.emplace_back(actual_range_size(l0_ranges, l0_size, i)); + } + + MaxSizeVector block_buffers(num_blocks); + + auto process_block = [&, this](Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + ::memset(buf, 0, buffer_size_bytes); + TENSOR_CONTRACTION_DISPATCH( this->template evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, this->m_device.numThreads())); - barrier.Notify(); + (buf, begin, end, /*num_threads=*/num_blocks)); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actual_range_size(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } }; - Index start = 0; - for (Index blocks_left = num_blocks; blocks_left > 0; --blocks_left) { - // The underlying GEMM kernel assumes that k is a multiple of packet size - // (currently largest packet size is 16) and subtle breakage occurs if - // this is violated. - block_size = kmultiple * divup(k - start, kmultiple * blocks_left); - Scalar* buf; - if (start == 0) { - buf = result; - } else { - buf = static_cast( - this->m_device.allocate(m * n * sizeof(Scalar))); - block_buffers.push_back(buf); - } - Index end = start + block_size; - if (end > k) { - end = k; - } - this->m_device.enqueueNoNotification( - [=, &process_block]() { process_block(buf, start, end); }); - start = end; + + Barrier barrier(internal::convert_index(num_blocks)); + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast( + this->m_device.allocate(buffer_size_bytes)); + block_buffers.push_back(buf); + + Index block_start = block_idx * block_size; + Index block_end = block_start + actual_block_size(block_idx); + + this->m_device.enqueueNoNotification([=, &barrier, &process_block]() { + process_block(block_idx, block_start, block_end); + barrier.Notify(); + }); } barrier.Wait(); - // Add other partial results into first partial result. - for (const auto& buf : block_buffers) { - addToBuffer(m * n, buf, result); - this->m_device.deallocate(buf); + // Aggregate partial sums from l0 ranges. + Index l0_index = 1; + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */block_buffers[0]); + } + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + + // Don't forget to deallocate ALL temporary buffers. + for (Index i = 1; i < num_blocks; ++i) { + this->m_device.deallocate(block_buffers[i]); } // Finally call output kernel with finalized output buffer. From e6b217b8ddf533de9bacc46aae2db6de78581056 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 15:25:17 +0100 Subject: [PATCH 2/7] bug #1652: implements a much more accurate version of vectorized sin/cos. This new version achieve same speed for SSE/AVX, and is slightly faster with FMA. Guarantees are as follows: - no FMA: 1ULP up to 3pi, 2ULP up to sin(25966) and cos(18838), fallback to std::sin/cos for larger inputs - FMA: 1ULP up to sin(117435.992) and cos(71476.0625), fallback to std::sin/cos for larger inputs --- Eigen/src/Core/GenericPacketMath.h | 27 +++- Eigen/src/Core/arch/AVX/PacketMath.h | 10 ++ .../arch/Default/GenericPacketMathFunctions.h | 152 +++++++++++------- Eigen/src/Core/arch/SSE/PacketMath.h | 11 ++ test/packetmath.cpp | 56 +++++-- 5 files changed, 181 insertions(+), 75 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 2b2ee9e2c..9fdd4a2ed 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -393,18 +393,39 @@ typename conditional<(unpacket_traits::size%8)==0,typename unpacket_trai predux_half_dowto4(const Packet& a) { return a; } -/** \internal \returns the product of the elements of \a a*/ +/** \internal \returns the product of the elements of \a a */ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) { return a; } -/** \internal \returns the min of the elements of \a a*/ +/** \internal \returns the min of the elements of \a a */ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) { return a; } -/** \internal \returns the max of the elements of \a a*/ +/** \internal \returns the max of the elements of \a a */ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) { return a; } +/** \internal \returns true if all coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +// not needed yet +// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) +// { return bool(a); } + +/** \internal \returns true if any coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) +{ + // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. + // It is expected that "true" is either: + // - Scalar(1) + // - bits full of ones (NaN for floats), + // - or first bit equals to 1 (1 for ints, smallest denormal for floats). + // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. + return bool(predux(a)); +} + /** \internal \returns the reversed elements of \a a*/ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e5aeb6375..ebea63757 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -575,6 +575,16 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) +// { +// return _mm256_movemask_ps(x)==0xFF; +// } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) +{ + return _mm256_movemask_ps(x)!=0; +} template struct palign_impl diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 7ceaea894..3c167247e 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -269,88 +269,118 @@ EIGEN_UNUSED Packet psincos_float(const Packet& _x) { typedef typename unpacket_traits::integer_packet PacketI; - const Packet cst_1 = pset1(1.0f); - const Packet cst_half = pset1(0.5f); - const PacketI csti_1 = pset1(1); - const PacketI csti_not1 = pset1(~1); - const PacketI csti_2 = pset1(2); - const PacketI csti_3 = pset1(3); - - const Packet cst_sign_mask = pset1frombits(0x80000000u); - - const Packet cst_minus_cephes_DP1 = pset1(-0.78515625f); - const Packet cst_minus_cephes_DP2 = pset1(-2.4187564849853515625e-4f); - const Packet cst_minus_cephes_DP3 = pset1(-3.77489497744594108e-8f); - const Packet cst_sincof_p0 = pset1(-1.9515295891E-4f); - const Packet cst_sincof_p1 = pset1( 8.3321608736E-3f); - const Packet cst_sincof_p2 = pset1(-1.6666654611E-1f); - const Packet cst_coscof_p0 = pset1( 2.443315711809948E-005f); - const Packet cst_coscof_p1 = pset1(-1.388731625493765E-003f); - const Packet cst_coscof_p2 = pset1( 4.166664568298827E-002f); - const Packet cst_cephes_FOPI = pset1( 1.27323954473516f); // 4 / M_PI + const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI + const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding + const PacketI csti_1 = pset1(1); + const Packet cst_sign_mask = pset1frombits(0x80000000u); Packet x = pabs(_x); - // Scale x by 4/Pi to find x's octant. - Packet y = pmul(x, cst_cephes_FOPI); + // Scale x by 2/Pi to find x's octant. + Packet y = pmul(x, cst_2oPI); - // Get the octant. We'll reduce x by this number of octants or by one more than it. - PacketI y_int = pcast(y); - // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. - // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. - // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). - PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...) - y = pcast(y_int1); + // Rounding trick: + Packet y_round = padd(y, cst_rounding_magic); + PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) + y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi // Compute the sign to apply to the polynomial. - // sign = third_bit(y_int1) xor signbit(_x) - Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<29>(y_int1))) - : preinterpret(pshiftleft<29>(padd(y_int1,csti_3))); + // sin: sign = second_bit(y_int) xor signbit(_x) + // cos: sign = second_bit(y_int+1) + Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<30>(y_int))) + : preinterpret(pshiftleft<30>(padd(y_int,csti_1))); sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit - // Get the polynomial selection mask from the second bit of y_int1 + // Get the polynomial selection mask from the second bit of y_int // We'll calculate both (sin and cos) polynomials and then select from the two. - Packet poly_mask = preinterpret(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1))); + Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. - // The magic pass: "Extended precision modular arithmetic" - // x = ((x - y * DP1) - y * DP2) - y * DP3 - x = pmadd(y, cst_minus_cephes_DP1, x); - x = pmadd(y, cst_minus_cephes_DP2, x); - x = pmadd(y, cst_minus_cephes_DP3, x); + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // using "Extended precision modular arithmetic" + #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) + // This version requires true FMA for high accuracy + // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): + const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; + x = pmadd(y, pset1(-1.57079601287841796875f), x); + x = pmadd(y, pset1(-3.1391647326017846353352069854736328125e-07f), x); + x = pmadd(y, pset1(-5.390302529957764765544681040410068817436695098876953125e-15f), x); + #else + // Without true FMA, the previous set of coefficients maintain 1ULP accuracy + // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7. + // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs. + + // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively. + // and 2 ULP up to: + const float huge_th = ComputeSine ? 25966.f : 18838.f; + x = pmadd(y, pset1(-1.5703125), x); // = 0xbfc90000 + x = pmadd(y, pset1(-0.000483989715576171875), x); // = 0xb9fdc000 + x = pmadd(y, pset1(1.62865035235881805419921875e-07), x); // = 0x342ee000 + x = pmadd(y, pset1(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee + + // For the record, the following set of coefficients maintain 2ULP up + // to a slightly larger range: + // const float huge_th = ComputeSine ? 51981.f : 39086.125f; + // but it slightly fails to maintain 1ULP for two values of sin below pi. + // x = pmadd(y, pset1(-3.140625/2.), x); + // x = pmadd(y, pset1(-0.00048351287841796875), x); + // x = pmadd(y, pset1(-3.13855707645416259765625e-07), x); + // x = pmadd(y, pset1(-6.0771006282767103812147979624569416046142578125e-11), x); + + // For the record, with only 3 iterations it is possible to maintain + // 1 ULP up to 3PI (maybe more) and 2ULP up to 255. + // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee + #endif + + Packet huge_mask = pcmp_le(pset1(huge_th),pabs(_x)); + Packet huge_vals; + if(predux_any(huge_mask)) + { + const int PacketSize = unpacket_traits::size; + #if EIGEN_HAS_CXX11 + alignas(Packet) float vals[PacketSize]; + #else + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; + #endif + pstoreu(vals, _x); + for(int k=0; k=huge_th) { + vals[k] = ComputeSine ? std::sin(val) : std::cos(val); + } + } + huge_vals = ploadu(vals); + } Packet x2 = pmul(x,x); - // Evaluate the cos(x) polynomial. (0 <= x <= Pi/4) - Packet y1 = cst_coscof_p0; - y1 = pmadd(y1, x2, cst_coscof_p1); - y1 = pmadd(y1, x2, cst_coscof_p2); - y1 = pmul(y1, x2); - y1 = pmul(y1, x2); - y1 = psub(y1, pmul(x2, cst_half)); - y1 = padd(y1, cst_1); + // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) + Packet y1 = pset1(2.4372266125283204019069671630859375e-05f); + y1 = pmadd(y1, x2, pset1(-0.00138865201734006404876708984375f )); + y1 = pmadd(y1, x2, pset1(0.041666619479656219482421875f )); + y1 = pmadd(y1, x2, pset1(-0.5f)); + y1 = pmadd(y1, x2, pset1(1.f)); - // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0) - Packet y2 = cst_sincof_p0; - y2 = pmadd(y2, x2, cst_sincof_p1); - y2 = pmadd(y2, x2, cst_sincof_p2); + // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) + // octave/matlab code to compute those coefficients: + // x = (0:0.0001:pi/4)'; + // A = [x.^3 x.^5 x.^7]; + // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy + // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 + // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) + // + Packet y2 = pset1(-0.0001959234114083702898469196984621021329076029360294342041015625f); + y2 = pmadd(y2, x2, pset1( 0.0083326873655616851693794799871284340042620897293090820312500000f)); + y2 = pmadd(y2, x2, pset1(-0.1666666203982298255503735617821803316473960876464843750000000000f)); y2 = pmul(y2, x2); y2 = pmadd(y2, x, x); - // Select the correct result from the two polynoms. + // Select the correct result from the two polynomials. y = ComputeSine ? pselect(poly_mask,y2,y1) : pselect(poly_mask,y1,y2); - // For very large arguments the the reduction to the [-Pi/4,+Pi/4] range - // does not work thus leading to sine/cosine out of the [-1:1] range. - // Since computing the sine/cosine for very large entry entries makes little - // sense in term of accuracy, we simply clamp to [-1,1]: - y = pmin(y,pset1( 1.f)); - y = pmax(y,pset1(-1.f)); - - // Update the sign - return pxor(y, sign_bit); + // Update the sign and filter huge inputs + return pselect(huge_mask, huge_vals, pxor(y, sign_bit)); } template diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3e7a75bc0..0003be43b 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -812,6 +812,17 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) +// { +// return _mm_movemask_ps(x) == 0xF; +// } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + return _mm_movemask_ps(x) != 0x0; +} + #if EIGEN_COMP_GNUC // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) // { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 1158c4f9a..22a24039a 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -568,17 +568,22 @@ template void packetmath_real() if(PacketTraits::HasCos) { packet_helper h; - for(Scalar k = 1; k::epsilon(); k*=2) { - data1[0] = k*Scalar(EIGEN_PI) * internal::random(0.8,1.2); - data1[1] = (k+1)*Scalar(EIGEN_PI) * internal::random(0.8,1.2); - h.store(data2, internal::pcos(h.load(data1))); - VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); - VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.)); - data1[0] = (2*k+1)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); - data1[1] = (2*k+3)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); - h.store(data2, internal::psin(h.load(data1))); - VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); - VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.)); + for(Scalar k = 1; k::epsilon(); k*=2) + { + for(int k1=0;k1<=1; ++k1) + { + data1[0] = (2*k+k1 )*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); + data1[1] = (2*k+2+k1)*Scalar(EIGEN_PI)/2 * internal::random(0.8,1.2); + h.store(data2, internal::pcos(h.load(data1))); + h.store(data2+PacketSize, internal::psin(h.load(data1))); + VERIFY(data2[0]<=Scalar(1.) && data2[0]>=Scalar(-1.)); + VERIFY(data2[1]<=Scalar(1.) && data2[1]>=Scalar(-1.)); + VERIFY(data2[PacketSize+0]<=Scalar(1.) && data2[PacketSize+0]>=Scalar(-1.)); + VERIFY(data2[PacketSize+1]<=Scalar(1.) && data2[PacketSize+1]>=Scalar(-1.)); + + VERIFY_IS_APPROX(numext::abs2(data2[0])+numext::abs2(data2[PacketSize+0]), Scalar(1)); + VERIFY_IS_APPROX(numext::abs2(data2[1])+numext::abs2(data2[PacketSize+1]), Scalar(1)); + } } data1[0] = std::numeric_limits::infinity(); @@ -596,6 +601,12 @@ template void packetmath_real() VERIFY((numext::isnan)(data2[0])); h.store(data2, internal::pcos(h.load(data1))); VERIFY((numext::isnan)(data2[0])); + + data1[0] = -Scalar(0.); + h.store(data2, internal::psin(h.load(data1))); + VERIFY( internal::biteq(data2[0], data1[0]) ); + h.store(data2, internal::pcos(h.load(data1))); + VERIFY_IS_EQUAL(data2[0], Scalar(1)); } } } @@ -633,6 +644,29 @@ template void packetmath_notcomplex() ref[i] = data1[0]+Scalar(i); internal::pstore(data2, internal::plset(data1[0])); VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset"); + + { + unsigned char* data1_bits = reinterpret_cast(data1); + // predux_all - not needed yet + // for (unsigned int i=0; i(data1)) && "internal::predux_all(1111)"); + // for(int k=0; k(data1))) && "internal::predux_all(0101)"); + // for (unsigned int i=0; i(data1))) && "internal::predux_any(0000)"); + for(int k=0; k(data1)) && "internal::predux_any(0101)"); + for (unsigned int i=0; i void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) From aeec68f77b61c2d9fb8323ee7951bff3458d5f3f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 15:36:41 +0100 Subject: [PATCH 3/7] Add missing pcmp_lt and others for AVX512 --- Eigen/src/Core/arch/AVX512/PacketMath.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 72b09d998..564eb97dc 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -283,9 +283,27 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { } #endif +template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { + __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); + return cat256(lo, hi); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { + __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); + return cat256(lo, hi); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); + return cat256(lo, hi); +} + template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = _mm256_cmp_ps(extract256<0>(a), extract256<0>(b), _CMP_NGE_UQ); - __m256 hi = _mm256_cmp_ps(extract256<1>(a), extract256<1>(b), _CMP_NGE_UQ); + __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); + __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); return cat256(lo, hi); } From 3f14e0d19e44d882b21b7c6b2370a22d2b15c7b9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 15:45:21 +0100 Subject: [PATCH 4/7] fix warning --- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 3c167247e..8c6e4f5c7 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -332,8 +332,11 @@ Packet psincos_float(const Packet& _x) // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee #endif - Packet huge_mask = pcmp_le(pset1(huge_th),pabs(_x)); - Packet huge_vals; + // We use huge_vals as a temporary for abs(_x) to ensure huge_vals + // is fully initialized for the last pselect(). (prevent compiler warning) + Packet huge_vals = pabs(_x); + Packet huge_mask = pcmp_le(pset1(huge_th),huge_vals); + if(predux_any(huge_mask)) { const int PacketSize = unpacket_traits::size; From 47810cf5b7286b03084b6ec2fb488c2f3eeddbcc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 16:40:42 +0100 Subject: [PATCH 5/7] Add dedicated implementations of predux_any for AVX512, NEON, and Altivec/VSE --- Eigen/src/Core/arch/AVX512/PacketMath.h | 7 +++++++ Eigen/src/Core/arch/AltiVec/PacketMath.h | 5 +++++ Eigen/src/Core/arch/NEON/PacketMath.h | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 564eb97dc..95eb9d42f 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -969,6 +969,13 @@ EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) +{ + Packet16i xi = _mm512_castps_si512(x); + __mmask16 tmp = _mm512_test_epi32_mask(xi,xi); + return !_mm512_kortestz(tmp,tmp); +} + template struct palign_impl { static EIGEN_STRONG_INLINE void run(Packet16f& first, diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index d0ee93f4a..9464264a8 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -720,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) return pfirst(res); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + return vec_any_ne(x, pzero(x)); +} + template struct palign_impl { diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ed3cec88a..8c3637258 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -551,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) return vget_lane_s32(max, 0); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), + vget_high_u32(vreinterpretq_u32_f32(x))); + return vget_lane_u32(vpmax_u32(tmp,tmp),0); +} + // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 #define PALIGN_NEON(Offset,Type,Command) \ From 3492a1ca74552ebfc4e4ed368ebdf2597f9b8452 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 16:53:37 +0100 Subject: [PATCH 6/7] fix plog(+inf) with AVX512 --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index aac707596..c2158c538 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -47,6 +47,7 @@ plog(const Packet16f& _x) { // The smallest non denormalized float number. _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); // Polynomial coefficients. @@ -116,10 +117,16 @@ plog(const Packet16f& _x) { x = padd(x, y); x = padd(x, y2); - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN, + // - 0 will be -INF. + // - +INF will be +INF return _mm512_mask_blend_ps(iszero_mask, - _mm512_mask_blend_ps(invalid_mask, x, p16f_nan), - p16f_minus_inf); + _mm512_mask_blend_ps(invalid_mask, + _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf), + p16f_nan), + p16f_minus_inf); } #endif From d812f411c3f99e93a774b80ed3772603303c6c59 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 9 Jan 2019 18:00:05 +0100 Subject: [PATCH 7/7] bug #1654: fix compilation with cuda and no c++11 --- Eigen/src/Core/util/StaticAssert.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 9 +++++---- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_gpu.cu | 8 +++++++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index b2f95153e..67714e444 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -104,7 +104,8 @@ STORAGE_INDEX_MUST_MATCH=1, CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, - INVALID_TEMPLATE_PARAMETER=1 + INVALID_TEMPLATE_PARAMETER=1, + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1 }; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index 056665749..5d19652e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -1219,9 +1219,6 @@ template, GpuDevice> : public TensorContractionEvaluatorBase, GpuDevice> > { - static_assert(std::is_same::value, - "GPU tensor contraction does not support output kernels."); - typedef GpuDevice Device; typedef TensorEvaluator, Device> Self; @@ -1274,7 +1271,11 @@ struct TensorEvaluator::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } // We need to redefine this method to make nvcc happy EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cda658e0e..e8e1dc832 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -258,7 +258,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") endif() - if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) + if(( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) AND EIGEN_TEST_CXX11) set(EIGEN_CUDA_CXX11_FLAG "-std=c++11") else() # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11) diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu index 14fc0bd04..94625e6a3 100644 --- a/unsupported/test/cxx11_tensor_gpu.cu +++ b/unsupported/test/cxx11_tensor_gpu.cu @@ -17,6 +17,8 @@ #include +#define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11 + using Eigen::Tensor; void test_gpu_nullary() { @@ -617,6 +619,7 @@ void test_gpu_convolution_3d() } +#if EIGEN_GPU_TEST_C99_MATH template void test_gpu_lgamma(const Scalar stddev) { @@ -655,6 +658,7 @@ void test_gpu_lgamma(const Scalar stddev) gpuFree(d_in); gpuFree(d_out); } +#endif template void test_gpu_digamma() @@ -986,6 +990,7 @@ void test_gpu_igammac() gpuFree(d_out); } +#if EIGEN_GPU_TEST_C99_MATH template void test_gpu_erf(const Scalar stddev) { @@ -1063,6 +1068,7 @@ void test_gpu_erfc(const Scalar stddev) gpuFree(d_in); gpuFree(d_out); } +#endif template void test_gpu_betainc() @@ -1494,7 +1500,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_gpu) CALL_SUBTEST_3(test_gpu_convolution_3d()); #endif -#if __cplusplus > 199711L +#if EIGEN_GPU_TEST_C99_MATH // std::erf, std::erfc, and so on where only added in c++11. We use them // as a golden reference to validate the results produced by Eigen. Therefore // we can only run these tests if we use a c++11 compiler.