diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 7ddb4a867..b7b5e7d22 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -60,6 +60,23 @@ template<typename T> struct GenericNumTraits
     MulCost = 1
   };
 
+  // Division is messy but important, because it is expensive and throughput
+  // varies significantly. The following numbers are based on min division
+  // throughput on Haswell.
+  template<bool Vectorized>
+  struct Div {
+    enum {
+#ifdef EIGEN_VECTORIZE_AVX
+      AVX = true,
+#else
+      AVX = false,
+#endif
+      Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
+             Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
+    };
+  };
+
+
   typedef T Real;
   typedef typename internal::conditional<
                        IsInteger,
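The new per-type division cost is a member template of GenericNumTraits, so every NumTraits specialization inherits it. A minimal sketch of how it can be queried once this patch is applied (the printed values are the Haswell-based estimates from the table above and depend on whether EIGEN_VECTORIZE_AVX is defined):

    // Sketch: querying the new division-cost estimates.
    #include <iostream>
    #include <Eigen/Core>

    int main() {
      // Vectorized vs. scalar float division.
      std::cout << Eigen::NumTraits<float>::Div<true>::Cost << "\n";
      std::cout << Eigen::NumTraits<float>::Div<false>::Cost << "\n";
      // Integer division is costed separately (signed 32-bit int: 8).
      std::cout << Eigen::NumTraits<int>::Div<false>::Cost << "\n";
    }
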
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index c12e41a7b..37ebb5915 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -576,7 +576,7 @@ struct igammac_impl {
       pkm1 = pk;
       qkm2 = qkm1;
       qkm1 = qk;
-      if (abs(pk) > big) {
+      if (numext::abs(pk) > big) {
         pkm2 *= biginv;
         pkm1 *= biginv;
         qkm2 *= biginv;
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index c385b882a..61131828f 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -63,38 +63,69 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);
 
 // Class definition.
 struct half : public __half {
-  EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}
+  EIGEN_DEVICE_FUNC half() {}
 
-  // TODO(sesse): Should these conversions be marked as explicit?
-  EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {}
-  EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
-  EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
-  EIGEN_DEVICE_FUNC half(bool b)
-      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half(const volatile half& h)
-      : __half(internal::raw_uint16_to_half(h.x)) {}
+
+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  explicit EIGEN_DEVICE_FUNC half(int i)
+      : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
+  explicit EIGEN_DEVICE_FUNC half(long l)
+      : __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
+  explicit EIGEN_DEVICE_FUNC half(long long ll)
+      : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : __half(internal::float_to_half_rtne(f)) {}
+  explicit EIGEN_DEVICE_FUNC half(double d)
+      : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return static_cast<bool>(x & 0x7fff);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast<signed char>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast<unsigned char>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast<short>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast<unsigned short>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast<int>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast<unsigned int>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast<long>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast<unsigned long>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast<long long>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast<unsigned long long>(internal::half_to_float(*this));
+  }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
     return internal::half_to_float(*this);
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return internal::half_to_float(*this);
+    return static_cast<double>(internal::half_to_float(*this));
   }
 
   EIGEN_DEVICE_FUNC half& operator=(const half& other) {
     x = other.x;
     return *this;
   }
-  EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) {
-    x = other.x;
-    return *this;
-  }
-  EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile {
-    x = other.x;
-    return *this;
-  }
 };
 
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
@@ -203,6 +234,12 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
 #endif  // Emulate support for half floats
 
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
+}
+
 // Conversion routines, including fallbacks for the host or older CUDA.
 // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
 // these in hardware. If we need more performance on older/other CPUs, they are
@@ -341,4 +378,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
 
 } // end namespace std
 
+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+}
+
+#endif
+
+
 #endif // EIGEN_HALF_CUDA_H
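Since all of the half constructors and conversion operators above are now explicit, mixed half/builtin expressions must spell the conversions out. An illustrative sketch (not part of the patch; it assumes Eigen::half is reachable through the unsupported Tensor module):

    #include <unsupported/Eigen/CXX11/Tensor>

    void half_example() {
      Eigen::half h(1.5f);              // explicit construction from float
      // Eigen::half bad = 1.5f;        // no longer compiles: conversion is explicit
      float f = static_cast<float>(h);  // explicit conversion out
      int i = static_cast<int>(h);      // goes through half_to_float internally
      h = Eigen::half(f * 2.0f);        // compute in float, convert back
      (void)i;
    }
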
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index 4c0433267..b2a9724de 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -114,8 +114,8 @@ template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __float22half2_rn(make_float2(a.x, a.y));
 #else
-  half r1 = a.x;
-  half r2 = a.y;
+  half r1 = static_cast<half>(a.x);
+  half r2 = static_cast<half>(a.y);
   half2 r;
   r.x = 0;
   r.x |= r1.x;
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 5cdfff845..e28fecfd0 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -238,7 +238,13 @@ template<typename Scalar> struct scalar_hypot_op {
 };
 template<typename Scalar>
 struct functor_traits<scalar_hypot_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
+  enum
+  {
+    Cost = 3 * NumTraits<Scalar>::AddCost +
+           2 * NumTraits<Scalar>::MulCost +
+           2 * NumTraits<Scalar>::template Div<false>::Cost,
+    PacketAccess = false
+  };
 };
 
 /** \internal
@@ -297,9 +303,10 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
 };
 template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
   enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
+    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable,
+    Cost = NumTraits<result_type>::template Div<PacketAccess>::Cost
   };
 };
 
@@ -564,6 +571,10 @@ struct scalar_inverse_mult_op {
   { return internal::pdiv(pset1<Packet>(m_other),a); }
   Scalar m_other;
 };
+template<typename Scalar>
+struct functor_traits<scalar_inverse_mult_op<Scalar> >
+{ enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; };
+
 
 } // end namespace internal
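The hypot cost above replaces a flat 5-multiply estimate with a count that charges the divisions at their real price. For context, here is a sketch of the usual overflow-safe hypot formulation (illustrative only, not the code Eigen ships): the smaller magnitude is divided by the larger before squaring, so a division is unavoidable and dominates the cost:

    #include <cmath>

    template <typename Scalar>
    Scalar stable_hypot(Scalar x, Scalar y) {
      Scalar ax = std::abs(x), ay = std::abs(y);
      Scalar p = ax > ay ? ax : ay;   // larger magnitude
      if (p == Scalar(0)) return Scalar(0);
      Scalar q = ax > ay ? ay : ax;   // smaller magnitude
      Scalar r = q / p;               // r <= 1, so r*r cannot overflow
      return p * std::sqrt(Scalar(1) + r * r);
    }
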
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
old mode 100755
new mode 100644
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index d916f787e..a4f97728d 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -333,7 +333,7 @@ template <typename Device, typename T> class BenchmarkSuite {
 
 #ifndef EIGEN_HAS_INDEX_LIST
     Eigen::array<TensorIndex, 1> sum_along_dim;
-    sum_along_dim = 1;
+    sum_along_dim[0] = 1;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -356,7 +356,7 @@ template <typename Device, typename T> class BenchmarkSuite {
     input_size[1] = n_;
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(
         b_, input_size);
-    const Eigen::array<TensorIndex, 0> output_size;
+    Eigen::array<TensorIndex, 0> output_size;
     TensorMap<Tensor<T, 0>, Eigen::Aligned> C(
         c_, output_size);
 
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 49f75472a..35c6f7489 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -12,7 +12,7 @@
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);       \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters);                                    \
   }                                                       \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3);       \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, D1, D2, D3); \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters);                                    \
   }                                                       \
@@ -60,7 +60,7 @@
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N);       \
+    BenchmarkSuite<Eigen::GpuDevice, Eigen::half> suite(device, N); \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters, DIM1, DIM2);                        \
   }                                                       \
@@ -73,4 +73,4 @@ BM_FuncWithKernelDimsGPU(convolution, 7, 4);
 BM_FuncWithKernelDimsGPU(convolution, 4, 7);
 BM_FuncWithKernelDimsGPU(convolution, 7, 64);
 BM_FuncWithKernelDimsGPU(convolution, 64, 7);
-*/
\ No newline at end of file
+*/
diff --git a/test/main.h b/test/main.h
index 2797e8623..bba5e7570 100644
--- a/test/main.h
+++ b/test/main.h
@@ -58,6 +58,10 @@
 #define isnan(X) please_protect_your_isnan_with_parentheses
 #define isinf(X) please_protect_your_isinf_with_parentheses
 #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI please_use_EIGEN_PI_instead_of_M_PI
 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
 
@@ -331,11 +335,13 @@ inline bool test_isApprox(const std::complex<double>& a, const std::complex<double>& b)
 inline bool test_isMuchSmallerThan(const std::complex<double>& a, const std::complex<double>& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<std::complex<double> >()); }
+#ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const std::complex<long double>& a, const std::complex<long double>& b)
 { return internal::isApprox(a, b, test_precision<std::complex<long double> >()); }
 inline bool test_isMuchSmallerThan(const std::complex<long double>& a, const std::complex<long double>& b)
 { return internal::isMuchSmallerThan(a, b, test_precision<std::complex<long double> >()); }
 #endif
+#endif
 
 #ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const long double& a, const long double& b)
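With M_PI poisoned in the test harness above, tests must use Eigen's portable EIGEN_PI constant (defined in Eigen/Core) instead of the POSIX-only macro. A small illustration, not taken from the patch:

    #include <Eigen/Core>

    // EIGEN_PI is a long double literal, so it serves float, double
    // and long double computations alike.
    double to_degrees(double radians) {
      return radians * 180.0 / EIGEN_PI;
    }
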
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 969f25481..16132398d 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -51,6 +51,7 @@ typedef unsigned __int64 uint64_t;
 #endif
 
 #ifdef EIGEN_USE_THREADS
+#include <atomic>
 #include <condition_variable>
 #include <deque>
 #include <mutex>
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index efe688e50..579519b04 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -13,7 +13,7 @@
 
 // The array class is only available starting with cxx11. Emulate our own here
-// if needed.
+// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.
 #if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 9597577b9..6ee9c88b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -607,7 +607,7 @@ class TensorBase
     const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
     extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
                            const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
-                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const {
+                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
     }
 
@@ -619,7 +619,7 @@ class TensorBase
                            const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
                            const Index padding_top_z, const Index padding_bottom_z,
                            const Index padding_top, const Index padding_bottom,
-                           const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const {
+                           const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index a4a06ab5f..dbff660a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -20,7 +20,7 @@ template<typename Scalar, typename Index, typename LhsMapper,
 __device__ EIGEN_STRONG_INLINE void
 EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
                                const Index m_size, const Index n_size, const Index k_size) {
 
   const Index m_block_idx = blockIdx.x;
@@ -319,8 +319,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   Scalar rrow(7);
 
   // Now x corresponds to k, y to m, and z to n
-  const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-  const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+  const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+  const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
 
 #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
 #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
@@ -503,8 +503,8 @@ __launch_bounds__(512)
 EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
                        const OutputMapper output,
                        const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ volatile Scalar lhs_shmem[72 * 64];
-  __shared__ volatile Scalar rhs_shmem[72 * 64];
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
 
   const Index m_block_idx = blockIdx.x;
   const Index n_block_idx = blockIdx.y;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index dcbef5b03..23b1765ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
@@ -110,55 +110,90 @@ class ThreadPool : public ThreadPoolInterface {
   };
 
   std::mutex mu_;
-  std::vector<std::thread*> threads_;  // All threads
-  std::vector<Waiter*> waiters_;  // Stack of waiting threads.
+  MaxSizeVector<std::thread*> threads_;  // All threads
+  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
   std::deque<std::function<void()>> pending_;  // Queue of pending work
   std::condition_variable empty_;  // Signaled on pending_.empty()
   bool exiting_ = false;
 };
 
-// Notification is an object that allows a user to to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object.
-// but only one caller must call Notify() on the object.
-class Notification {
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+class Barrier {
  public:
-  Notification() : notified_(false) {}
-  ~Notification() {}
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() {
+    eigen_assert((state_>>1) == 0);
+  }
 
   void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      eigen_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
     std::unique_lock<std::mutex> l(mu_);
     eigen_assert(!notified_);
     notified_ = true;
     cv_.notify_all();
   }
 
-  void WaitForNotification() {
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
     std::unique_lock<std::mutex> l(mu_);
-    cv_.wait(l, [this]() { return notified_; } );
+    while (!notified_) {
+      cv_.wait(l);
+    }
   }
 
  private:
   std::mutex mu_;
   std::condition_variable cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
   bool notified_;
 };
+
+
+// Notification is an object that allows a user to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1) {};
+};
+
+
 // Runs an arbitrary function and then calls Notify() on the passed in
 // Notification.
-template <typename Function, typename... Args> struct FunctionWrapper
+template <typename Function, typename... Args> struct FunctionWrapperWithNotification
 {
   static void run(Notification* n, Function f, Args... args) {
     f(args...);
-    n->Notify();
+    if (n) {
+      n->Notify();
+    }
   }
 };
 
+template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
+{
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
-static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
+template <typename SyncType>
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
   if (n) {
-    n->WaitForNotification();
+    n->Wait();
   }
 }
 
@@ -203,10 +238,20 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
     std::function<void()> func =
-      std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
+      std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
     pool_->Schedule(func);
     return n;
   }
+
+  template <typename Function, typename... Args>
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
+                                                Function&& f,
+                                                Args&&... args) const {
+    std::function<void()> func = std::bind(
+        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
+    pool_->Schedule(func);
+  }
+
   template <typename Function, typename... Args>
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
     std::function<void()> func = std::bind(f, args...);
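The Barrier introduced above lets one synchronization object replace a vector of heap-allocated Notifications: each task calls Notify() once, and the caller blocks in a single Wait(). A minimal usage sketch of the new API (illustrative; process_chunk and the device parameter are invented for the example):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    void process_chunk(int chunk) { /* work on one chunk */ }

    void run_in_parallel(const Eigen::ThreadPoolDevice& device, unsigned int num_chunks) {
      Eigen::Barrier barrier(num_chunks);
      for (unsigned int i = 0; i < num_chunks; ++i) {
        device.enqueue_with_barrier(&barrier, &process_chunk, static_cast<int>(i));
      }
      barrier.Wait();  // returns once Notify() has run num_chunks times
    }
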
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 5d73d62d2..1fb27a65b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
+    EIGEN_UNUSED_VARIABLE(scalar);
     eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 54da77bcf..3408933bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -125,22 +125,18 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       int blocksz = std::ceil(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
       const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-      const Index numblocks = size / blocksize;
+      const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
 
-      MaxSizeVector<Notification*> results(numblocks);
-      for (int i = 0; i < numblocks; ++i) {
-        results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize));
+      Barrier barrier(numblocks);
+      for (unsigned int i = 0; i < numblocks; ++i) {
+        device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize);
       }
 
       if (numblocks * blocksize < size) {
         EvalRange<Evaluator, Index>::run(evaluator, numblocks * blocksize, size);
       }
 
-      for (int i = 0; i < numblocks; ++i) {
-        wait_until_ready(results[i]);
-        delete results[i];
-      }
-
+      barrier.Wait();
     }
     evaluator.cleanup();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index c71a30d21..eb0c8d1ce 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,20 @@ struct scalar_mod_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+
+
+/** \internal
+  * \brief Template functor to compute the modulo between 2 arrays.
+  */
+template <typename Scalar>
+struct scalar_mod2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
 
 
 /** \internal
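scalar_mod2_op is a plain binary functor, so it can be applied through the Tensor binaryExpr() API. A hedged sketch (the tensors and sizes are invented for illustration):

    #include <unsupported/Eigen/CXX11/Tensor>

    void mod_example() {
      Eigen::Tensor<int, 1> a(8), b(8);
      a.setConstant(7);
      b.setConstant(3);
      // c(i) = a(i) % b(i), evaluated elementwise by the new functor.
      Eigen::Tensor<int, 1> c =
          a.binaryExpr(b, Eigen::internal::scalar_mod2_op<int>());
    }
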
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index d6ad65070..6af2d45d4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -41,7 +41,7 @@ template <> struct max_n_1<0> {
 template <typename Scalar, typename Device>
 struct PacketType {
   typedef typename internal::packet_traits<Scalar>::type type;
-  static const int size = internal::unpacket_traits<type>::size;
+  enum { size = internal::unpacket_traits<type>::size };
 };
 
 // For CUDA packet types when using a GpuDevice
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index afde7b3d2..a9c222ea0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -54,7 +54,6 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
@@ -143,7 +142,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     return m_impl.template packet<LoadMode>(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
@@ -234,7 +233,6 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
 {
   public:
   typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index fe1dc22ee..9875601ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -253,15 +253,14 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
       return;
     } else {
       const Index blocksize = std::floor(static_cast<float>(num_coeffs) / num_threads);
-      const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+      const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
       eigen_assert(num_coeffs >= numblocks * blocksize);
 
-      MaxSizeVector<Notification*> results(numblocks);
+      Barrier barrier(numblocks);
       MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-      for (Index i = 0; i < numblocks; ++i) {
-        results.push_back(
-            device.enqueue(&FullReducerShard<Self, Op, false>::run, self,
-                           i * blocksize, blocksize, reducer, &shards[i]));
+      for (unsigned int i = 0; i < numblocks; ++i) {
+        device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
+                                    i * blocksize, blocksize, reducer, &shards[i]);
       }
 
       typename Self::CoeffReturnType finalShard;
@@ -271,11 +270,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
       } else {
         finalShard = reducer.initialize();
       }
-      for (Index i = 0; i < numblocks; ++i) {
-        wait_until_ready(results[i]);
-        delete results[i];
-      }
-      for (Index i = 0; i < numblocks; ++i) {
+      barrier.Wait();
+      for (unsigned int i = 0; i < numblocks; ++i) {
         reducer.reduce(shards[i], &finalShard);
       }
       *output = reducer.finalize(finalShard);
@@ -304,15 +300,15 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
       return;
     }
     const Index blocksize = std::floor(static_cast<float>(num_coeffs) / num_threads);
-    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);
 
-    MaxSizeVector<Notification*> results(numblocks);
+    Barrier barrier(numblocks);
     MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
-    for (Index i = 0; i < numblocks; ++i) {
-      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
-                                       self, i * blocksize, blocksize, reducer,
-                                       &shards[i]));
+    for (unsigned int i = 0; i < numblocks; ++i) {
+      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
+                                  self, i * blocksize, blocksize, reducer,
+                                  &shards[i]);
     }
     typename Self::CoeffReturnType finalShard;
     if (numblocks * blocksize < num_coeffs) {
@@ -322,11 +318,8 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
       finalShard = reducer.initialize();
     }
 
-    for (Index i = 0; i < numblocks; ++i) {
-      wait_until_ready(results[i]);
-      delete results[i];
-    }
-    for (Index i = 0; i < numblocks; ++i) {
+    barrier.Wait();
+    for (unsigned int i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);
diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
index 288c6b0fb..87f50947d 100644
--- a/unsupported/Eigen/OpenGLSupport
+++ b/unsupported/Eigen/OpenGLSupport
@@ -180,11 +180,11 @@ template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Affine>& t)
 
 inline void glRotate(const Rotation2D<float>& rot)
 {
-  glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
+  glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
 }
 inline void glRotate(const Rotation2D<double>& rot)
 {
-  glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
+  glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
 }
 
 template<typename Scalar> void glRotate(const RotationBase<Scalar,3>& rot)
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 463d7be0c..e43e86e90 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -11,10 +11,6 @@
 #ifndef EIGEN_MATRIX_LOGARITHM
 #define EIGEN_MATRIX_LOGARITHM
 
-#ifndef M_PI
-#define M_PI 3.141592653589793238462643383279503L
-#endif
-
 namespace Eigen {
 
 namespace internal {
@@ -65,8 +61,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
   }
 }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index 1e5a59c55..f37d31c3f 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -298,8 +298,8 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p)
 
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
+  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
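Both call sites above compute an unwinding number: it counts how many times Im(log z1 - log z2) has crossed the principal branch cut, so that log(z1/z2) = log z1 - log z2 - 2*pi*i*U. A standalone illustration of the same formula (not taken from the patch):

    #include <cmath>
    #include <complex>

    int unwinding_number(const std::complex<double>& z1, const std::complex<double>& z2) {
      const double pi = 3.141592653589793;  // EIGEN_PI in the patched code
      double im = std::imag(std::log(z1) - std::log(z2));
      return static_cast<int>(std::ceil((im - pi) / (2 * pi)));
    }
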
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 19893cc25..6bd8cfb92 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -149,6 +149,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_argmax)
   ei_add_test(cxx11_tensor_shuffling)
   ei_add_test(cxx11_tensor_striding)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_ref)
   ei_add_test(cxx11_tensor_random)
@@ -175,9 +176,14 @@ endif()
 
 # These tests needs nvcc
 find_package(CUDA 7.0)
 if(CUDA_FOUND AND EIGEN_TEST_NVCC)
-  # Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime
+  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
+  # and -fno-check-new flags since they trigger thousands of compilation warnings
+  # in the CUDA runtime
   string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+  message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
 
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 45311d4f7..41ccbe974 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// TODO(mdevin): Free the cuda memory.
-
+#define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 1964d9e07..4d8465756 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -853,6 +853,10 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
   CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
 
+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on were only added in c++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a c++11 compiler.
   CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
   CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
   CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
@@ -860,6 +864,7 @@ void test_cxx11_tensor_cuda()
 
   CALL_SUBTEST_4(test_cuda_digamma<float>());
 
+  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
   CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
   CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
@@ -894,4 +899,5 @@ void test_cxx11_tensor_cuda()
 
   CALL_SUBTEST_5(test_cuda_igamma<double>());
   CALL_SUBTEST_5(test_cuda_igammac<double>());
+#endif
 }
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
new file mode 100644
index 000000000..c946007b8
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <stdlib.h>
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include <windows.h>
+void sleep(int seconds) {
+  Sleep(seconds*1000);
+}
+#else
+#include <unistd.h>
+#endif
+
+
+namespace {
+
+void WaitAndAdd(Eigen::Notification* n, int* counter) {
+  n->Wait();
+  *counter = *counter + 1;
+}
+
+}  // namespace
+
+static void test_notification_single()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  sleep(1);
+
+  // The thread should be waiting for the notification.
+  VERIFY_IS_EQUAL(counter, 0);
+
+  // Unblock the thread
+  n.Notify();
+
+  sleep(1);
+
+  // Verify the counter has been incremented
+  VERIFY_IS_EQUAL(counter, 1);
+}
+
+// Like test_notification_single() but enqueues multiple threads to
+// validate that all threads get notified by Notify().
+static void test_notification_multiple()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 0);
+  n.Notify();
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 4);
+}
+
+void test_cxx11_tensor_notification()
+{
+  CALL_SUBTEST(test_notification_single());
+  CALL_SUBTEST(test_notification_multiple());
+}
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 29b5637e7..cb917bb37 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -134,7 +134,7 @@ void test_cuda_elementwise() {
   gpu_device.deallocate(d_res_float);
 }
 
-/*
+
 void test_cuda_contractions() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -181,7 +181,7 @@ void test_cuda_contractions() {
   gpu_device.deallocate(d_float2);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
-}*/
+}
 
 void test_cuda_reductions() {
@@ -244,7 +244,7 @@ void test_cxx11_tensor_of_float16_cuda()
     CALL_SUBTEST_1(test_cuda_conversion());
     CALL_SUBTEST_1(test_cuda_unary());
     CALL_SUBTEST_1(test_cuda_elementwise());
-//    CALL_SUBTEST_2(test_cuda_contractions());
+    CALL_SUBTEST_2(test_cuda_contractions());
     CALL_SUBTEST_3(test_cuda_reductions());
   }
   else {
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 487d5a9b8..9a995f941 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)
 
   MatrixType scaledA;
   RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
-  if (maxImagPartOfSpectrum >= 0.9 * M_PI)
-    scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum;
+  if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
+    scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
   else
     scaledA = A;
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index baf183d12..8e104ed1e 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -24,7 +24,7 @@ void test2dRotation(double tol)
     s = std::sin(angle);
     B << c, s, -s, c;
 
-    C = Apow(std::ldexp(angle,1) / M_PI);
+    C = Apow(std::ldexp(angle,1) / EIGEN_PI);
     std::cout << "test2dRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
     VERIFY(C.isApprox(B, tol));
   }