From db5c14de424ef3b43c4afb1aedf6a6f8e5640a06 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 09:52:58 -0700
Subject: [PATCH 01/32] Explicitly cast the default value into the proper
 scalar type.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 9597577b9..6ee9c88b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -607,7 +607,7 @@ class TensorBase
     const TensorVolumePatchOp
     extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
                            const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
-                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const {
+                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
     }

@@ -619,7 +619,7 @@ class TensorBase
                            const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
                            const Index padding_top_z, const Index padding_bottom_z,
                            const Index padding_top, const Index padding_bottom,
-                           const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const {
+                           const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
     }
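Note on patch 01: the explicit Scalar(0) matters once Scalar can be a type such as Eigen::half, whose converting constructors are explicit, so a literal 0 default argument no longer converts implicitly. A minimal sketch of the failure mode (Half16 is a hypothetical stand-in for such a scalar):

    // Hypothetical scalar with only explicit construction, like Eigen::half.
    struct Half16 {
      explicit Half16(int v) : raw(static_cast<unsigned short>(v)) {}
      unsigned short raw;
    };

    template <typename Scalar>
    void extract(Scalar padding_value = 0);            // fails for Scalar = Half16 when the
                                                       // default is used: no implicit int -> Half16
    template <typename Scalar>
    void extract_ok(Scalar padding_value = Scalar(0)); // compiles: explicit construction
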
From e91f25530117a30e1bf71387c9864e3ac601b9ba Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 10:02:00 -0700
Subject: [PATCH 02/32] Marked variables that are only used in debug mode as
 such

---
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 5d73d62d2..1fb27a65b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -111,6 +111,7 @@ struct TensorEvaluator, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
+    EIGEN_UNUSED_VARIABLE(scalar);
     eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }

From 7a07d6aa2bd2c6c5a9f93896ad34fba8bd9189fe Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 11:12:17 -0700
Subject: [PATCH 03/32] Small cleanup

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index afde7b3d2..e57ba9d9d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -54,7 +54,6 @@ class TensorReshapingOp : public TensorBase::Scalar Scalar;
-  typedef typename Eigen::NumTraits::Real RealScalar;
   typedef typename internal::remove_const::type CoeffReturnType;
   typedef typename Eigen::internal::nested::type Nested;
   typedef typename Eigen::internal::traits::StorageKind StorageKind;
@@ -234,7 +233,6 @@ class TensorSlicingOp : public TensorBase::Scalar Scalar;
-  typedef typename Eigen::NumTraits::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename Eigen::internal::nested::type Nested;
   typedef typename Eigen::internal::traits::StorageKind StorageKind;

From 8ef3181f15a9be76ac783bedd2926ee6f4c69a2f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 11:24:05 -0700
Subject: [PATCH 04/32] Worked around a constness related issue

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index e57ba9d9d..a9c222ea0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -142,7 +142,7 @@ struct TensorEvaluator, Device>
     return m_impl.template packet(index);
   }

-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); }

   const TensorEvaluator& impl() const { return m_impl; }
From f9ad25e4d8453c4265a5fd6d4962a76a386564df Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 09:30:23 -0700
Subject: [PATCH 05/32] Fixed contractions of 16 bit floats

---
 Eigen/src/Core/arch/CUDA/Half.h               | 20 +++++++++----------
 .../CXX11/src/Tensor/TensorContractionCuda.h  | 10 +++++-----
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index c385b882a..921c5bcb2 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -73,8 +73,6 @@ struct half : public __half {
       : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half(const volatile half& h)
-      : __half(internal::raw_uint16_to_half(h.x)) {}

   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
     return internal::half_to_float(*this);
@@ -87,14 +85,6 @@ struct half : public __half {
     x = other.x;
     return *this;
   }
-  EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) {
-    x = other.x;
-    return *this;
-  }
-  EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile {
-    x = other.x;
-    return *this;
-  }
 };

 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
@@ -341,4 +331,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {

 } // end namespace std

+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  return static_cast(__shfl_xor(static_cast(var), laneMask, width));
+}
+
+#endif
+
+
 #endif // EIGEN_HALF_CUDA_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index a4a06ab5f..dbff660a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -20,7 +20,7 @@ template
 __device__ EIGEN_STRONG_INLINE void
 EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
                                const Index m_size, const Index n_size, const Index k_size) {

   const Index m_block_idx = blockIdx.x;
@@ -319,8 +319,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   Scalar rrow(7);

   // Now x corresponds to k, y to m, and z to n
-  const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-  const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+  const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+  const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];

 #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
 #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
@@ -503,8 +503,8 @@ __launch_bounds__(512)
 EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
                        const OutputMapper output,
                        const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ volatile Scalar lhs_shmem[72 * 64];
-  __shared__ volatile Scalar rhs_shmem[72 * 64];
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];

   const Index m_block_idx = blockIdx.x;
   const Index n_block_idx = blockIdx.y;
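Note on patch 05: dropping the volatile qualifiers makes the shared-memory contraction kernel usable with Eigen::half again, and the new __shfl_xor overload lets warp-level code written against a generic Scalar compile for half by bouncing through float. A sketch of the kind of butterfly reduction such an overload enables (CUDA; assumes a full 32-lane warp and the pre-Volta __shfl_xor semantics used here):

    template <typename Scalar>
    __device__ Scalar warp_sum(Scalar x) {
      // XOR butterfly: after log2(32) = 5 steps every lane holds the warp total.
      for (int mask = 16; mask > 0; mask /= 2) {
        x = x + __shfl_xor(x, mask);  // resolves to the Eigen::half overload when Scalar is half
      }
      return x;
    }
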
From 65a7113a36f70aeca34eac29f32b24ef865cb6e4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 09:33:54 -0700
Subject: [PATCH 06/32] Use an enum instead of a static const int to prevent
 possible link error

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index d6ad65070..6af2d45d4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -41,7 +41,7 @@ template <> struct max_n_1<0> {
 template
 struct PacketType {
   typedef typename internal::packet_traits::type type;
-  static const int size = internal::unpacket_traits::size;
+  enum { size = internal::unpacket_traits::size };
 };

 // For CUDA packet types when using a GpuDevice
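Note on patch 06: an in-class static const int that is odr-used (for example, bound to a const reference) requires an out-of-line definition, and a missing one only surfaces as an undefined-symbol error at link time; an enumerator is a plain rvalue and never needs storage. A reduced illustration with hypothetical names (pre-C++17 rules, which is what applies here):

    struct WithStaticConst {
      static const int size = 4;  // declaration only; no definition in any translation unit
    };
    struct WithEnum {
      enum { size = 4 };
    };

    int take_ref(const int& v) { return v; }

    int use() {
      int a = take_ref(WithStaticConst::size);  // may fail to link: size is odr-used
      int b = take_ref(WithEnum::size);         // always fine: a temporary int is materialized
      return a + b;
    }
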
From 6a31b7be3ea29a5300ff575f0bada876b70904d6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 14:02:50 -0700
Subject: [PATCH 07/32] Avoid using std::vector whenever possible

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index dcbef5b03..b282f5c07 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads), waiting_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
@@ -110,8 +110,8 @@ class ThreadPool : public ThreadPoolInterface {
   };

   std::mutex mu_;
-  std::vector threads_;  // All threads
-  std::vector waiters_;  // Stack of waiting threads.
+  MaxSizeVector threads_;  // All threads
+  MaxSizeVector waiters_;  // Stack of waiting threads.
   std::deque> pending_;    // Queue of pending work
   std::condition_variable empty_;  // Signaled on pending_.empty()
   bool exiting_ = false;

From e7a468c5b78295e26d970372336bd5f73c90ae34 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 14:26:50 -0700
Subject: [PATCH 08/32] Filter some compilation flags that nvcc warns about.

---
 unsupported/test/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 19893cc25..20048515c 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -175,9 +175,14 @@ endif()
 # These tests needs nvcc
 find_package(CUDA 7.0)
 if(CUDA_FOUND AND EIGEN_TEST_NVCC)
-  # Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime
+  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
+  # and -fno-check-new flags since they trigger thousands of compilation warnings
+  # in the CUDA runtime
   string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
   message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})

   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")

From bc2b8027514b27b3c67800d5c951e5d532f76f02 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 14:27:34 -0700
Subject: [PATCH 09/32] Fixed a couple of typos

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index b282f5c07..4d803c95b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) : threads_(num_threads), waiting_(num_threads) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads, NULL), waiting_(num_threads, NULL) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
From 002cf0d1c979857e057879d8c84b92439dbcc90d Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 15:24:23 -0700
Subject: [PATCH 10/32] Use a single Barrier instead of a collection of
 Notifications to reduce the thread synchronization overhead

---
 unsupported/Eigen/CXX11/Tensor                     |  1 +
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h      | 75 +++++++++++++++----
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        | 10 +--
 .../Eigen/CXX11/src/Tensor/TensorReduction.h       | 25 +++----
 4 files changed, 73 insertions(+), 38 deletions(-)

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 969f25481..16132398d 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -51,6 +51,7 @@ typedef unsigned __int64 uint64_t;
 #endif

 #ifdef EIGEN_USE_THREADS
+#include
 #include
 #include
 #include
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index dcbef5b03..e4165bbf8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -118,47 +118,82 @@ class ThreadPool : public ThreadPoolInterface {
 };

-// Notification is an object that allows a user to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object.
-// but only one caller must call Notify() on the object.
-class Notification {
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+class Barrier {
  public:
-  Notification() : notified_(false) {}
-  ~Notification() {}
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() {
+    eigen_assert((state_>>1) == 0);
+  }

   void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      eigen_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
     std::unique_lock l(mu_);
     eigen_assert(!notified_);
     notified_ = true;
     cv_.notify_all();
   }

-  void WaitForNotification() {
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
     std::unique_lock l(mu_);
-    cv_.wait(l, [this]() { return notified_; } );
+    while (!notified_) {
+      cv_.wait(l);
+    }
   }

  private:
   std::mutex mu_;
   std::condition_variable cv_;
+  std::atomic state_;  // low bit is waiter flag
   bool notified_;
 };

+
+// Notification is an object that allows a user to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1) {};
+};
+
+
 // Runs an arbitrary function and then calls Notify() on the passed in
 // Notification.
-template struct FunctionWrapper
+template struct FunctionWrapperWithNotification
 {
   static void run(Notification* n, Function f, Args... args) {
     f(args...);
-    n->Notify();
+    if (n) {
+      n->Notify();
+    }
   }
 };

+template struct FunctionWrapperWithBarrier
+{
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
-static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
+template
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
   if (n) {
-    n->WaitForNotification();
+    n->Wait();
   }
 }

@@ -203,10 +238,20 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
     std::function func =
-      std::bind(&FunctionWrapper::run, n, f, args...);
+      std::bind(&FunctionWrapperWithNotification::run, n, f, args...);
     pool_->Schedule(func);
     return n;
   }
+
+  template
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
+                                                Function&& f,
+                                                Args&&... args) const {
+    std::function func = std::bind(
+        &FunctionWrapperWithBarrier::run, b, f, args...);
+    pool_->Schedule(func);
+  }
+
   template
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
     std::function func = std::bind(f, args...);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 54da77bcf..6bbf235cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -127,20 +127,16 @@ class TensorExecutor
       const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize)));
       const Index numblocks = size / blocksize;

-      MaxSizeVector results(numblocks);
+      Barrier barrier(numblocks);
       for (int i = 0; i < numblocks; ++i) {
-        results.push_back(device.enqueue(&EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize));
+        device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize);
       }

       if (numblocks * blocksize < size) {
         EvalRange::run(evaluator, numblocks * blocksize, size);
       }

-      for (int i = 0; i < numblocks; ++i) {
-        wait_until_ready(results[i]);
-        delete results[i];
-      }
-
+      barrier.Wait();
     }
     evaluator.cleanup();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index fe1dc22ee..489451215 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -256,12 +256,11 @@ struct FullReducer {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);

-    MaxSizeVector results(numblocks);
+    Barrier barrier(numblocks);
     MaxSizeVector shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
-      results.push_back(
-          device.enqueue(&FullReducerShard::run, self,
-                         i * blocksize, blocksize, reducer, &shards[i]));
+      device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self,
+                                  i * blocksize, blocksize, reducer, &shards[i]);
     }

     typename Self::CoeffReturnType finalShard;
@@ -271,10 +270,7 @@ struct FullReducer {
     } else {
       finalShard = reducer.initialize();
     }
-    for (Index i = 0; i < numblocks; ++i) {
-      wait_until_ready(results[i]);
-      delete results[i];
-    }
+    barrier.Wait();
     for (Index i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
@@ -307,12 +303,12 @@ struct FullReducer {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);

-    MaxSizeVector results(numblocks);
+    Barrier barrier(numblocks);
     MaxSizeVector shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
-      results.push_back(device.enqueue(&FullReducerShard::run,
-                                       self, i * blocksize, blocksize, reducer,
-                                       &shards[i]));
+      device.enqueue_with_barrier(&barrier, &FullReducerShard::run,
+                                  self, i * blocksize, blocksize, reducer,
+                                  &shards[i]);
     }
     typename Self::CoeffReturnType finalShard;
     if (numblocks * blocksize < num_coeffs) {
@@ -322,10 +318,7 @@ struct FullReducer {
       finalShard = reducer.initialize();
     }

-    for (Index i = 0; i < numblocks; ++i) {
-      wait_until_ready(results[i]);
-      delete results[i];
-    }
+    barrier.Wait();
     for (Index i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);
From 3d1e857327a3ef5cfa8b65f2204c28bf405731d4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 15:48:28 -0700
Subject: [PATCH 11/32] Fixed compilation error

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 4b8eda6bb..23b1765ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) : threads_(num_threads, NULL), waiting_(num_threads, NULL) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
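Note on patches 10 and 11: the resulting fork-join idiom seeds the Barrier with the task count, has each task call Notify() exactly once, and replaces one WaitForNotification() plus one delete per task with a single Wait(). A sketch against the interface introduced above ('work' is a hypothetical per-block function):

    void work(unsigned int block);  // hypothetical per-block kernel

    void parallel_apply(const Eigen::ThreadPoolDevice& device, unsigned int numblocks) {
      Eigen::Barrier barrier(numblocks);  // expects exactly numblocks Notify() calls
      for (unsigned int i = 0; i < numblocks; ++i) {
        device.enqueue_with_barrier(&barrier, &work, i);
      }
      barrier.Wait();  // one blocking call; no per-task Notification objects to free
    }
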
From 28e02996df54240d44ead1bf827b867c22a224a9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 16:53:57 -0700
Subject: [PATCH 12/32] Merged patch 672 from Justin Lebar: Don't use long
 doubles with cuda

---
 test/main.h                                  | 2 ++
 unsupported/test/cxx11_tensor_argmax_cuda.cu | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/main.h b/test/main.h
index 2797e8623..e5f1a9ad5 100644
--- a/test/main.h
+++ b/test/main.h
@@ -331,11 +331,13 @@ inline bool test_isApprox(const std::complex& a, const std::complex& a, const std::complex& b)
 { return internal::isMuchSmallerThan(a, b, test_precision >()); }

+#ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const std::complex& a, const std::complex& b)
 { return internal::isApprox(a, b, test_precision >()); }
 inline bool test_isMuchSmallerThan(const std::complex& a, const std::complex& b)
 { return internal::isMuchSmallerThan(a, b, test_precision >()); }
 #endif
+#endif

 #ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const long double& a, const long double& b)
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 45311d4f7..41ccbe974 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-// TODO(mdevin): Free the cuda memory.
+#define EIGEN_TEST_NO_LONGDOUBLE

 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
From 9642fd7a937942037a3ea0d3c51b799be197782f Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg
Date: Wed, 23 Mar 2016 15:37:45 +0100
Subject: [PATCH 13/32] Replace all M_PI by EIGEN_PI and add a check to the
 testsuite.

---
 test/main.h                                             | 4 ++++
 unsupported/Eigen/OpenGLSupport                         | 4 ++--
 unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 8 ++------
 unsupported/Eigen/src/MatrixFunctions/MatrixPower.h     | 4 ++--
 unsupported/test/matrix_function.cpp                    | 4 ++--
 unsupported/test/matrix_power.cpp                       | 2 +-
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/main.h b/test/main.h
index e5f1a9ad5..bba5e7570 100644
--- a/test/main.h
+++ b/test/main.h
@@ -58,6 +58,10 @@
 #define isnan(X) please_protect_your_isnan_with_parentheses
 #define isinf(X) please_protect_your_isinf_with_parentheses
 #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI please_use_EIGEN_PI_instead_of_M_PI

 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
index 288c6b0fb..87f50947d 100644
--- a/unsupported/Eigen/OpenGLSupport
+++ b/unsupported/Eigen/OpenGLSupport
@@ -180,11 +180,11 @@ template void glLoadMatrix(const Transform& rot)
 {
-  glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
+  glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
 }

 inline void glRotate(const Rotation2D& rot)
 {
-  glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
+  glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
 }

 template void glRotate(const RotationBase& rot)
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 463d7be0c..e43e86e90 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -11,10 +11,6 @@
 #ifndef EIGEN_MATRIX_LOGARITHM
 #define EIGEN_MATRIX_LOGARITHM

-#ifndef M_PI
-#define M_PI 3.141592653589793238462643383279503L
-#endif
-
 namespace Eigen {

 namespace internal {
@@ -65,8 +61,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+    int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
   }
 }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index 1e5a59c55..f37d31c3f 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -298,8 +298,8 @@ MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
+  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 487d5a9b8..9a995f941 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)

   MatrixType scaledA;
   RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
-  if (maxImagPartOfSpectrum >= 0.9 * M_PI)
-    scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum;
+  if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
+    scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
   else
     scaledA = A;
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index baf183d12..8e104ed1e 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -24,7 +24,7 @@ void test2dRotation(double tol)
     s = std::sin(angle);
     B << c, s, -s, c;

-    C = Apow(std::ldexp(angle,1) / M_PI);
+    C = Apow(std::ldexp(angle,1) / EIGEN_PI);
     std::cout << "test2dRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
     VERIFY(C.isApprox(B, tol));
   }
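Note on patch 13: beyond the substitution, the test/main.h hunk poisons M_PI rather than merely undefining it, so any future use inside the test suite fails to compile with a self-describing identifier instead of silently depending on a non-standard macro. The trick in isolation:

    #ifdef M_PI
    #undef M_PI
    #endif
    #define M_PI please_use_EIGEN_PI_instead_of_M_PI

    double bad() { return 2.0 * M_PI; }  // error: 'please_use_EIGEN_PI_instead_of_M_PI' was not declared
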
From 6971146ca9e4b5870404974397a81d125b2418d4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 09:44:52 -0700
Subject: [PATCH 14/32] Added more conversion operators for half floats

---
 Eigen/src/Core/arch/CUDA/Half.h | 55 ++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 921c5bcb2..f997735aa 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -65,20 +65,61 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);

 struct half : public __half {
   EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}

-  // TODO(sesse): Should these conversions be marked as explicit?
-  EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {}
-  EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast(i))) {}
-  EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast(d))) {}
-  EIGEN_DEVICE_FUNC half(bool b)
-      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}

+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  explicit EIGEN_DEVICE_FUNC half(int i)
+      : __half(internal::float_to_half_rtne(static_cast(i))) {}
+  explicit EIGEN_DEVICE_FUNC half(long l)
+      : __half(internal::float_to_half_rtne(static_cast(l))) {}
+  explicit EIGEN_DEVICE_FUNC half(long long ll)
+      : __half(internal::float_to_half_rtne(static_cast(ll))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : __half(internal::float_to_half_rtne(f)) {}
+  explicit EIGEN_DEVICE_FUNC half(double d)
+      : __half(internal::float_to_half_rtne(static_cast(d))) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return static_cast(x & 0x7fff);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
     return internal::half_to_float(*this);
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return internal::half_to_float(*this);
+    return static_cast(internal::half_to_float(*this));
   }

   EIGEN_DEVICE_FUNC half& operator=(const half& other) {
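Note on patch 14: with every converting constructor and conversion operator now explicit, mixed-type arithmetic around Eigen::half has to spell out its casts, which is the intent: implicit float-to-half round trips were easy to introduce by accident. A hedged usage sketch:

    Eigen::half h(1.5f);              // fine: explicit construction from float
    float f = static_cast<float>(h);  // fine: explicit widening
    int i = static_cast<int>(h);      // fine: converts through float, then truncates
    // float g = h;                   // no longer compiles: the conversion operator is explicit
    // Eigen::half k = 2;             // no longer compiles: the int constructor is explicit
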
From 0e6888260459b31dac1bd3411b0e8f688f6d22a2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 09:46:42 -0700
Subject: [PATCH 15/32] Added the ability to divide a half float by an index

---
 Eigen/src/Core/arch/CUDA/Half.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index f997735aa..08f6005e4 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -234,6 +234,12 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
 #endif  // Emulate support for half floats

+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return Eigen::half(static_cast(a) / static_cast(b));
+}
+
 // Conversion routines, including fallbacks for the host or older CUDA.
 // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
 // these in hardware. If we need more performance on older/other CPUs, they are

From fc3660285fe326744eb67711126d2764a1f97100 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 09:56:50 -0700
Subject: [PATCH 16/32] Made type conversion explicit

---
 Eigen/src/Core/arch/CUDA/TypeCasting.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index 4c0433267..b2a9724de 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -114,8 +114,8 @@ template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __float22half2_rn(make_float2(a.x, a.y));
 #else
-  half r1 = a.x;
-  half r2 = a.y;
+  half r1 = static_cast(a.x);
+  half r2 = static_cast(a.y);
   half2 r;
   r.x = 0;
   r.x |= r1.x;
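Note on patch 15 above: the half-by-index division exists chiefly so reductions can turn a sum into a mean. Doing the division in float matters because a large integer denominator may not even be representable in half (anything above 65504 overflows to infinity). A sketch, assuming Eigen's signed Index type:

    // Mean of n half-precision values.
    Eigen::half mean(const Eigen::half* data, Eigen::Index n) {
      Eigen::half sum(0.0f);
      for (Eigen::Index i = 0; i < n; ++i) sum = sum + data[i];
      return sum / n;  // operator/(const half&, Index): divides in full float precision
    }
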
From 2062ee2d269eac5ff78f70ac3133d0a47c22d9fa Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 13:39:00 -0700
Subject: [PATCH 17/32] Added a test to verify that notifications are working
 properly

---
 unsupported/test/CMakeLists.txt               |  1 +
 .../test/cxx11_tensor_notification.cpp        | 72 +++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 unsupported/test/cxx11_tensor_notification.cpp

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 20048515c..6bd8cfb92 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -149,6 +149,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_argmax)
   ei_add_test(cxx11_tensor_shuffling)
   ei_add_test(cxx11_tensor_striding)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_ref)
   ei_add_test(cxx11_tensor_random)
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
new file mode 100644
index 000000000..961d4edf6
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -0,0 +1,72 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Vijay Vasudevan
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include
+#include
+#include "main.h"
+#include
+
+namespace {
+
+void WaitAndAdd(Eigen::Notification* n, int* counter) {
+  n->Wait();
+  *counter = *counter + 1;
+}
+
+}  // namespace
+
+static void test_notification_single()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  sleep(1);
+
+  // The thread should be waiting for the notification.
+  VERIFY_IS_EQUAL(counter, 0);
+
+  // Unblock the thread
+  n.Notify();
+
+  sleep(1);
+
+  // Verify the counter has been incremented
+  VERIFY_IS_EQUAL(counter, 1);
+}
+
+// Like test_notification_single() but enqueues multiple threads to
+// validate that all threads get notified by Notify().
+static void test_notification_multiple()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 0);
+  n.Notify();
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 4);
+}
+
+void test_cxx11_tensor_notification()
+{
+  CALL_SUBTEST(test_notification_single());
+  CALL_SUBTEST(test_notification_multiple());
+}

From 7168afde5e9c3b05823b939a499c6752d2db10f7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 14:21:04 -0700
Subject: [PATCH 18/32] Made the tensor benchmarks compile on MacOS

---
 bench/tensors/tensor_benchmarks.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index d916f787e..a4f97728d 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -333,7 +333,7 @@ template class BenchmarkSuite {

 #ifndef EIGEN_HAS_INDEX_LIST
     Eigen::array sum_along_dim;
-    sum_along_dim = 1;
+    sum_along_dim[0] = 1;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -356,7 +356,7 @@ template class BenchmarkSuite {
     input_size[1] = n_;
     const TensorMap, Eigen::Aligned> B(
         b_, input_size);
-    const Eigen::array output_size;
+    Eigen::array output_size;
     TensorMap, Eigen::Aligned> C(
         c_, output_size);
From 7a570e50ef0a79d52d5762d086954564afce9d61 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:00:06 -0700
Subject: [PATCH 19/32] Fixed contractions of fp16

---
 Eigen/src/Core/arch/CUDA/Half.h                  | 2 +-
 unsupported/test/cxx11_tensor_of_float16_cuda.cu | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 08f6005e4..61131828f 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -63,7 +63,7 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);

 // Class definition.
 struct half : public __half {
-  EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}
+  EIGEN_DEVICE_FUNC half() {}

   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 29b5637e7..cb917bb37 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -134,7 +134,7 @@ void test_cuda_elementwise() {
   gpu_device.deallocate(d_res_float);
 }

-/*
+
 void test_cuda_contractions() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -181,7 +181,7 @@ void test_cuda_contractions() {
   gpu_device.deallocate(d_float2);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
-}*/
+}

 void test_cuda_reductions() {
@@ -244,7 +244,7 @@ void test_cxx11_tensor_of_float16_cuda()
   CALL_SUBTEST_1(test_cuda_conversion());
   CALL_SUBTEST_1(test_cuda_unary());
   CALL_SUBTEST_1(test_cuda_elementwise());
-//  CALL_SUBTEST_2(test_cuda_contractions());
+  CALL_SUBTEST_2(test_cuda_contractions());
   CALL_SUBTEST_3(test_cuda_reductions());

   } else {
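Note on patch 19: making half() trivial (no zero fill) is plausibly what allows declarations like the __shared__ buffers in the contraction kernel: CUDA rejects __shared__ variables whose type requires dynamic initialization, and the old zero-initializing default constructor counted as one. A sketch of the constraint (CUDA):

    __global__ void fill_tile(Eigen::half* out) {
      // Legal only because Eigen::half's default constructor performs no initialization.
      __shared__ Eigen::half tile[64];
      tile[threadIdx.x] = Eigen::half(0.0f);
      __syncthreads();
      out[threadIdx.x] = tile[threadIdx.x];
    }
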
From bff8cbad068a74f1a1f7aa0e80e4424c6353c9fb Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:14:23 -0700
Subject: [PATCH 20/32] Removed executable bit from header files

---
 Eigen/src/Core/AssignEvaluator.h                       | 0
 Eigen/src/Core/Assign_MKL.h                            | 0
 Eigen/src/Core/ProductEvaluators.h                     | 0
 Eigen/src/Core/VectorwiseOp.h                          | 0
 Eigen/src/Core/products/GeneralMatrixVector_MKL.h      | 0
 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h  | 0
 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h   | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 Eigen/src/Core/AssignEvaluator.h
 mode change 100755 => 100644 Eigen/src/Core/Assign_MKL.h
 mode change 100755 => 100644 Eigen/src/Core/ProductEvaluators.h
 mode change 100755 => 100644 Eigen/src/Core/VectorwiseOp.h
 mode change 100755 => 100644 Eigen/src/Core/products/GeneralMatrixVector_MKL.h
 mode change 100755 => 100644 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
 mode change 100755 => 100644 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h

diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
old mode 100755
new mode 100644

From 81d340984ae40642eed46cbfb3a817d841d85de1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:15:02 -0700
Subject: [PATCH 21/32] Removed executable bit from header files

---
 Eigen/src/Eigenvalues/ComplexSchur_MKL.h           | 0
 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h     | 0
 Eigen/src/Eigenvalues/RealQZ.h                     | 0
 Eigen/src/Eigenvalues/RealSchur_MKL.h              | 0
 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 0
 Eigen/src/PardisoSupport/PardisoSupport.h          | 0
 Eigen/src/QR/ColPivHouseholderQR_MKL.h             | 0
 Eigen/src/SVD/JacobiSVD.h                          | 0
 Eigen/src/SparseLU/SparseLU.h                      | 0
 9 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 Eigen/src/Eigenvalues/ComplexSchur_MKL.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/RealQZ.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/RealSchur_MKL.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
 mode change 100755 => 100644 Eigen/src/PardisoSupport/PardisoSupport.h
 mode change 100755 => 100644 Eigen/src/QR/ColPivHouseholderQR_MKL.h
 mode change 100755 => 100644 Eigen/src/SVD/JacobiSVD.h
 mode change 100755 => 100644 Eigen/src/SparseLU/SparseLU.h

diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
old mode 100755
new mode 100644
From 393bc3b16b413598b6c9dcbae722aafb5672d457 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:22:15 -0700
Subject: [PATCH 22/32] Added comment

---
 unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index efe688e50..579519b04 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -13,7 +13,7 @@

 // The array class is only available starting with cxx11. Emulate our own here
-// if needed.
+// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.

 #if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
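Note on patch 22: the preprocessor condition above is why the emulation still matters: MSVC before VS2015 keeps __cplusplus at 199711L even when it accepts the needed C++11 features, and nvcc device code cannot use std::array. Either way, the spelling at call sites stays the same (a sketch; assumes Eigen::array from the tensor module):

    // Same code whether Eigen::array is std::array or the hand-rolled emulation.
    Eigen::array<int, 3> dims;
    dims[0] = 2; dims[1] = 3; dims[2] = 4;
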
From 9bc9396e88789e86647227353e10d90d5316fa98 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:30:06 -0700
Subject: [PATCH 23/32] Use portable includes

---
 unsupported/test/cxx11_tensor_notification.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 961d4edf6..813bc4413 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -10,7 +10,7 @@
 #define EIGEN_USE_THREADS

 #include
-#include
+#include
 #include "main.h"
 #include

From 92693b50eb09cdaeecb830a06df6d6c67a369477 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:40:36 -0700
Subject: [PATCH 24/32] Fixed compilation warning

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 6bbf235cc..f71625ae5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -125,7 +125,7 @@ class TensorExecutor
       int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1;
       const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize)));
-      const Index numblocks = size / blocksize;
+      const unsigned int numblocks = static_cast(size / blocksize);

       Barrier barrier(numblocks);
       for (int i = 0; i < numblocks; ++i) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 489451215..0ce2517d6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -253,7 +253,7 @@ struct FullReducer {
       return;
     } else {
       const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads);
-      const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+      const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0;
       eigen_assert(num_coeffs >= numblocks * blocksize);

       Barrier barrier(numblocks);
@@ -300,7 +300,7 @@ struct FullReducer {
       return;
     }
     const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads);
-    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);

     Barrier barrier(numblocks);

From 41434a8a852e3f2744164bece0487d7e767717b6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:52:38 -0700
Subject: [PATCH 25/32] Avoid unnecessary conversions

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f71625ae5..3408933bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -128,7 +128,7 @@ class TensorExecutor
       const unsigned int numblocks = static_cast(size / blocksize);

       Barrier barrier(numblocks);
-      for (int i = 0; i < numblocks; ++i) {
+      for (unsigned int i = 0; i < numblocks; ++i) {
        device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize);
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 0ce2517d6..9875601ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -258,7 +258,7 @@ struct FullReducer {

       Barrier barrier(numblocks);
       MaxSizeVector shards(numblocks, reducer.initialize());
-      for (Index i = 0; i < numblocks; ++i) {
+      for (unsigned int i = 0; i < numblocks; ++i) {
         device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self,
                                     i * blocksize, blocksize, reducer, &shards[i]);
       }
@@ -271,7 +271,7 @@ struct FullReducer {
         finalShard = reducer.initialize();
       }
       barrier.Wait();
-      for (Index i = 0; i < numblocks; ++i) {
+      for (unsigned int i = 0; i < numblocks; ++i) {
         reducer.reduce(shards[i], &finalShard);
       }
       *output = reducer.finalize(finalShard);
@@ -305,7 +305,7 @@ struct FullReducer {

     Barrier barrier(numblocks);
     MaxSizeVector shards(numblocks, reducer.initialize());
-    for (Index i = 0; i < numblocks; ++i) {
+    for (unsigned int i = 0; i < numblocks; ++i) {
       device.enqueue_with_barrier(&barrier, &FullReducerShard::run,
                                   self, i * blocksize, blocksize, reducer,
                                   &shards[i]);
     }
@@ -319,7 +319,7 @@ struct FullReducer {

     barrier.Wait();
-    for (Index i = 0; i < numblocks; ++i) {
+    for (unsigned int i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);

From 2e4e4cb74de19de0997567a4d058d1522ec7e452 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:57:12 -0700
Subject: [PATCH 26/32] Use numext::abs instead of abs to avoid incorrect
 conversion to integer of the argument

---
 Eigen/src/Core/SpecialFunctions.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index c12e41a7b..37ebb5915 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -576,7 +576,7 @@ struct igammac_impl {
       pkm1 = pk;
       qkm2 = qkm1;
       qkm1 = qk;
-      if (abs(pk) > big) {
+      if (numext::abs(pk) > big) {
         pkm2 *= biginv;
         pkm1 *= biginv;
         qkm2 *= biginv;
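Note on patch 26: unqualified abs is a classic trap in templated numeric code: depending on which headers and using-declarations are in scope, abs(pk) for a floating-point pk can resolve to ::abs(int) from <cstdlib>, truncating the argument before the absolute value is taken; numext::abs dispatches to the correct overload on both host and device. A reduced illustration of the trap:

    #include <cstdlib>  // brings in ::abs(int) but no floating-point overloads

    bool exceeds(double pk, double big) {
      return abs(pk) > big;  // may call ::abs(int): abs(0.9) quietly evaluates to 0
    }
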
---
 unsupported/test/cxx11_tensor_cuda.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 1964d9e07..4d8465756 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -853,6 +853,10 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_3(test_cuda_convolution_3d());
   CALL_SUBTEST_3(test_cuda_convolution_3d());

+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on were only added in c++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a c++11 compiler.
   CALL_SUBTEST_4(test_cuda_lgamma(1.0f));
   CALL_SUBTEST_4(test_cuda_lgamma(100.0f));
   CALL_SUBTEST_4(test_cuda_lgamma(0.01f));
@@ -860,6 +864,7 @@ void test_cxx11_tensor_cuda()

   CALL_SUBTEST_4(test_cuda_digamma());

+
   CALL_SUBTEST_4(test_cuda_erf(1.0f));
   CALL_SUBTEST_4(test_cuda_erf(100.0f));
   CALL_SUBTEST_4(test_cuda_erf(0.01f));
@@ -894,4 +899,5 @@ void test_cxx11_tensor_cuda()

   CALL_SUBTEST_5(test_cuda_igamma());
   CALL_SUBTEST_5(test_cuda_igammac());
+#endif
 }

From 0968e925a040d4988f02e8476b5cea8518e5f966 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 24 Mar 2016 18:00:33 -0700
Subject: [PATCH 28/32] Updated the benchmarking code to use Eigen::half
 instead of half

---
 bench/tensors/tensor_benchmarks_fp16_gpu.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 49f75472a..35c6f7489 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -12,7 +12,7 @@
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite suite(device, N);                      \
+    BenchmarkSuite suite(device, N);                      \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters);                                    \
   }                                                       \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite suite(device, D1, D2, D3);             \
+    BenchmarkSuite suite(device, D1, D2, D3);             \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters);                                    \
   }                                                       \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite suite(device, N);                      \
+    BenchmarkSuite suite(device, N);                      \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters, DIM1, DIM2);                        \
   }                                                       \
@@ -73,4 +73,4 @@ BM_FuncWithKernelDimsGPU(convolution, 7, 4);
 BM_FuncWithKernelDimsGPU(convolution, 4, 7);
 BM_FuncWithKernelDimsGPU(convolution, 7, 64);
 BM_FuncWithKernelDimsGPU(convolution, 64, 7);
-*/
\ No newline at end of file
+*/
From a86c9f037b24312863ad2a74a583369581c6e21a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 24 Mar 2016 18:54:31 -0700
Subject: [PATCH 29/32] Fixed compilation error on windows

---
 unsupported/test/cxx11_tensor_notification.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 813bc4413..c946007b8 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -9,11 +9,20 @@

 #define EIGEN_USE_THREADS

-#include
 #include
 #include "main.h"
 #include

+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include
+void sleep(int seconds) {
+  Sleep(seconds*1000);
+}
+#else
+#include
+#endif
+
+
 namespace {

 void WaitAndAdd(Eigen::Notification* n, int* counter) {

From d94f6ba9659f8c953caaff854552070ce149958b Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 25 Mar 2016 11:02:56 -0700
Subject: [PATCH 30/32] Started to model the cost of divisions more accurately.

---
 Eigen/src/Core/NumTraits.h               | 17 +++++++++++++++++
 Eigen/src/Core/functors/BinaryFunctors.h | 12 +++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 7ddb4a867..b7b5e7d22 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -60,6 +60,23 @@ template struct GenericNumTraits
     MulCost = 1
   };

+  // Division is messy but important, because it is expensive and throughput
+  // varies significantly. The following numbers are based on min division
+  // throughput on Haswell.
+  template
+  struct Div {
+    enum {
+#ifdef EIGEN_VECTORIZE_AVX
+      AVX = true,
+#else
+      AVX = false,
+#endif
+      Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
+             Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
+    };
+  };
+
+
   typedef T Real;
   typedef typename internal::conditional<
     IsInteger,
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 5cdfff845..d04323bb0 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -238,7 +238,13 @@ template struct scalar_hypot_op {
 };
 template
 struct functor_traits >
-{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess=0 }; };
+{
+  enum
+  {
+    Cost = 3 * NumTraits::AddCost +
+           2 * NumTraits::MulCost +
+           2 * NumTraits::template Div::Cost,
+    PacketAccess = false
+  };
+};

 /** \internal
@@ -564,6 +570,10 @@ struct scalar_inverse_mult_op {
   { return internal::pdiv(pset1(m_other),a); }
   Scalar m_other;
 };
+template
+struct functor_traits >
+{ enum { PacketAccess = packet_traits::HasDiv, Cost = NumTraits::template Div::Cost }; };
+

 } // end namespace internal
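Note on patch 30: the per-type, scalar-versus-packet division costs are what Eigen's evaluators consult when sizing work (unrolling, or deciding whether a division-heavy expression is worth parallelizing). A sketch of how a trait can read them, mirroring the BinaryFunctors.h hunk; the template arguments were lost in this copy of the diff, so treat the exact spelling as an assumption:

    template <typename Scalar>
    struct div_cost_probe {
      enum {
        ScalarCost = Eigen::NumTraits<Scalar>::template Div<false>::Cost,  // one scalar division
        PacketCost = Eigen::NumTraits<Scalar>::template Div<true>::Cost    // one vectorized division
      };
    };
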
From 65716e99a5763f536257eb1dd047f34f8172f816 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 25 Mar 2016 11:13:53 -0700
Subject: [PATCH 31/32] Improved the cost estimate of the quotient op

---
 Eigen/src/Core/functors/BinaryFunctors.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index d04323bb0..e28fecfd0 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -303,9 +303,10 @@ template struct scalar_quotient_op {
 };
 template
 struct functor_traits > {
+  typedef typename scalar_quotient_op::result_type result_type;
   enum {
-    Cost = (NumTraits::MulCost + NumTraits::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op::Vectorizable
+    PacketAccess = scalar_quotient_op::Vectorizable,
+    Cost = NumTraits::template Div::Cost
   };
 };

From 74f91ed06c615fc7d875bd30cb72ea5e08504be2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 25 Mar 2016 17:21:56 -0700
Subject: [PATCH 32/32] Improved support for integer modulo

---
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index c71a30d21..eb0c8d1ce 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,20 @@ struct scalar_mod_op {
 };
 template
 struct functor_traits >
-{ enum { Cost = 2 * NumTraits::MulCost, PacketAccess = false }; };
+{ enum { Cost = NumTraits::template Div::Cost, PacketAccess = false }; };
+
+
+/** \internal
+ * \brief Template functor to compute the modulo between 2 arrays.
+ */
+template
+struct scalar_mod2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template
+struct functor_traits >
+{ enum { Cost = NumTraits::template Div::Cost, PacketAccess = false }; };

 /** \internal
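Note on patch 32: scalar_mod_op keeps its divisor inside the functor, while the new scalar_mod2_op is the genuinely element-wise a % b. A hedged sketch of applying the binary form to integer tensors (assumes the tensor module's binaryExpr):

    Eigen::Tensor<int, 1> a(8), b(8);
    a.setConstant(7);
    b.setConstant(3);
    Eigen::Tensor<int, 1> r = a.binaryExpr(b, Eigen::internal::scalar_mod2_op<int>());  // r[i] = a[i] % b[i]
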