From db5c14de424ef3b43c4afb1aedf6a6f8e5640a06 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 09:52:58 -0700
Subject: [PATCH 01/32] Explicitly cast the default value into the proper
 scalar type.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 9597577b9..6ee9c88b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -607,7 +607,7 @@ class TensorBase
     const TensorVolumePatchOp
     extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
                            const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
-                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = 0) const {
+                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
     }

@@ -619,7 +619,7 @@ class TensorBase
                            const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
                            const Index padding_top_z, const Index padding_bottom_z,
                            const Index padding_top, const Index padding_bottom,
-                           const Index padding_left, const Index padding_right, const Scalar padding_value = 0) const {
+                           const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
       return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
     }
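Note on patch 01: the explicit Scalar(0) matters once Scalar can be a type such as Eigen::half, whose converting constructors are explicit, so a literal 0 default argument no longer converts implicitly. A minimal sketch of the failure mode (Half16 is a hypothetical stand-in for such a scalar):

    // Hypothetical scalar with only explicit construction, like Eigen::half.
    struct Half16 {
      explicit Half16(int v) : raw(static_cast<unsigned short>(v)) {}
      unsigned short raw;
    };

    template <typename Scalar>
    void extract(Scalar padding_value = 0);            // fails for Scalar = Half16 when the
                                                       // default is used: no implicit int -> Half16
    template <typename Scalar>
    void extract_ok(Scalar padding_value = Scalar(0)); // compiles: explicit construction
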
From e91f25530117a30e1bf71387c9864e3ac601b9ba Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 10:02:00 -0700
Subject: [PATCH 02/32] Marked variables that are only used in debug mode as
 such

---
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 5d73d62d2..1fb27a65b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -111,6 +111,7 @@ struct TensorEvaluator, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
+    EIGEN_UNUSED_VARIABLE(scalar);
     eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }

From 7a07d6aa2bd2c6c5a9f93896ad34fba8bd9189fe Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 11:12:17 -0700
Subject: [PATCH 03/32] Small cleanup

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index afde7b3d2..e57ba9d9d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -54,7 +54,6 @@ class TensorReshapingOp : public TensorBase::Scalar Scalar;
-  typedef typename Eigen::NumTraits::Real RealScalar;
   typedef typename internal::remove_const::type CoeffReturnType;
   typedef typename Eigen::internal::nested::type Nested;
   typedef typename Eigen::internal::traits::StorageKind StorageKind;
@@ -234,7 +233,6 @@ class TensorSlicingOp : public TensorBase::Scalar Scalar;
-  typedef typename Eigen::NumTraits::Real RealScalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename Eigen::internal::nested::type Nested;
   typedef typename Eigen::internal::traits::StorageKind StorageKind;

From 8ef3181f15a9be76ac783bedd2926ee6f4c69a2f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 21 Mar 2016 11:24:05 -0700
Subject: [PATCH 04/32] Worked around a constness related issue

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index e57ba9d9d..a9c222ea0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -142,7 +142,7 @@ struct TensorEvaluator, Device>
     return m_impl.template packet(index);
   }

-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); }

   const TensorEvaluator& impl() const { return m_impl; }
From f9ad25e4d8453c4265a5fd6d4962a76a386564df Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 09:30:23 -0700
Subject: [PATCH 05/32] Fixed contractions of 16 bit floats

---
 Eigen/src/Core/arch/CUDA/Half.h               | 20 +++++++++----------
 .../CXX11/src/Tensor/TensorContractionCuda.h  | 10 +++++-----
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index c385b882a..921c5bcb2 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -73,8 +73,6 @@ struct half : public __half {
       : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half(const volatile half& h)
-      : __half(internal::raw_uint16_to_half(h.x)) {}

   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
     return internal::half_to_float(*this);
@@ -87,14 +85,6 @@ struct half : public __half {
     x = other.x;
     return *this;
   }
-  EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) {
-    x = other.x;
-    return *this;
-  }
-  EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile {
-    x = other.x;
-    return *this;
-  }
 };

 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
@@ -341,4 +331,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {

 } // end namespace std

+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+__device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  return static_cast(__shfl_xor(static_cast(var), laneMask, width));
+}
+
+#endif
+
+
 #endif // EIGEN_HALF_CUDA_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index a4a06ab5f..dbff660a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -20,7 +20,7 @@ template
 __device__ EIGEN_STRONG_INLINE void
 EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
                                const Index m_size, const Index n_size, const Index k_size) {

   const Index m_block_idx = blockIdx.x;
@@ -319,8 +319,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   Scalar rrow(7);

   // Now x corresponds to k, y to m, and z to n
-  const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-  const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+  const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+  const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];

 #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
 #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
@@ -503,8 +503,8 @@ __launch_bounds__(512)
 EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
                        const OutputMapper output,
                        const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ volatile Scalar lhs_shmem[72 * 64];
-  __shared__ volatile Scalar rhs_shmem[72 * 64];
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];

   const Index m_block_idx = blockIdx.x;
   const Index n_block_idx = blockIdx.y;
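Note on patch 05: dropping the volatile qualifiers makes the shared-memory contraction kernel usable with Eigen::half again, and the new __shfl_xor overload lets warp-level code written against a generic Scalar compile for half by bouncing through float. A sketch of the kind of butterfly reduction such an overload enables (CUDA; assumes a full 32-lane warp and the pre-Volta __shfl_xor semantics used here):

    template <typename Scalar>
    __device__ Scalar warp_sum(Scalar x) {
      // XOR butterfly: after log2(32) = 5 steps every lane holds the warp total.
      for (int mask = 16; mask > 0; mask /= 2) {
        x = x + __shfl_xor(x, mask);  // resolves to the Eigen::half overload when Scalar is half
      }
      return x;
    }
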
From 65a7113a36f70aeca34eac29f32b24ef865cb6e4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 09:33:54 -0700
Subject: [PATCH 06/32] Use an enum instead of a static const int to prevent
 possible link error

---
 unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index d6ad65070..6af2d45d4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -41,7 +41,7 @@ template <> struct max_n_1<0> {
 template
 struct PacketType {
   typedef typename internal::packet_traits::type type;
-  static const int size = internal::unpacket_traits::size;
+  enum { size = internal::unpacket_traits::size };
 };

 // For CUDA packet types when using a GpuDevice
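Note on patch 06: an in-class static const int that is odr-used (for example, bound to a const reference) requires an out-of-line definition, and a missing one only surfaces as an undefined-symbol error at link time; an enumerator is a plain rvalue and never needs storage. A reduced illustration with hypothetical names (pre-C++17 rules, which is what applies here):

    struct WithStaticConst {
      static const int size = 4;  // declaration only; no definition in any translation unit
    };
    struct WithEnum {
      enum { size = 4 };
    };

    int take_ref(const int& v) { return v; }

    int use() {
      int a = take_ref(WithStaticConst::size);  // may fail to link: size is odr-used
      int b = take_ref(WithEnum::size);         // always fine: a temporary int is materialized
      return a + b;
    }
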
From 6a31b7be3ea29a5300ff575f0bada876b70904d6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 14:02:50 -0700
Subject: [PATCH 07/32] Avoid using std::vector whenever possible

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index dcbef5b03..b282f5c07 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads), waiting_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
@@ -110,8 +110,8 @@ class ThreadPool : public ThreadPoolInterface {
   };

   std::mutex mu_;
-  std::vector threads_;  // All threads
-  std::vector waiters_;  // Stack of waiting threads.
+  MaxSizeVector threads_;  // All threads
+  MaxSizeVector waiters_;  // Stack of waiting threads.
   std::deque> pending_;    // Queue of pending work
   std::condition_variable empty_;  // Signaled on pending_.empty()
   bool exiting_ = false;

From e7a468c5b78295e26d970372336bd5f73c90ae34 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 14:26:50 -0700
Subject: [PATCH 08/32] Filter some compilation flags that nvcc warns about.

---
 unsupported/test/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 19893cc25..20048515c 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -175,9 +175,14 @@ endif()
 # These tests needs nvcc
 find_package(CUDA 7.0)
 if(CUDA_FOUND AND EIGEN_TEST_NVCC)
-  # Mke sure to compile without the -pedantic and -Wundef flags since they trigger thousands of compilation warnings in the CUDA runtime
+  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
+  # and -fno-check-new flags since they trigger thousands of compilation warnings
+  # in the CUDA runtime
   string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
   message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})

   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")

From bc2b8027514b27b3c67800d5c951e5d532f76f02 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 14:27:34 -0700
Subject: [PATCH 09/32] Fixed a couple of typos

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index b282f5c07..4d803c95b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) : threads_(num_threads), waiting_(num_threads) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads, NULL), waiting_(num_threads, NULL) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
From 002cf0d1c979857e057879d8c84b92439dbcc90d Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 15:24:23 -0700
Subject: [PATCH 10/32] Use a single Barrier instead of a collection of
 Notifications to reduce the thread synchronization overhead

---
 unsupported/Eigen/CXX11/Tensor                     |  1 +
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h      | 75 +++++++++++++++----
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        | 10 +--
 .../Eigen/CXX11/src/Tensor/TensorReduction.h       | 25 +++----
 4 files changed, 73 insertions(+), 38 deletions(-)

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 969f25481..16132398d 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -51,6 +51,7 @@ typedef unsigned __int64 uint64_t;
 #endif

 #ifdef EIGEN_USE_THREADS
+#include
 #include
 #include
 #include
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index dcbef5b03..e4165bbf8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -118,47 +118,82 @@ class ThreadPool : public ThreadPoolInterface {
 };

-// Notification is an object that allows a user to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object.
-// but only one caller must call Notify() on the object.
-class Notification {
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+class Barrier {
  public:
-  Notification() : notified_(false) {}
-  ~Notification() {}
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() {
+    eigen_assert((state_>>1) == 0);
+  }

   void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      eigen_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
     std::unique_lock l(mu_);
     eigen_assert(!notified_);
     notified_ = true;
     cv_.notify_all();
   }

-  void WaitForNotification() {
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
     std::unique_lock l(mu_);
-    cv_.wait(l, [this]() { return notified_; } );
+    while (!notified_) {
+      cv_.wait(l);
+    }
   }

  private:
   std::mutex mu_;
   std::condition_variable cv_;
+  std::atomic state_;  // low bit is waiter flag
   bool notified_;
 };

+
+// Notification is an object that allows a user to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1) {};
+};
+
+
 // Runs an arbitrary function and then calls Notify() on the passed in
 // Notification.
-template struct FunctionWrapper
+template struct FunctionWrapperWithNotification
 {
   static void run(Notification* n, Function f, Args... args) {
     f(args...);
-    n->Notify();
+    if (n) {
+      n->Notify();
+    }
   }
 };

+template struct FunctionWrapperWithBarrier
+{
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
-static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
+template
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
   if (n) {
-    n->WaitForNotification();
+    n->Wait();
   }
 }

@@ -203,10 +238,20 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
     std::function func =
-      std::bind(&FunctionWrapper::run, n, f, args...);
+      std::bind(&FunctionWrapperWithNotification::run, n, f, args...);
     pool_->Schedule(func);
     return n;
   }
+
+  template
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
+                                                Function&& f,
+                                                Args&&... args) const {
+    std::function func = std::bind(
+        &FunctionWrapperWithBarrier::run, b, f, args...);
+    pool_->Schedule(func);
+  }
+
   template
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
     std::function func = std::bind(f, args...);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 54da77bcf..6bbf235cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -127,20 +127,16 @@ class TensorExecutor
       const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize)));
       const Index numblocks = size / blocksize;

-      MaxSizeVector results(numblocks);
+      Barrier barrier(numblocks);
       for (int i = 0; i < numblocks; ++i) {
-        results.push_back(device.enqueue(&EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize));
+        device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize);
       }

       if (numblocks * blocksize < size) {
         EvalRange::run(evaluator, numblocks * blocksize, size);
       }

-      for (int i = 0; i < numblocks; ++i) {
-        wait_until_ready(results[i]);
-        delete results[i];
-      }
-
+      barrier.Wait();
     }
     evaluator.cleanup();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index fe1dc22ee..489451215 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -256,12 +256,11 @@ struct FullReducer {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);

-    MaxSizeVector results(numblocks);
+    Barrier barrier(numblocks);
     MaxSizeVector shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
-      results.push_back(
-          device.enqueue(&FullReducerShard::run, self,
-                         i * blocksize, blocksize, reducer, &shards[i]));
+      device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self,
+                                  i * blocksize, blocksize, reducer, &shards[i]);
     }

     typename Self::CoeffReturnType finalShard;
@@ -271,10 +270,7 @@ struct FullReducer {
     } else {
       finalShard = reducer.initialize();
     }
-    for (Index i = 0; i < numblocks; ++i) {
-      wait_until_ready(results[i]);
-      delete results[i];
-    }
+    barrier.Wait();
     for (Index i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
@@ -307,12 +303,12 @@ struct FullReducer {
     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);

-    MaxSizeVector results(numblocks);
+    Barrier barrier(numblocks);
     MaxSizeVector shards(numblocks, reducer.initialize());
     for (Index i = 0; i < numblocks; ++i) {
-      results.push_back(device.enqueue(&FullReducerShard::run,
-                                       self, i * blocksize, blocksize, reducer,
-                                       &shards[i]));
+      device.enqueue_with_barrier(&barrier, &FullReducerShard::run,
+                                  self, i * blocksize, blocksize, reducer,
+                                  &shards[i]);
     }
     typename Self::CoeffReturnType finalShard;
     if (numblocks * blocksize < num_coeffs) {
@@ -322,10 +318,7 @@ struct FullReducer {
       finalShard = reducer.initialize();
     }

-    for (Index i = 0; i < numblocks; ++i) {
-      wait_until_ready(results[i]);
-      delete results[i];
-    }
+    barrier.Wait();
     for (Index i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);
From 3d1e857327a3ef5cfa8b65f2204c28bf405731d4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 15:48:28 -0700
Subject: [PATCH 11/32] Fixed compilation error

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 4b8eda6bb..23b1765ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -27,7 +27,7 @@ class ThreadPoolInterface {
 class ThreadPool : public ThreadPoolInterface {
  public:
   // Construct a pool that contains "num_threads" threads.
-  explicit ThreadPool(int num_threads) : threads_(num_threads, NULL), waiting_(num_threads, NULL) {
+  explicit ThreadPool(int num_threads) : threads_(num_threads), waiters_(num_threads) {
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
     }
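Note on patches 10 and 11: the resulting fork-join idiom seeds the Barrier with the task count, has each task call Notify() exactly once, and replaces one WaitForNotification() plus one delete per task with a single Wait(). A sketch against the interface introduced above ('work' is a hypothetical per-block function):

    void work(unsigned int block);  // hypothetical per-block kernel

    void parallel_apply(const Eigen::ThreadPoolDevice& device, unsigned int numblocks) {
      Eigen::Barrier barrier(numblocks);  // expects exactly numblocks Notify() calls
      for (unsigned int i = 0; i < numblocks; ++i) {
        device.enqueue_with_barrier(&barrier, &work, i);
      }
      barrier.Wait();  // one blocking call; no per-task Notification objects to free
    }
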
From 28e02996df54240d44ead1bf827b867c22a224a9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 22 Mar 2016 16:53:57 -0700
Subject: [PATCH 12/32] Merged patch 672 from Justin Lebar: Don't use long
 doubles with cuda

---
 test/main.h                                  | 2 ++
 unsupported/test/cxx11_tensor_argmax_cuda.cu | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/main.h b/test/main.h
index 2797e8623..e5f1a9ad5 100644
--- a/test/main.h
+++ b/test/main.h
@@ -331,11 +331,13 @@ inline bool test_isApprox(const std::complex& a, const std::complex& a, const std::complex& b)
 { return internal::isMuchSmallerThan(a, b, test_precision >()); }

+#ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const std::complex& a, const std::complex& b)
 { return internal::isApprox(a, b, test_precision >()); }
 inline bool test_isMuchSmallerThan(const std::complex& a, const std::complex& b)
 { return internal::isMuchSmallerThan(a, b, test_precision >()); }
 #endif
+#endif

 #ifndef EIGEN_TEST_NO_LONGDOUBLE
 inline bool test_isApprox(const long double& a, const long double& b)
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 45311d4f7..41ccbe974 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-// TODO(mdevin): Free the cuda memory.
+#define EIGEN_TEST_NO_LONGDOUBLE

 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
From 9642fd7a937942037a3ea0d3c51b799be197782f Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg
Date: Wed, 23 Mar 2016 15:37:45 +0100
Subject: [PATCH 13/32] Replace all M_PI by EIGEN_PI and add a check to the
 testsuite.

---
 test/main.h                                             | 4 ++++
 unsupported/Eigen/OpenGLSupport                         | 4 ++--
 unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 8 ++------
 unsupported/Eigen/src/MatrixFunctions/MatrixPower.h     | 4 ++--
 unsupported/test/matrix_function.cpp                    | 4 ++--
 unsupported/test/matrix_power.cpp                       | 2 +-
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/main.h b/test/main.h
index e5f1a9ad5..bba5e7570 100644
--- a/test/main.h
+++ b/test/main.h
@@ -58,6 +58,10 @@
 #define isnan(X) please_protect_your_isnan_with_parentheses
 #define isinf(X) please_protect_your_isinf_with_parentheses
 #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI please_use_EIGEN_PI_instead_of_M_PI

 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
index 288c6b0fb..87f50947d 100644
--- a/unsupported/Eigen/OpenGLSupport
+++ b/unsupported/Eigen/OpenGLSupport
@@ -180,11 +180,11 @@ template void glLoadMatrix(const Transform& rot)
 {
-  glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
+  glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
 }

 inline void glRotate(const Rotation2D& rot)
 {
-  glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
+  glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
 }

 template void glRotate(const RotationBase& rot)
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 463d7be0c..e43e86e90 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -11,10 +11,6 @@
 #ifndef EIGEN_MATRIX_LOGARITHM
 #define EIGEN_MATRIX_LOGARITHM

-#ifndef M_PI
-#define M_PI 3.141592653589793238462643383279503L
-#endif
-
 namespace Eigen {

 namespace internal {
@@ -65,8 +61,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+    int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
   }
 }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index 1e5a59c55..f37d31c3f 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -298,8 +298,8 @@ MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
+  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 487d5a9b8..9a995f941 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)

   MatrixType scaledA;
   RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
-  if (maxImagPartOfSpectrum >= 0.9 * M_PI)
-    scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum;
+  if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
+    scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
   else
     scaledA = A;
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index baf183d12..8e104ed1e 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -24,7 +24,7 @@ void test2dRotation(double tol)
     s = std::sin(angle);
     B << c, s, -s, c;

-    C = Apow(std::ldexp(angle,1) / M_PI);
+    C = Apow(std::ldexp(angle,1) / EIGEN_PI);
     std::cout << "test2dRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
     VERIFY(C.isApprox(B, tol));
   }
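Note on patch 13: beyond the substitution, the test/main.h hunk poisons M_PI rather than merely undefining it, so any future use inside the test suite fails to compile with a self-describing identifier instead of silently depending on a non-standard macro. The trick in isolation:

    #ifdef M_PI
    #undef M_PI
    #endif
    #define M_PI please_use_EIGEN_PI_instead_of_M_PI

    double bad() { return 2.0 * M_PI; }  // error: 'please_use_EIGEN_PI_instead_of_M_PI' was not declared
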
From 6971146ca9e4b5870404974397a81d125b2418d4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 09:44:52 -0700
Subject: [PATCH 14/32] Added more conversion operators for half floats

---
 Eigen/src/Core/arch/CUDA/Half.h | 55 ++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 921c5bcb2..f997735aa 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -65,20 +65,61 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);

 struct half : public __half {
   EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}

-  // TODO(sesse): Should these conversions be marked as explicit?
-  EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {}
-  EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast(i))) {}
-  EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(static_cast(d))) {}
-  EIGEN_DEVICE_FUNC half(bool b)
-      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}

+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  explicit EIGEN_DEVICE_FUNC half(int i)
+      : __half(internal::float_to_half_rtne(static_cast(i))) {}
+  explicit EIGEN_DEVICE_FUNC half(long l)
+      : __half(internal::float_to_half_rtne(static_cast(l))) {}
+  explicit EIGEN_DEVICE_FUNC half(long long ll)
+      : __half(internal::float_to_half_rtne(static_cast(ll))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : __half(internal::float_to_half_rtne(f)) {}
+  explicit EIGEN_DEVICE_FUNC half(double d)
+      : __half(internal::float_to_half_rtne(static_cast(d))) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return static_cast(x & 0x7fff);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast(internal::half_to_float(*this));
+  }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
     return internal::half_to_float(*this);
   }
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return internal::half_to_float(*this);
+    return static_cast(internal::half_to_float(*this));
   }

   EIGEN_DEVICE_FUNC half& operator=(const half& other) {
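Note on patch 14: with every converting constructor and conversion operator now explicit, mixed-type arithmetic around Eigen::half has to spell out its casts, which is the intent: implicit float-to-half round trips were easy to introduce by accident. A hedged usage sketch:

    Eigen::half h(1.5f);              // fine: explicit construction from float
    float f = static_cast<float>(h);  // fine: explicit widening
    int i = static_cast<int>(h);      // fine: converts through float, then truncates
    // float g = h;                   // no longer compiles: the conversion operator is explicit
    // Eigen::half k = 2;             // no longer compiles: the int constructor is explicit
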
From 0e6888260459b31dac1bd3411b0e8f688f6d22a2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 09:46:42 -0700
Subject: [PATCH 15/32] Added the ability to divide a half float by an index

---
 Eigen/src/Core/arch/CUDA/Half.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index f997735aa..08f6005e4 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -234,6 +234,12 @@ static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
 #endif  // Emulate support for half floats

+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+static inline EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return Eigen::half(static_cast(a) / static_cast(b));
+}
+
 // Conversion routines, including fallbacks for the host or older CUDA.
 // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
 // these in hardware. If we need more performance on older/other CPUs, they are

From fc3660285fe326744eb67711126d2764a1f97100 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 09:56:50 -0700
Subject: [PATCH 16/32] Made type conversion explicit

---
 Eigen/src/Core/arch/CUDA/TypeCasting.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index 4c0433267..b2a9724de 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -114,8 +114,8 @@ template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   return __float22half2_rn(make_float2(a.x, a.y));
 #else
-  half r1 = a.x;
-  half r2 = a.y;
+  half r1 = static_cast(a.x);
+  half r2 = static_cast(a.y);
   half2 r;
   r.x = 0;
   r.x |= r1.x;
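Note on patch 15 above: the half-by-index division exists chiefly so reductions can turn a sum into a mean. Doing the division in float matters because a large integer denominator may not even be representable in half (anything above 65504 overflows to infinity). A sketch, assuming Eigen's signed Index type:

    // Mean of n half-precision values.
    Eigen::half mean(const Eigen::half* data, Eigen::Index n) {
      Eigen::half sum(0.0f);
      for (Eigen::Index i = 0; i < n; ++i) sum = sum + data[i];
      return sum / n;  // operator/(const half&, Index): divides in full float precision
    }
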
From 2062ee2d269eac5ff78f70ac3133d0a47c22d9fa Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 13:39:00 -0700
Subject: [PATCH 17/32] Added a test to verify that notifications are working
 properly

---
 unsupported/test/CMakeLists.txt               |  1 +
 .../test/cxx11_tensor_notification.cpp        | 72 +++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 unsupported/test/cxx11_tensor_notification.cpp

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 20048515c..6bd8cfb92 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -149,6 +149,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_argmax)
   ei_add_test(cxx11_tensor_shuffling)
   ei_add_test(cxx11_tensor_striding)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_ref)
   ei_add_test(cxx11_tensor_random)
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
new file mode 100644
index 000000000..961d4edf6
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -0,0 +1,72 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Vijay Vasudevan
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include
+#include
+#include "main.h"
+#include
+
+namespace {
+
+void WaitAndAdd(Eigen::Notification* n, int* counter) {
+  n->Wait();
+  *counter = *counter + 1;
+}
+
+}  // namespace
+
+static void test_notification_single()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  sleep(1);
+
+  // The thread should be waiting for the notification.
+  VERIFY_IS_EQUAL(counter, 0);
+
+  // Unblock the thread
+  n.Notify();
+
+  sleep(1);
+
+  // Verify the counter has been incremented
+  VERIFY_IS_EQUAL(counter, 1);
+}
+
+// Like test_notification_single() but enqueues multiple threads to
+// validate that all threads get notified by Notify().
+static void test_notification_multiple()
+{
+  ThreadPool thread_pool(1);
+
+  int counter = 0;
+  Eigen::Notification n;
+  std::function func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 0);
+  n.Notify();
+  sleep(1);
+  VERIFY_IS_EQUAL(counter, 4);
+}
+
+void test_cxx11_tensor_notification()
+{
+  CALL_SUBTEST(test_notification_single());
+  CALL_SUBTEST(test_notification_multiple());
+}

From 7168afde5e9c3b05823b939a499c6752d2db10f7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 14:21:04 -0700
Subject: [PATCH 18/32] Made the tensor benchmarks compile on MacOS

---
 bench/tensors/tensor_benchmarks.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index d916f787e..a4f97728d 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -333,7 +333,7 @@ template class BenchmarkSuite {

 #ifndef EIGEN_HAS_INDEX_LIST
     Eigen::array sum_along_dim;
-    sum_along_dim = 1;
+    sum_along_dim[0] = 1;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -356,7 +356,7 @@ template class BenchmarkSuite {
     input_size[1] = n_;
     const TensorMap, Eigen::Aligned> B(
         b_, input_size);
-    const Eigen::array output_size;
+    Eigen::array output_size;
     TensorMap, Eigen::Aligned> C(
         c_, output_size);
From 7a570e50ef0a79d52d5762d086954564afce9d61 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:00:06 -0700
Subject: [PATCH 19/32] Fixed contractions of fp16

---
 Eigen/src/Core/arch/CUDA/Half.h                  | 2 +-
 unsupported/test/cxx11_tensor_of_float16_cuda.cu | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 08f6005e4..61131828f 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -63,7 +63,7 @@ static inline EIGEN_DEVICE_FUNC float half_to_float(__half h);

 // Class definition.
 struct half : public __half {
-  EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {}
+  EIGEN_DEVICE_FUNC half() {}

   EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 29b5637e7..cb917bb37 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -134,7 +134,7 @@ void test_cuda_elementwise() {
   gpu_device.deallocate(d_res_float);
 }

-/*
+
 void test_cuda_contractions() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -181,7 +181,7 @@ void test_cuda_contractions() {
   gpu_device.deallocate(d_float2);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
-}*/
+}

 void test_cuda_reductions() {
@@ -244,7 +244,7 @@ void test_cxx11_tensor_of_float16_cuda()
   CALL_SUBTEST_1(test_cuda_conversion());
   CALL_SUBTEST_1(test_cuda_unary());
   CALL_SUBTEST_1(test_cuda_elementwise());
-//  CALL_SUBTEST_2(test_cuda_contractions());
+  CALL_SUBTEST_2(test_cuda_contractions());
   CALL_SUBTEST_3(test_cuda_reductions());

   } else {
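Note on patch 19: making half() trivial (no zero fill) is plausibly what allows declarations like the __shared__ buffers in the contraction kernel: CUDA rejects __shared__ variables whose type requires dynamic initialization, and the old zero-initializing default constructor counted as one. A sketch of the constraint (CUDA):

    __global__ void fill_tile(Eigen::half* out) {
      // Legal only because Eigen::half's default constructor performs no initialization.
      __shared__ Eigen::half tile[64];
      tile[threadIdx.x] = Eigen::half(0.0f);
      __syncthreads();
      out[threadIdx.x] = tile[threadIdx.x];
    }
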
From bff8cbad068a74f1a1f7aa0e80e4424c6353c9fb Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:14:23 -0700
Subject: [PATCH 20/32] Removed executable bit from header files

---
 Eigen/src/Core/AssignEvaluator.h                       | 0
 Eigen/src/Core/Assign_MKL.h                            | 0
 Eigen/src/Core/ProductEvaluators.h                     | 0
 Eigen/src/Core/VectorwiseOp.h                          | 0
 Eigen/src/Core/products/GeneralMatrixVector_MKL.h      | 0
 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h  | 0
 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h   | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 Eigen/src/Core/AssignEvaluator.h
 mode change 100755 => 100644 Eigen/src/Core/Assign_MKL.h
 mode change 100755 => 100644 Eigen/src/Core/ProductEvaluators.h
 mode change 100755 => 100644 Eigen/src/Core/VectorwiseOp.h
 mode change 100755 => 100644 Eigen/src/Core/products/GeneralMatrixVector_MKL.h
 mode change 100755 => 100644 Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
 mode change 100755 => 100644 Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h

diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
old mode 100755
new mode 100644

From 81d340984ae40642eed46cbfb3a817d841d85de1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:15:02 -0700
Subject: [PATCH 21/32] Removed executable bit from header files

---
 Eigen/src/Eigenvalues/ComplexSchur_MKL.h           | 0
 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h     | 0
 Eigen/src/Eigenvalues/RealQZ.h                     | 0
 Eigen/src/Eigenvalues/RealSchur_MKL.h              | 0
 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h | 0
 Eigen/src/PardisoSupport/PardisoSupport.h          | 0
 Eigen/src/QR/ColPivHouseholderQR_MKL.h             | 0
 Eigen/src/SVD/JacobiSVD.h                          | 0
 Eigen/src/SparseLU/SparseLU.h                      | 0
 9 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 Eigen/src/Eigenvalues/ComplexSchur_MKL.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/RealQZ.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/RealSchur_MKL.h
 mode change 100755 => 100644 Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
 mode change 100755 => 100644 Eigen/src/PardisoSupport/PardisoSupport.h
 mode change 100755 => 100644 Eigen/src/QR/ColPivHouseholderQR_MKL.h
 mode change 100755 => 100644 Eigen/src/SVD/JacobiSVD.h
 mode change 100755 => 100644 Eigen/src/SparseLU/SparseLU.h

diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
old mode 100755
new mode 100644
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
old mode 100755
new mode 100644
From 393bc3b16b413598b6c9dcbae722aafb5672d457 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:22:15 -0700
Subject: [PATCH 22/32] Added comment

---
 unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index efe688e50..579519b04 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -13,7 +13,7 @@

 // The array class is only available starting with cxx11. Emulate our own here
-// if needed.
+// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.

 #if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
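Note on patch 22: the preprocessor condition above is why the emulation still matters: MSVC before VS2015 keeps __cplusplus at 199711L even when it accepts the needed C++11 features, and nvcc device code cannot use std::array. Either way, the spelling at call sites stays the same (a sketch; assumes Eigen::array from the tensor module):

    // Same code whether Eigen::array is std::array or the hand-rolled emulation.
    Eigen::array<int, 3> dims;
    dims[0] = 2; dims[1] = 3; dims[2] = 4;
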
From 9bc9396e88789e86647227353e10d90d5316fa98 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:30:06 -0700
Subject: [PATCH 23/32] Use portable includes

---
 unsupported/test/cxx11_tensor_notification.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 961d4edf6..813bc4413 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -10,7 +10,7 @@
 #define EIGEN_USE_THREADS

 #include
-#include
+#include
 #include "main.h"
 #include

From 92693b50eb09cdaeecb830a06df6d6c67a369477 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:40:36 -0700
Subject: [PATCH 24/32] Fixed compilation warning

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 6bbf235cc..f71625ae5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -125,7 +125,7 @@ class TensorExecutor
       int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1;
       const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize)));
-      const Index numblocks = size / blocksize;
+      const unsigned int numblocks = static_cast(size / blocksize);

       Barrier barrier(numblocks);
       for (int i = 0; i < numblocks; ++i) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 489451215..0ce2517d6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -253,7 +253,7 @@ struct FullReducer {
       return;
     } else {
       const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads);
-      const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+      const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0;
       eigen_assert(num_coeffs >= numblocks * blocksize);

       Barrier barrier(numblocks);
@@ -300,7 +300,7 @@ struct FullReducer {
       return;
     }
     const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads);
-    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0;
     eigen_assert(num_coeffs >= numblocks * blocksize);

     Barrier barrier(numblocks);

From 41434a8a852e3f2744164bece0487d7e767717b6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:52:38 -0700
Subject: [PATCH 25/32] Avoid unnecessary conversions

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f71625ae5..3408933bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -128,7 +128,7 @@ class TensorExecutor
       const unsigned int numblocks = static_cast(size / blocksize);

       Barrier barrier(numblocks);
-      for (int i = 0; i < numblocks; ++i) {
+      for (unsigned int i = 0; i < numblocks; ++i) {
        device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize);
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 0ce2517d6..9875601ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -258,7 +258,7 @@ struct FullReducer {

       Barrier barrier(numblocks);
       MaxSizeVector shards(numblocks, reducer.initialize());
-      for (Index i = 0; i < numblocks; ++i) {
+      for (unsigned int i = 0; i < numblocks; ++i) {
         device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self,
                                     i * blocksize, blocksize, reducer, &shards[i]);
       }
@@ -271,7 +271,7 @@ struct FullReducer {
         finalShard = reducer.initialize();
       }
       barrier.Wait();
-      for (Index i = 0; i < numblocks; ++i) {
+      for (unsigned int i = 0; i < numblocks; ++i) {
         reducer.reduce(shards[i], &finalShard);
       }
       *output = reducer.finalize(finalShard);
@@ -305,7 +305,7 @@ struct FullReducer {

     Barrier barrier(numblocks);
     MaxSizeVector shards(numblocks, reducer.initialize());
-    for (Index i = 0; i < numblocks; ++i) {
+    for (unsigned int i = 0; i < numblocks; ++i) {
       device.enqueue_with_barrier(&barrier, &FullReducerShard::run,
                                   self, i * blocksize, blocksize, reducer,
                                   &shards[i]);
     }
@@ -319,7 +319,7 @@ struct FullReducer {

     barrier.Wait();
-    for (Index i = 0; i < numblocks; ++i) {
+    for (unsigned int i = 0; i < numblocks; ++i) {
       reducer.reduce(shards[i], &finalShard);
     }
     *output = reducer.finalize(finalShard);

From 2e4e4cb74de19de0997567a4d058d1522ec7e452 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 23 Mar 2016 16:57:12 -0700
Subject: [PATCH 26/32] Use numext::abs instead of abs to avoid incorrect
 conversion to integer of the argument

---
 Eigen/src/Core/SpecialFunctions.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index c12e41a7b..37ebb5915 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -576,7 +576,7 @@ struct igammac_impl {
       pkm1 = pk;
       qkm2 = qkm1;
       qkm1 = qk;
-      if (abs(pk) > big) {
+      if (numext::abs(pk) > big) {
         pkm2 *= biginv;
         pkm1 *= biginv;
         qkm2 *= biginv;
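Note on patch 26: unqualified abs is a classic trap in templated numeric code: depending on which headers and using-declarations are in scope, abs(pk) for a floating-point pk can resolve to ::abs(int) from <cstdlib>, truncating the argument before the absolute value is taken; numext::abs dispatches to the correct overload on both host and device. A reduced illustration of the trap:

    #include <cstdlib>  // brings in ::abs(int) but no floating-point overloads

    bool exceeds(double pk, double big) {
      return abs(pk) > big;  // may call ::abs(int): abs(0.9) quietly evaluates to 0
    }
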
---
 unsupported/test/cxx11_tensor_cuda.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 1964d9e07..4d8465756 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -853,6 +853,10 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_3(test_cuda_convolution_3d());
   CALL_SUBTEST_3(test_cuda_convolution_3d());

+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on were only added in c++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a c++11 compiler.
   CALL_SUBTEST_4(test_cuda_lgamma(1.0f));
   CALL_SUBTEST_4(test_cuda_lgamma(100.0f));
   CALL_SUBTEST_4(test_cuda_lgamma(0.01f));
@@ -860,6 +864,7 @@ void test_cxx11_tensor_cuda()

   CALL_SUBTEST_4(test_cuda_digamma());

+
   CALL_SUBTEST_4(test_cuda_erf(1.0f));
   CALL_SUBTEST_4(test_cuda_erf(100.0f));
   CALL_SUBTEST_4(test_cuda_erf(0.01f));
@@ -894,4 +899,5 @@ void test_cxx11_tensor_cuda()

   CALL_SUBTEST_5(test_cuda_igamma());
   CALL_SUBTEST_5(test_cuda_igammac());
+#endif
 }

From 0968e925a040d4988f02e8476b5cea8518e5f966 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 24 Mar 2016 18:00:33 -0700
Subject: [PATCH 28/32] Updated the benchmarking code to use Eigen::half
 instead of half

---
 bench/tensors/tensor_benchmarks_fp16_gpu.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 49f75472a..35c6f7489 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -12,7 +12,7 @@
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite suite(device, N);                      \
+    BenchmarkSuite suite(device, N);                      \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters);                                    \
   }                                                       \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite suite(device, D1, D2, D3);             \
+    BenchmarkSuite suite(device, D1, D2, D3);             \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters);                                    \
   }                                                       \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
     StopBenchmarkTiming();                                \
     Eigen::CudaStreamDevice stream;                       \
     Eigen::GpuDevice device(&stream);                     \
-    BenchmarkSuite suite(device, N);                      \
+    BenchmarkSuite suite(device, N);                      \
     cudaDeviceSynchronize();                              \
     suite.FUNC(iters, DIM1, DIM2);                        \
   }                                                       \
@@ -73,4 +73,4 @@ BM_FuncWithKernelDimsGPU(convolution, 7, 4);
 BM_FuncWithKernelDimsGPU(convolution, 4, 7);
 BM_FuncWithKernelDimsGPU(convolution, 7, 64);
 BM_FuncWithKernelDimsGPU(convolution, 64, 7);
-*/
\ No newline at end of file
+*/
From a86c9f037b24312863ad2a74a583369581c6e21a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 24 Mar 2016 18:54:31 -0700
Subject: [PATCH 29/32] Fixed compilation error on windows

---
 unsupported/test/cxx11_tensor_notification.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 813bc4413..c946007b8 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -9,11 +9,20 @@

 #define EIGEN_USE_THREADS

-#include
 #include
 #include "main.h"
 #include

+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include
+void sleep(int seconds) {
+  Sleep(seconds*1000);
+}
+#else
+#include
+#endif
+
+
 namespace {

 void WaitAndAdd(Eigen::Notification* n, int* counter) {

From d94f6ba9659f8c953caaff854552070ce149958b Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 25 Mar 2016 11:02:56 -0700
Subject: [PATCH 30/32] Started to model the cost of divisions more accurately.

---
 Eigen/src/Core/NumTraits.h               | 17 +++++++++++++++++
 Eigen/src/Core/functors/BinaryFunctors.h | 12 +++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 7ddb4a867..b7b5e7d22 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -60,6 +60,23 @@ template struct GenericNumTraits
     MulCost = 1
   };

+  // Division is messy but important, because it is expensive and throughput
+  // varies significantly. The following numbers are based on min division
+  // throughput on Haswell.
+  template
+  struct Div {
+    enum {
+#ifdef EIGEN_VECTORIZE_AVX
+      AVX = true,
+#else
+      AVX = false,
+#endif
+      Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)):
+             Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8
+    };
+  };
+
+
   typedef T Real;
   typedef typename internal::conditional<
     IsInteger,
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 5cdfff845..d04323bb0 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -238,7 +238,13 @@ template struct scalar_hypot_op {
 };
 template
 struct functor_traits >
-{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess=0 }; };
+{
+  enum
+  {
+    Cost = 3 * NumTraits::AddCost +
+           2 * NumTraits::MulCost +
+           2 * NumTraits::template Div::Cost,
+    PacketAccess = false
+  };
+};

 /** \internal
@@ -564,6 +570,10 @@ struct scalar_inverse_mult_op {
   { return internal::pdiv(pset1(m_other),a); }
   Scalar m_other;
 };
+template
+struct functor_traits >
+{ enum { PacketAccess = packet_traits::HasDiv, Cost = NumTraits::template Div::Cost }; };
+

 } // end namespace internal
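Note on patch 30: the per-type, scalar-versus-packet division costs are what Eigen's evaluators consult when sizing work (unrolling, or deciding whether a division-heavy expression is worth parallelizing). A sketch of how a trait can read them, mirroring the BinaryFunctors.h hunk; the template arguments were lost in this copy of the diff, so treat the exact spelling as an assumption:

    template <typename Scalar>
    struct div_cost_probe {
      enum {
        ScalarCost = Eigen::NumTraits<Scalar>::template Div<false>::Cost,  // one scalar division
        PacketCost = Eigen::NumTraits<Scalar>::template Div<true>::Cost    // one vectorized division
      };
    };
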
From 65716e99a5763f536257eb1dd047f34f8172f816 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 25 Mar 2016 11:13:53 -0700
Subject: [PATCH 31/32] Improved the cost estimate of the quotient op

---
 Eigen/src/Core/functors/BinaryFunctors.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index d04323bb0..e28fecfd0 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -303,9 +303,10 @@ template struct scalar_quotient_op {
 };
 template
 struct functor_traits > {
+  typedef typename scalar_quotient_op::result_type result_type;
   enum {
-    Cost = (NumTraits::MulCost + NumTraits::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op::Vectorizable
+    PacketAccess = scalar_quotient_op::Vectorizable,
+    Cost = NumTraits::template Div::Cost
   };
 };

From 74f91ed06c615fc7d875bd30cb72ea5e08504be2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 25 Mar 2016 17:21:56 -0700
Subject: [PATCH 32/32] Improved support for integer modulo

---
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index c71a30d21..eb0c8d1ce 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,20 @@ struct scalar_mod_op {
 };
 template
 struct functor_traits >
-{ enum { Cost = 2 * NumTraits::MulCost, PacketAccess = false }; };
+{ enum { Cost = NumTraits::template Div::Cost, PacketAccess = false }; };
+
+
+/** \internal
+ * \brief Template functor to compute the modulo between 2 arrays.
+ */
+template
+struct scalar_mod2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template
+struct functor_traits >
+{ enum { Cost = NumTraits::template Div::Cost, PacketAccess = false }; };

 /** \internal
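Note on patch 32: scalar_mod_op keeps its divisor inside the functor, while the new scalar_mod2_op is the genuinely element-wise a % b. A hedged sketch of applying the binary form to integer tensors (assumes the tensor module's binaryExpr):

    Eigen::Tensor<int, 1> a(8), b(8);
    a.setConstant(7);
    b.setConstant(3);
    Eigen::Tensor<int, 1> r = a.binaryExpr(b, Eigen::internal::scalar_mod2_op<int>());  // r[i] = a[i] % b[i]
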