Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-09-22 22:33:15 +08:00)

Merged eigen/eigen into default

This commit is contained in: commit 9ff5d0f821
@@ -45,6 +45,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
 public:
+  typedef typename XprType::StorageIndex StorageIndex;
   typedef typename XprType::PlainObject PlainObject;
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::ref_selector<XprType>::type XprTypeNested;
   typedef typename internal::remove_all<XprTypeNested>::type XprTypeNestedCleaned;
   typedef typename internal::ref_selector<Inverse>::type Nested;
@@ -531,22 +531,24 @@ class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>
   enum { IsRowMajor = unary_evaluator::IsRowMajor };
   const unary_evaluator& m_eval;
   Index m_outerPos;
-  Index m_innerIndex;
-  Scalar m_value;
+  const Index m_innerIndex;
   Index m_end;
+  EvalIterator m_it;
 public:

   EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
     : m_eval(aEval),
-      m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) - 1), // -1 so that operator++ finds the first non-zero entry
+      m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
       m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
-      m_value(0),
-      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
+      m_it(m_eval.m_argImpl, m_outerPos)
   {
     EIGEN_UNUSED_VARIABLE(outer);
     eigen_assert(outer==0);

-    ++(*this);
+    while(m_it && m_it.index() < m_innerIndex) ++m_it;
+    if((!m_it) || (m_it.index()!=m_innerIndex))
+      ++(*this);
   }

   inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
@@ -554,21 +556,20 @@ public:
   inline Index row() const { return IsRowMajor ? 0 : index(); }
   inline Index col() const { return IsRowMajor ? index() : 0; }

-  inline Scalar value() const { return m_value; }
+  inline Scalar value() const { return m_it.value(); }
+  inline Scalar& valueRef() { return m_it.valueRef(); }

   inline OuterVectorInnerIterator& operator++()
   {
     // search next non-zero entry
     while(++m_outerPos<m_end)
     {
-      EvalIterator it(m_eval.m_argImpl, m_outerPos);
+      // Restart iterator at the next inner-vector:
+      m_it.~EvalIterator();
+      ::new (&m_it) EvalIterator(m_eval.m_argImpl, m_outerPos);
       // search for the key m_innerIndex in the current outer-vector
-      while(it && it.index() < m_innerIndex) ++it;
-      if(it && it.index()==m_innerIndex)
-      {
-        m_value = it.value();
-        break;
-      }
+      while(m_it && m_it.index() < m_innerIndex) ++m_it;
+      if(m_it && m_it.index()==m_innerIndex) break;
     }
     return *this;
   }
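The operator++ above restarts m_it at each candidate inner vector with an explicit destructor call followed by placement new, because the evaluator iterators hold references and are therefore not assignable. A minimal sketch of that idiom (not part of the commit; Iter and Arg are hypothetical stand-ins for EvalIterator and the evaluator argument):

#include <new>  // placement new

// Sketch: reset a non-assignable member object in place.
template <typename Iter, typename Arg>
void restart(Iter& it, Arg& arg, long outer)
{
  it.~Iter();                    // explicitly destroy the old iterator
  ::new (&it) Iter(arg, outer);  // reconstruct it over the same storage
}

Keeping the iterator as a member (instead of a stack-local per outer position, as before) is what lets value() and the new valueRef() read and write the current coefficient directly.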
@@ -621,6 +621,22 @@ protected:
  * Implementation of SparseMatrixBase and SparseCwise functions/operators
  ***************************************************************************/

+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
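With these two definitions, an expression that is only an EigenBase — such as a sparse self-adjoint view — can be folded into a sparse matrix with += and -=. A hedged usage sketch (not part of the commit; both matrices assumed square and of equal size, as the self-adjoint view requires):

#include <Eigen/SparseCore>

void accumulate(Eigen::SparseMatrix<double>& acc, const Eigen::SparseMatrix<double>& m)
{
  acc += m.selfadjointView<Eigen::Lower>();  // routed through call_assignment with add_assign_op
  acc -= m.selfadjointView<Eigen::Lower>();
}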
@@ -123,8 +123,10 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator*=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
       i.valueRef() *= other;
   return derived();
 }
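The same evaluator-based traversal generalizes to any in-place update of the stored coefficients. A short sketch mirroring the loop above (not part of the commit):

#include <Eigen/SparseCore>

// Scale every stored coefficient of a sparse expression through its
// evaluator, exactly as operator*= now does.
template <typename Derived>
void scale_in_place(Eigen::SparseMatrixBase<Derived>& m, const typename Derived::Scalar& s)
{
  typedef typename Eigen::internal::evaluator<Derived>::InnerIterator EvalIterator;
  Eigen::internal::evaluator<Derived> thisEval(m.derived());
  for (Eigen::Index j = 0; j < m.outerSize(); ++j)
    for (EvalIterator i(thisEval, j); i; ++i)
      i.valueRef() *= s;  // mutate the stored value directly
}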
@@ -133,8 +135,10 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
 SparseMatrixBase<Derived>::operator/=(const Scalar& other)
 {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+    for (EvalIterator i(thisEval,j); i; ++i)
       i.valueRef() /= other;
   return derived();
 }
@@ -265,6 +265,11 @@ template<typename Derived> class SparseMatrixBase
     template<typename OtherDerived>
     Derived& operator-=(const DiagonalBase<OtherDerived>& other);

+    template<typename OtherDerived>
+    Derived& operator+=(const EigenBase<OtherDerived> &other);
+    template<typename OtherDerived>
+    Derived& operator-=(const EigenBase<OtherDerived> &other);
+
     Derived& operator*=(const Scalar& other);
     Derived& operator/=(const Scalar& other);
@@ -222,14 +222,43 @@ template< typename DstXprType, typename SrcXprType, typename Functor>
 struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
 {
   typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> AssignOpType;
+
   template<typename DestScalar,int StorageOrder>
-  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignOpType&/*func*/)
   {
     internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
   }
+
+  // FIXME: the handling of += and -= in sparse matrices should be cleanup so that next two overloads could be reduced to:
+  template<typename DestScalar,int StorageOrder,typename AssignFunc>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    call_assignment_no_alias_no_transpose(dst, tmp, func);
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst += tmp;
+  }
+
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+                  const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+  {
+    SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+    run(tmp, src, AssignOpType());
+    dst -= tmp;
+  }
+
   template<typename DestScalar>
-  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const AssignOpType&/*func*/)
   {
     // TODO directly evaluate into dst;
     SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
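The add/sub overloads above share one shape: materialize the self-adjoint source into a temporary sparse matrix with the plain assign functor, then fold the temporary into the destination. A generic sketch of that evaluate-into-temporary pattern (not part of the commit; assign_full is a hypothetical stand-in for the run(tmp, src, AssignOpType()) call):

template <typename SparseDst, typename Src>
void add_via_temporary(SparseDst& dst, const Src& src)
{
  SparseDst tmp(src.rows(), src.cols());
  assign_full(tmp, src);  // hypothetical: full evaluation into the temporary
  dst += tmp;             // then apply the incremental update
}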
@@ -1,6 +1,6 @@
 namespace Eigen {

-/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable Eigen objects
+/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable %Eigen objects

 The goal of this page is to explain what we mean by "fixed-size vectorizable".
@@ -23,15 +23,15 @@ Examples include:

 \section FixedSizeVectorizable_explanation Explanation

-First, "fixed-size" should be clear: an Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example Matrix3f has fixed size, but MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
+First, "fixed-size" should be clear: an %Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example \ref Matrix3f has fixed size, but \ref MatrixXf doesn't (the opposite of fixed-size is dynamic-size).

-The array of coefficients of a fixed-size Eigen object is a plain "static array", it is not dynamically allocated. For example, the data behind a Matrix4f is just a "float array[16]".
+The array of coefficients of a fixed-size %Eigen object is a plain "static array", it is not dynamically allocated. For example, the data behind a \ref Matrix4f is just a "float array[16]".

 Fixed-size objects are typically very small, which means that we want to handle them with zero runtime overhead -- both in terms of memory usage and of speed.

-Now, vectorization (both SSE and AltiVec) works with 128-bit packets. Moreover, for performance reasons, these packets need to be have 128-bit alignment.
+Now, vectorization works with 128-bit packets (e.g., SSE, AltiVec, NEON), 256-bit packets (e.g., AVX), or 512-bit packets (e.g., AVX512). Moreover, for performance reasons, these packets are most efficiently read and written if they have the same alignment as the packet size, that is 16 bytes, 32 bytes, and 64 bytes respectively.

-So it turns out that the only way that fixed-size Eigen objects can be vectorized, is if their size is a multiple of 128 bits, or 16 bytes. Eigen will then request 16-byte alignment for these objects, and henceforth rely on these objects being aligned so no runtime check for alignment is performed.
+So it turns out that the best way that fixed-size %Eigen objects can be vectorized, is if their size is a multiple of 16 bytes (or more). %Eigen will then request 16-byte alignment (or more) for these objects, and henceforth rely on these objects being aligned to achieve maximal efficiency.

 */
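The alignment request described above is visible directly on the types. A hedged check (not part of the commit), assuming a build whose widest enabled packet is 128 bits:

#include <Eigen/Core>
#include <iostream>

// Matrix4f (16 floats, 64 bytes) is fixed-size vectorizable;
// Vector3f (12 bytes, not a multiple of 16) is not.
int main()
{
  std::cout << "Matrix4f: size " << sizeof(Eigen::Matrix4f)
            << ", alignment " << alignof(Eigen::Matrix4f) << "\n";  // expect 64, 16
  std::cout << "Vector3f: size " << sizeof(Eigen::Vector3f)
            << ", alignment " << alignof(Eigen::Vector3f) << "\n";  // expect 12, 4
}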
@@ -4,21 +4,21 @@ namespace Eigen {

 Passing objects by value is almost always a very bad idea in C++, as this means useless copies, and one should pass them by reference instead.

-With Eigen, this is even more important: passing \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" by value is not only inefficient, it can be illegal or make your program crash! And the reason is that these Eigen objects have alignment modifiers that aren't respected when they are passed by value.
+With %Eigen, this is even more important: passing \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" by value is not only inefficient, it can be illegal or make your program crash! And the reason is that these %Eigen objects have alignment modifiers that aren't respected when they are passed by value.

-So for example, a function like this, where v is passed by value:
+For example, a function like this, where \c v is passed by value:

 \code
 void my_function(Eigen::Vector2d v);
 \endcode

-needs to be rewritten as follows, passing v by reference:
+needs to be rewritten as follows, passing \c v by const reference:

 \code
 void my_function(const Eigen::Vector2d& v);
 \endcode

-Likewise if you have a class having a Eigen object as member:
+Likewise if you have a class having an %Eigen object as member:

 \code
 struct Foo
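The struct Foo example is cut off by the hunk's context window. For completeness, a hedged sketch of the situation it describes (not part of the commit): a class holding a fixed-size vectorizable member is also affected when heap-allocated, and the usual companion fix, documented on the related StructHavingEigenMembers page, is Eigen's aligned operator new macro.

#include <Eigen/Core>

struct Foo
{
  Eigen::Vector2d v;                // fixed-size vectorizable member (16 bytes)
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW   // keeps `new Foo` suitably aligned
};

void my_function(const Foo& foo);   // and pass Foo by const reference as well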
@@ -115,6 +115,15 @@ If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable
 It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization.
 It doesn't disable 16-byte alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.

+\section checkmycode How can I check my code is safe regarding alignment issues?
+
+Unfortunately, there is no possibility in C++ to detect any of the aforementioned shortcomings at compile time (though static analyzers are becoming more and more powerful and could detect some of them).
+Even at runtime, all we can do is to catch invalid unaligned allocations and trigger the explicit assertion mentioned at the beginning of this page.
+Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. For instance, on most 64-bit systems buffers are aligned on 16-byte boundaries, so if you do not enable the AVX instruction set, your code will run fine. On the other hand, the same code may assert when moving to a more exotic platform, or when enabling AVX instructions, which require 32-byte alignment by default.
+
+The situation is not hopeless though. Assuming your code is well covered by unit tests, then you can check its alignment safety by linking it to a custom malloc library returning 8-byte-aligned buffers only. This way all alignment shortcomings should pop up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink.
+
+
 */

 }
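The custom-malloc trick suggested above can be prototyped with an LD_PRELOAD shim. A minimal, glibc-specific sketch (not part of the commit; toy quality: calloc/realloc/posix_memalign are not intercepted and dlsym re-entrancy is ignored):

// shim.cpp -- sketch, Linux/glibc only.
// Build: g++ -shared -fPIC shim.cpp -o shim.so -ldl
// Run:   LD_PRELOAD=./shim.so ./your_unit_tests   (built with EIGEN_MALLOC_ALREADY_ALIGNED=0)
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <dlfcn.h>
#include <cstdint>
#include <cstring>
#include <cstddef>

extern "C" void* malloc(std::size_t size)
{
  static void* (*real_malloc)(std::size_t) =
      reinterpret_cast<void* (*)(std::size_t)>(dlsym(RTLD_NEXT, "malloc"));
  char* base = static_cast<char*>(real_malloc(size + 16));
  if (!base) return 0;
  char* p = base + 8;                                           // shift off the 16-byte grid...
  if ((reinterpret_cast<std::uintptr_t>(p) & 15) == 0) p += 8;  // ...so p is 8- but never 16-byte aligned
  std::memcpy(p - sizeof(void*), &base, sizeof(void*));         // stash base so free() can undo this
  return p;
}

extern "C" void free(void* p)
{
  static void (*real_free)(void*) =
      reinterpret_cast<void (*)(void*)>(dlsym(RTLD_NEXT, "free"));
  if (!p) return;
  void* base;
  std::memcpy(&base, static_cast<char*>(p) - sizeof(void*), sizeof(void*));
  real_free(base);
}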
@@ -109,6 +109,13 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
       rm.col(i).swap(rm.col(j));
     VERIFY_IS_APPROX(rm, rp2.toDenseMatrix().template cast<Scalar>());
   }
+
+  {
+    // simple compilation check
+    Matrix<Scalar, Cols, Cols> A = rp;
+    Matrix<Scalar, Cols, Cols> B = rp.transpose();
+    VERIFY_IS_APPROX(A, B.transpose());
+  }
 }

 template<typename T>
@@ -431,6 +431,14 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& ref)
     m3 = m2.template selfadjointView<Lower>();
     VERIFY_IS_APPROX(m3, refMat3);

+    refMat3 += refMat2.template selfadjointView<Lower>();
+    m3 += m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
+    refMat3 -= refMat2.template selfadjointView<Lower>();
+    m3 -= m2.template selfadjointView<Lower>();
+    VERIFY_IS_APPROX(m3, refMat3);
+
     // selfadjointView only works for square matrices:
     SparseMatrixType m4(rows, rows+1);
     VERIFY_RAISES_ASSERT(m4.template selfadjointView<Lower>());
@@ -9,6 +9,20 @@

 #include "sparse.h"

+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==RowMajorBit, typename T::RowXpr>::type
+innervec(T& A, Index i)
+{
+  return A.row(i);
+}
+
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==0, typename T::ColXpr>::type
+innervec(T& A, Index i)
+{
+  return A.col(i);
+}
+
 template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
 {
   const Index rows = ref.rows();
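The two enable_if overloads dispatch on the storage order: innervec(A,i) is A.row(i) for row-major types and A.col(i) otherwise, so with the storage-order-matched DenseMatrix typedef introduced in the next hunk it always lines up with SparseMatrix::innerVector(i). A hedged usage sketch (not part of the commit):

template <typename DenseType, typename SparseType>
void scale_inner(DenseType& d, SparseType& s, Eigen::Index i)
{
  innervec(d, i) *= typename DenseType::Scalar(2);  // row i if row-major, column i otherwise
  s.innerVector(i) *= typename DenseType::Scalar(2); // the sparse counterpart, same orientation
}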
@@ -20,9 +34,10 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
   typedef typename SparseMatrixType::StorageIndex StorageIndex;

   double density = (std::max)(8./(rows*cols), 0.01);
-  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,Dynamic,SparseMatrixType::IsRowMajor?RowMajor:ColMajor> DenseMatrix;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
   typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
+  typedef SparseVector<Scalar> SparseVectorType;

   Scalar s1 = internal::random<Scalar>();
   {
@@ -110,15 +125,35 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
     initSparse<Scalar>(density, refMat2, m2);
     Index j0 = internal::random<Index>(0,outer-1);
     Index j1 = internal::random<Index>(0,outer-1);
-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0));
+    Index r0 = internal::random<Index>(0,rows-1);
+    Index c0 = internal::random<Index>(0,cols-1);

-    if(SparseMatrixType::IsRowMajor)
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1));
-    else
-      VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1));
+    VERIFY_IS_APPROX(m2.innerVector(j0), innervec(refMat2,j0));
+    VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), innervec(refMat2,j0)+innervec(refMat2,j1));
+
+    m2.innerVector(j0) *= Scalar(2);
+    innervec(refMat2,j0) *= Scalar(2);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) *= Scalar(3);
+    refMat2.row(r0) *= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) *= Scalar(4);
+    refMat2.col(c0) *= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.row(r0) /= Scalar(3);
+    refMat2.row(r0) /= Scalar(3);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    m2.col(c0) /= Scalar(4);
+    refMat2.col(c0) /= Scalar(4);
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    SparseVectorType v1;
+    VERIFY_IS_APPROX(v1 = m2.col(c0) * 4, refMat2.col(c0)*4);
+    VERIFY_IS_APPROX(v1 = m2.row(r0) * 4, refMat2.row(r0).transpose()*4);

     SparseMatrixType m3(rows,cols);
     m3.reserve(VectorXi::Constant(outer,int(inner/2)));
@@ -53,8 +53,10 @@ typedef __int32 int32_t;
 typedef unsigned __int32 uint32_t;
 typedef __int64 int64_t;
 typedef unsigned __int64 uint64_t;
+#include <windows.h>
 #else
 #include <stdint.h>
+#include <unistd.h>
 #endif

 #if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
@@ -58,6 +58,18 @@
 #include "src/ThreadPool/SimpleThreadPool.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"

+
+// Use the more efficient NonBlockingThreadPool by default.
+namespace Eigen {
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
+}  // namespace Eigen
+
 #endif

 #include <Eigen/src/Core/util/ReenableStupidWarnings.h>
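With the alias exported here, user code can hold an Eigen::ThreadPool without caring which implementation backs it. A hedged usage sketch (not part of the commit; assumes the unsupported/ directory is on the include path, as in the tests):

#define EIGEN_USE_THREADS
#include "Eigen/CXX11/ThreadPool"
#include <atomic>

int main()
{
  Eigen::ThreadPool pool(4);                 // NonBlockingThreadPool by default
  std::atomic<int> done(0);
  for (int i = 0; i < 100; ++i)
    pool.Schedule([&done]() { done.fetch_add(1); });
  // ~ThreadPool() joins its worker threads before returning.
}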
@@ -418,7 +418,7 @@ class TensorContractionSubMapper {
     return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
   }

-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const {
     if (UseDirectOffsets) {
       m_base_mapper.storePacket(i, 0, p);
     }
@@ -88,11 +88,7 @@ static void initializeDeviceProp() {
 #if __cplusplus >= 201103L
       std::atomic_thread_fence(std::memory_order_acquire);
 #endif
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-      Sleep(1000);
-#else
-      sleep(1);
-#endif
+      EIGEN_SLEEP(1000);
     }
   }
 }
@@ -12,17 +12,6 @@

 namespace Eigen {

-// Use the SimpleThreadPool by default. We'll switch to the new non blocking
-// thread pool later.
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-
-
 // Barrier is an object that allows one or more threads to wait until
 // Notify has been called a specified number of times.
 class Barrier {
@@ -51,4 +51,12 @@
 #endif

+
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#define EIGEN_SLEEP(n) Sleep(n)
+#elif EIGEN_OS_GNULINUX
+#define EIGEN_SLEEP(n) usleep(n * 1000);
+#else
+#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
+#endif
+
 #endif
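EIGEN_SLEEP takes milliseconds on every branch: Sleep(n) is n ms on Windows, usleep(n * 1000) is n ms on GNU/Linux, and the generic fallback rounds up to at least one second. One caveat worth noting: the EIGEN_OS_GNULINUX expansion carries its own trailing semicolon, so the macro is only safe in brace-delimited statement position, as in the call sites elsewhere in this commit. A trivial usage sketch (not part of the commit):

void pause_one_second()
{
  EIGEN_SLEEP(1000);  // ~1s on Windows, GNU/Linux, and the generic fallback alike
}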
@@ -28,6 +28,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
         blocked_(0),
         spinning_(0),
         done_(false),
+        cancelled_(false),
         ec_(waiters_) {
     waiters_.resize(num_threads);
@@ -61,10 +62,19 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {

   ~NonBlockingThreadPoolTempl() {
     done_ = true;
+
     // Now if all threads block without work, they will start exiting.
     // But note that threads can continue to work arbitrary long,
     // block, submit new work, unblock and otherwise live full life.
-    ec_.Notify(true);
+    if (!cancelled_) {
+      ec_.Notify(true);
+    } else {
+      // Since we were cancelled, there might be entries in the queues.
+      // Empty them to prevent their destructor from asserting.
+      for (size_t i = 0; i < queues_.size(); i++) {
+        queues_[i]->Flush();
+      }
+    }

     // Join threads explicitly to avoid destruction order issues.
     for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
|
||||
// completes overall computations, which in turn leads to destruction of
|
||||
// this. We expect that such scenario is prevented by program, that is,
|
||||
// this is kept alive while any threads can potentially be in Schedule.
|
||||
if (!t.f)
|
||||
if (!t.f) {
|
||||
ec_.Notify(false);
|
||||
else
|
||||
}
|
||||
else {
|
||||
env_.ExecuteTask(t); // Push failed, execute directly.
|
||||
}
|
||||
}
|
||||
|
||||
void Cancel() {
|
||||
cancelled_ = true;
|
||||
done_ = true;
|
||||
|
||||
// Let each thread know it's been cancelled.
|
||||
for (size_t i = 0; i < threads_.size(); i++) {
|
||||
threads_[i]->Cancel();
|
||||
threads_[i]->OnCancel();
|
||||
}
|
||||
|
||||
// Wake up the threads without work to let them exit on their own.
|
||||
ec_.Notify(true);
|
||||
}
|
||||
|
||||
int NumThreads() const final {
|
||||
@@ -135,6 +154,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   std::atomic<unsigned> blocked_;
   std::atomic<bool> spinning_;
   std::atomic<bool> done_;
+  std::atomic<bool> cancelled_;
   EventCount ec_;

   // Main worker thread loop.
@@ -145,7 +165,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     pt->thread_id = thread_id;
     Queue* q = queues_[thread_id];
     EventCount::Waiter* waiter = &waiters_[thread_id];
-    for (;;) {
+    while (!cancelled_) {
       Task t = q->PopFront();
       if (!t.f) {
         t = Steal();
|
||||
// pool. Consider a time based limit instead.
|
||||
if (!spinning_ && !spinning_.exchange(true)) {
|
||||
for (int i = 0; i < 1000 && !t.f; i++) {
|
||||
t = Steal();
|
||||
if (!cancelled_.load(std::memory_order_relaxed)) {
|
||||
t = Steal();
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
spinning_ = false;
|
||||
}
|
||||
@@ -207,8 +231,12 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     int victim = NonEmptyQueueIndex();
     if (victim != -1) {
       ec_.CancelWait(waiter);
-      *t = queues_[victim]->PopBack();
-      return true;
+      if (cancelled_) {
+        return false;
+      } else {
+        *t = queues_[victim]->PopBack();
+        return true;
+      }
     }
     // Number of blocked threads is used as termination condition.
     // If we are shutting down and all worker threads blocked without work,
@@ -71,7 +71,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {

   void Cancel() {
     for (size_t i = 0; i < threads_.size(); i++) {
-      threads_[i]->Cancel();
+      threads_[i]->OnCancel();
     }
   }

@@ -23,7 +23,8 @@ struct StlThreadEnvironment {
    public:
     EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
     ~EnvThread() { thr_.join(); }
-    void Cancel() { EIGEN_THREAD_CANCEL(thr_); }
+    // This function is called when the threadpool is cancelled.
+    void OnCancel() { }

    private:
     std::thread thr_;
@@ -19,8 +19,10 @@ class ThreadPoolInterface {
   // Submits a closure to be run by a thread in the pool.
   virtual void Schedule(std::function<void()> fn) = 0;

-  // Cancel all the threads in the pool.
-  virtual void Cancel() = 0;
+  // If implemented, stop processing the closures that have been enqueued.
+  // Currently running closures may still be processed.
+  // If not implemented, does nothing.
+  virtual void Cancel() {}

   // Returns the number of threads in the pool.
   virtual int NumThreads() const = 0;
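Making Cancel() a default-empty virtual instead of pure means existing ThreadPoolInterface implementations keep compiling without an override. A hedged sketch (not part of the commit; CurrentThreadId is assumed to be the interface's other pure virtual in this version of the header):

#include <functional>

// A minimal pool that runs closures inline and inherits the new
// do-nothing Cancel() unchanged.
struct InlinePool : Eigen::ThreadPoolInterface {
  void Schedule(std::function<void()> fn) { fn(); }  // execute immediately
  int NumThreads() const { return 1; }
  int CurrentThreadId() const { return 0; }          // assumption: pure virtual in the interface
  // No Cancel() override needed anymore.
};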
@@ -10,8 +10,8 @@

 #define EIGEN_USE_THREADS
 #include "main.h"
-#include <unistd.h>
 #include "Eigen/CXX11/ThreadPool"
+#include "Eigen/CXX11/Tensor"

 static void test_create_destroy_empty_pool()
 {
@@ -104,23 +104,15 @@ static void test_parallelism()

 static void test_cancel()
 {
-  NonBlockingThreadPool tp(4);
+  NonBlockingThreadPool tp(2);

 #ifdef EIGEN_SUPPORTS_THREAD_CANCELLATION
   std::cout << "Thread cancellation is supported on this platform" << std::endl;
-
-  // Put 2 threads to sleep for much longer than the default test timeout.
-  tp.Schedule([]() { sleep(3600); } );
-  tp.Schedule([]() { sleep(3600 * 24); } );
+  // Schedule a large number of closure that each sleeps for one second. This
+  // will keep the thread pool busy for much longer than the default test timeout.
+  for (int i = 0; i < 1000; ++i) {
+    tp.Schedule([]() { EIGEN_SLEEP(2000); });
+  }
 #else
   std::cout << "Thread cancellation is a no-op on this platform" << std::endl;
-
-  // Make 2 threads sleep for a short period of time
-  tp.Schedule([]() { sleep(1); } );
-  tp.Schedule([]() { sleep(2); } );
 #endif

-  // Call cancel:
+  // Cancel the processing of all the closures that are still pending.
   tp.Cancel();
 }
@@ -13,15 +13,6 @@
 #include "main.h"
 #include <Eigen/CXX11/Tensor>

-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#include <windows.h>
-void sleep(int seconds) {
-  Sleep(seconds*1000);
-}
-#else
-#include <unistd.h>
-#endif
-

 namespace {

@@ -40,7 +31,7 @@ static void test_notification_single()
   Eigen::Notification n;
   std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
   thread_pool.Schedule(func);
-  sleep(1);
+  EIGEN_SLEEP(1000);

   // The thread should be waiting for the notification.
   VERIFY_IS_EQUAL(counter, 0);
@@ -48,7 +39,7 @@
   // Unblock the thread
   n.Notify();

-  sleep(1);
+  EIGEN_SLEEP(1000);

   // Verify the counter has been incremented
   VERIFY_IS_EQUAL(counter, 1);
|
||||
thread_pool.Schedule(func);
|
||||
thread_pool.Schedule(func);
|
||||
thread_pool.Schedule(func);
|
||||
sleep(1);
|
||||
EIGEN_SLEEP(1000);
|
||||
VERIFY_IS_EQUAL(counter, 0);
|
||||
n.Notify();
|
||||
sleep(1);
|
||||
EIGEN_SLEEP(1000);
|
||||
VERIFY_IS_EQUAL(counter, 4);
|
||||
}
|
||||
|
||||
|