From 1c8347e554a7bbdcf6dbf364367659d54844b30e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 14 Sep 2016 18:28:49 +0200
Subject: [PATCH 01/59] Fix product for custom complex type. (conjugation was
 ignored)

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c66882012..873f0a20c 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -434,15 +434,16 @@ public:
   template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
     // let gcc allocate the register in which to store the result of the pmul
     // (in the case where there is no FMA) gcc fails to figure out how to avoid
     // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
   }
 
@@ -457,9 +458,6 @@ public:
     r = pmadd(c,alpha,r);
   }
 
-protected:
-//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 
 template<typename RealScalar, bool _ConjLhs>

From 779faaaeba8b4d6fa9b2cc62906cccb0be3edf03 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 14 Sep 2016 09:56:11 -0700
Subject: [PATCH 02/59] Fixed compilation warnings generated by nvcc 6.5 (and
 below) when compiling the EIGEN_THROW macro

---
 Eigen/src/Core/util/Macros.h | 4 ++--
 Eigen/src/Core/util/Memory.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index a9db2f4c7..c1049f7b2 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -954,8 +954,8 @@ namespace Eigen {
 #  define EIGEN_CATCH(X) catch (X)
 #else
 #  ifdef __CUDA_ARCH__
-#    define EIGEN_THROW_X(X) asm("trap;") return {}
-#    define EIGEN_THROW asm("trap;"); return {}
+#    define EIGEN_THROW_X(X) asm("trap;")
+#    define EIGEN_THROW asm("trap;")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 8601c8321..0439655ca 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -275,6 +275,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
     destruct_elements_of_array(ptr, i);
     EIGEN_THROW;
   }
+  return NULL;
 }
 
 /*****************************************************************************
@@ -305,6 +306,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
     aligned_free(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
@@ -320,6 +322,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 /** \internal Deletes objects constructed with aligned_new

From 488ad7dd1b2806ad57f435ebb2461b6ab57443b9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 14 Sep 2016 13:35:00 -0700
Subject: [PATCH 03/59] Added missing EIGEN_DEVICE_FUNC qualifiers

---
 unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index d66e45d50..83c449cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -51,12 +51,15 @@ class TensorOpCost {
         internal::scalar_cast_op<SrcType, TargetType> >::Cost;
   }
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
       : bytes_loaded_(bytes_loaded),
         bytes_stored_(bytes_stored),
         compute_cycles_(compute_cycles) {}
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
                bool vectorized, double packet_size)
       : bytes_loaded_(bytes_loaded),

From c0d56a543e170d221e58b177b245c0e57ce1993a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 14 Sep 2016 14:06:21 -0700
Subject: [PATCH 04/59] Added several missing EIGEN_DEVICE_FUNC qualifiers

---
 Eigen/src/Core/Inverse.h       | 2 +-
 Eigen/src/Core/MathFunctions.h | 3 ++-
 Eigen/src/Core/MatrixBase.h    | 2 +-
 Eigen/src/LU/FullPivLU.h       | 8 ++++----
 Eigen/src/LU/InverseImpl.h     | 2 +-
 Eigen/src/LU/PartialPivLU.h    | 4 ++--
 6 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index f3ec84990..f303aebf9 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -50,7 +50,7 @@ public:
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
   
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}
 
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index bf3044b96..fa322aca7 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1312,11 +1312,12 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 334a4d71e..976b130e3 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -98,7 +98,7 @@ template<typename Derived> class MatrixBase
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
     EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
 
     typedef typename Base::PlainObject PlainObject;
 
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 2b30fc146..ebcd5c208 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -156,7 +156,7 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa permutationQ()
       */
-    inline const PermutationPType& permutationP() const
+    EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       return m_p;
@@ -406,8 +406,8 @@ template<typename _MatrixType> class FullPivLU
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
@@ -879,7 +879,7 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_
   *
   * \sa class FullPivLU
   */
-template<typename Derived>
+template<typename Derived> EIGEN_DEVICE_FUNC
 inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::fullPivLu() const
 {
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index 3134632e1..147f9496c 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -327,7 +327,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
   *
   * \sa computeInverseAndDetWithCheck()
   */
-template<typename Derived>
+template<typename Derived> EIGEN_DEVICE_FUNC
 inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
 {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index d43961887..13394fffa 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -584,7 +584,7 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi
   *
   * \sa class PartialPivLU
   */
-template<typename Derived>
+template<typename Derived> EIGEN_DEVICE_FUNC
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::partialPivLu() const
 {
@@ -599,7 +599,7 @@ MatrixBase<Derived>::partialPivLu() const
   *
   * \sa class PartialPivLU
   */
-template<typename Derived>
+template<typename Derived> EIGEN_DEVICE_FUNC
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::lu() const
 {

From fa9049a544f06d6f34782e0ba281cce44325c174 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 15 Sep 2016 11:24:03 +0200
Subject: [PATCH 05/59] Let's be consistent and consider any denormal number as
 zero.

---
 Eigen/src/SVD/JacobiSVD.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 78dfd1d59..ea2bd62eb 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -665,10 +665,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  // FIXME What about considerering any denormal numbers as zero, using:
-  // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().maxCoeff();

From 50e203c71723fe534a428f7d9b813365cad97d53 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 16 Sep 2016 10:40:50 +0200
Subject: [PATCH 06/59] bug #828: clarify documentation of SparseMatrixBase's
 unary methods.

---
 Eigen/src/Core/ArrayBase.h              |   2 +
 Eigen/src/Core/MatrixBase.h             |   2 +
 Eigen/src/SparseCore/SparseMatrixBase.h |   7 +-
 Eigen/src/plugins/CommonCwiseUnaryOps.h | 126 ++++++++++++++----------
 Eigen/src/plugins/MatrixCwiseUnaryOps.h |  80 ++++++++-------
 doc/Doxyfile.in                         |   3 +-
 6 files changed, 130 insertions(+), 90 deletions(-)

diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index 3a66f0e40..f0232f65e 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -87,6 +87,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
@@ -97,6 +98,7 @@ template<typename Derived> class ArrayBase
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 976b130e3..19a7483ba 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -121,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -129,6 +130,7 @@ template<typename Derived> class MatrixBase
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 96b1b0504..2e639aa6f 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -141,6 +141,11 @@ template<typename Derived> class SparseMatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)  /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#endif
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -149,8 +154,8 @@ template<typename Derived> class SparseMatrixBase
 #   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #     include EIGEN_SPARSEMATRIXBASE_PLUGIN
 #   endif
-#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }
diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 5719c6b10..89f4faaac 100644
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -36,8 +36,10 @@ typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> Negati
 
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns an expression of the opposite of \c *this
-  */
+/// \returns an expression of the opposite of \c *this
+///
+EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
+///
 EIGEN_DEVICE_FUNC
 inline const NegativeReturnType
 operator-() const { return NegativeReturnType(derived()); }
@@ -45,13 +47,15 @@ operator-() const { return NegativeReturnType(derived()); }
 
 template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
 
-/** \returns an expression of *this with the \a Scalar type casted to
-  * \a NewScalar.
-  *
-  * The template parameter \a NewScalar is the type we are casting the scalars to.
-  *
-  * \sa class CwiseUnaryOp
-  */
+/// \returns an expression of \c *this with the \a Scalar type cast to
+/// \a NewScalar.
+///
+/// The template parameter \a NewScalar is the type we are casting the scalars to.
+///
+EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
+///
+/// \sa class CwiseUnaryOp
+///
 template<typename NewType>
 EIGEN_DEVICE_FUNC
 typename CastXpr<NewType>::Type
@@ -60,9 +64,11 @@ cast() const
   return typename CastXpr<NewType>::Type(derived());
 }
 
-/** \returns an expression of the complex conjugate of \c *this.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint() */
+/// \returns an expression of the complex conjugate of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
 EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
@@ -70,39 +76,45 @@ conjugate() const
   return ConjugateReturnType(derived());
 }
 
-/** \returns a read-only expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a read-only expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline RealReturnType
 real() const { return RealReturnType(derived()); }
 
-/** \returns an read-only expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns a read-only expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline const ImagReturnType
 imag() const { return ImagReturnType(derived()); }
 
-/** \brief Apply a unary operator coefficient-wise
-  * \param[in]  func  Functor implementing the unary operator
-  * \tparam  CustomUnaryOp Type of \a func  
-  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp_ptrfun.cpp
-  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
-  *
-  * Genuine functors allow for more possibilities, for instance it may contain a state.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \brief Apply a unary operator coefficient-wise
+/// \param[in]  func  Functor implementing the unary operator
+/// \tparam  CustomUnaryOp Type of \a func
+/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+///
+/// Example:
+/// \include class_CwiseUnaryOp_ptrfun.cpp
+/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+///
+/// Genuine functors allow for more possibilities, for instance it may contain a state.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
+///
+/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomUnaryOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
@@ -111,17 +123,19 @@ unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
   return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
 }
 
-/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The template parameter \a CustomUnaryOp is the type of the functor
-  * of the custom unary operator.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
+///
+/// \sa unaryExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomViewOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryView<CustomViewOp, const Derived>
@@ -130,16 +144,20 @@ unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
   return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
 }
 
-/** \returns a non const expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a non const expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline NonConstRealReturnType
 real() { return NonConstRealReturnType(derived()); }
 
-/** \returns a non const expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns a non const expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline NonConstImagReturnType
 imag() { return NonConstImagReturnType(derived()); }
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index e16bb374b..b1be3d566 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -11,63 +11,75 @@
 // This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
 // This include MatrixBase and SparseMatrixBase.
 
+
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
 
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs.out
-  *
-  * \sa cwiseAbs2()
-  */
+/// \returns an expression of the coefficient-wise absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
+///
+/// \sa cwiseAbs2()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbsReturnType
 cwiseAbs() const { return CwiseAbsReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs2.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs2.out
-  *
-  * \sa cwiseAbs()
-  */
+/// \returns an expression of the coefficient-wise squared absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs2.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs2.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
+///
+/// \sa cwiseAbs()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
 cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include MatrixBase_cwiseSqrt.cpp
-  * Output: \verbinclude MatrixBase_cwiseSqrt.out
-  *
-  * \sa cwisePow(), cwiseSquare()
-  */
+/// \returns an expression of the coefficient-wise square root of *this.
+///
+/// Example: \include MatrixBase_cwiseSqrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseSqrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
+///
+/// \sa cwisePow(), cwiseSquare()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSqrtReturnType
 cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise signum of *this.
-  *
-  * Example: \include MatrixBase_cwiseSign.cpp
-  * Output: \verbinclude MatrixBase_cwiseSign.out
-  *
-  */
+/// \returns an expression of the coefficient-wise signum of *this.
+///
+/// Example: \include MatrixBase_cwiseSign.cpp
+/// Output: \verbinclude MatrixBase_cwiseSign.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSignReturnType
 cwiseSign() const { return CwiseSignReturnType(derived()); }
 
 
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
-  *
-  * \sa cwiseProduct()
-  */
+/// \returns an expression of the coefficient-wise inverse of *this.
+///
+/// Example: \include MatrixBase_cwiseInverse.cpp
+/// Output: \verbinclude MatrixBase_cwiseInverse.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
+///
+/// \sa cwiseProduct()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseInverseReturnType
 cwiseInverse() const { return CwiseInverseReturnType(derived()); }
+
+
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 6f8d6bc01..37fd2740e 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1612,7 +1612,8 @@ EXPAND_AS_DEFINED      = EIGEN_MAKE_TYPEDEFS \
                          EIGEN_EMPTY \
                          EIGEN_EULER_ANGLES_TYPEDEFS \
                          EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
-                         EIGEN_EULER_SYSTEM_TYPEDEF
+                         EIGEN_EULER_SYSTEM_TYPEDEF \
+                         EIGEN_DOC_UNARY_ADDONS
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
 # doxygen's preprocessor will remove all references to function-like macros

From ca7f061a5fe6200512da83fbc07cfef39bb85a96 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 16 Sep 2016 11:23:19 +0200
Subject: [PATCH 07/59] bug #828: clarify documentation of SparseMatrixBase's
 methods returning a sub-matrix.

---
 Eigen/src/Core/DenseBase.h              |    5 +-
 Eigen/src/SparseCore/SparseMatrixBase.h |    8 +-
 Eigen/src/plugins/BlockMethods.h        | 1016 ++++++++++++-----------
 doc/Doxyfile.in                         |    4 +-
 4 files changed, 551 insertions(+), 482 deletions(-)

diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 0ede9b041..c110bbf11 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -558,12 +558,15 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC void reverseInPlace();
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #   include "../plugins/BlockMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     // disable the use of evalTo for dense objects with a nice compilation error
     template<typename Dest>
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 2e639aa6f..966d401a3 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -142,9 +142,13 @@ template<typename Derived> class SparseMatrixBase
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
 #ifdef EIGEN_PARSED_BY_DOXYGEN
-#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)  /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. </p> */
 #else
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #endif
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
@@ -156,6 +160,8 @@ template<typename Derived> class SparseMatrixBase
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }
diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h
index 632094e15..b76973613 100644
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h
@@ -10,28 +10,28 @@
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-/** \internal expression type of a column */
+/// \internal expression type of a column
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ColXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr;
-/** \internal expression type of a row */
+/// \internal expression type of a row
 typedef Block<Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowXpr;
 typedef const Block<const Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns
 typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr;
 typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr;
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows
 typedef Block<Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowsBlockXpr;
 typedef const Block<const Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr;
-/** \internal expression type of a block of whole columns */
+/// \internal expression type of a block of whole columns
 template<int N> struct NColsBlockXpr { typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
 template<int N> struct ConstNColsBlockXpr { typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
-/** \internal expression type of a block of whole rows */
+/// \internal expression type of a block of whole rows
 template<int N> struct NRowsBlockXpr { typedef Block<Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
 template<int N> struct ConstNRowsBlockXpr { typedef const Block<const Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
-/** \internal expression of a block */
+/// \internal expression of a block
 typedef Block<Derived> BlockXpr;
 typedef const Block<const Derived> ConstBlockXpr;
-/** \internal expression of a block of fixed sizes */
+/// \internal expression of a block of fixed sizes
 template<int Rows, int Cols> struct FixedBlockXpr { typedef Block<Derived,Rows,Cols> Type; };
 template<int Rows, int Cols> struct ConstFixedBlockXpr { typedef Block<const Derived,Rows,Cols> Type; };
 
@@ -42,29 +42,31 @@ template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBloc
 
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns a dynamic-size expression of a block in *this.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  * \param blockRows the number of rows in the block
-  * \param blockCols the number of columns in the block
-  *
-  * Example: \include MatrixBase_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of a block in *this.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+/// \param blockRows the number of rows in the block
+/// \param blockCols the number of columns in the block
+///
+/// Example: \include MatrixBase_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int_int_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols)
 {
   return BlockXpr(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** This is the const version of block(Index,Index,Index,Index). */
+/// This is the const version of block(Index,Index,Index,Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
 {
@@ -74,39 +76,43 @@ inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows
 
 
 
-/** \returns a dynamic-size expression of a top-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a top-right corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr topRightCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of topRightCorner(Index, Index).*/
+/// This is the const version of topRightCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size top-right corner of *this.
-  *
-  * \tparam CRows the number of rows in the corner
-  * \tparam CCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
-  *
-  * \sa class Block, block<int,int>(Index,Index)
-  */
+/// \returns an expression of a fixed-size top-right corner of *this.
+///
+/// \tparam CRows the number of rows in the corner
+/// \tparam CCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block<int,int>(Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
@@ -114,7 +120,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/** This is the const version of topRightCorner<int, int>().*/
+/// This is the const version of topRightCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
@@ -122,30 +128,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() con
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/** \returns an expression of a top-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a top-right corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict each other. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of topRightCorner<int, int>(Index, Index).*/
+/// This is the const version of topRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
 {
@@ -154,38 +162,42 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index
 
 
 
-/** \returns a dynamic-size expression of a top-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a top-left corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr topLeftCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
-/** This is the const version of topLeftCorner(Index, Index).*/
+/// This is the const version of topLeftCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size top-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size top-left corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
@@ -193,7 +205,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/** This is the const version of topLeftCorner<int, int>().*/
+/// This is the const version of topLeftCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
@@ -201,30 +213,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() cons
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/** \returns an expression of a top-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a top-left corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict each other. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
-/** This is the const version of topLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of topLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
 {
@@ -233,38 +247,42 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index
 
 
 
-/** \returns a dynamic-size expression of a bottom-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a bottom-right corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr bottomRightCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of bottomRightCorner(Index, Index).*/
+/// This is the const version of bottomRightCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size bottom-right corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-right corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
@@ -272,7 +290,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>().*/
+/// This is the const version of bottomRightCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
@@ -280,30 +298,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/** \returns an expression of a bottom-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a bottom-right corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict each other. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/** This is the const version of bottomRightCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
 {
@@ -312,38 +332,42 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(In
 
 
 
-/** \returns a dynamic-size expression of a bottom-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a dynamic-size expression of a bottom-left corner of *this.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline BlockXpr bottomLeftCorner(Index cRows, Index cCols)
 {
   return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** This is the const version of bottomLeftCorner(Index, Index).*/
+/// This is the const version of bottomLeftCorner(Index, Index).
 EIGEN_DEVICE_FUNC
 inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const
 {
   return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** \returns an expression of a fixed-size bottom-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a fixed-size bottom-left corner of *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
@@ -351,7 +375,7 @@ inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>().*/
+/// This is the const version of bottomLeftCorner<int, int>().
 template<int CRows, int CCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
@@ -359,30 +383,32 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() c
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/** \returns an expression of a bottom-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
+/// \returns an expression of a bottom-left corner of *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict each other. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
 template<int CRows, int CCols>
 inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/** This is the const version of bottomLeftCorner<int, int>(Index, Index).*/
+/// This is the const version of bottomLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
 inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
 {
@@ -391,41 +417,45 @@ inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Ind
 
 
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_topRows_int.cpp
-  * Output: \verbinclude MatrixBase_topRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the top rows of *this.
+///
+/// \param n the number of rows in the block
+///
+/// Example: \include MatrixBase_topRows_int.cpp
+/// Output: \verbinclude MatrixBase_topRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline RowsBlockXpr topRows(Index n)
 {
   return RowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/** This is the const version of topRows(Index).*/
+/// This is the const version of topRows(Index).
 EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr topRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_topRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_topRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the top rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict each other. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_topRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_topRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
@@ -433,7 +463,7 @@ inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
   return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
-/** This is the const version of topRows<int>().*/
+/// This is the const version of topRows<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
@@ -443,41 +473,45 @@ inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 
 
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_bottomRows_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the bottom rows of *this.
+///
+/// \param n the number of rows in the block
+///
+/// Example: \include MatrixBase_bottomRows_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline RowsBlockXpr bottomRows(Index n)
 {
   return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/** This is the const version of bottomRows(Index).*/
+/// This is the const version of bottomRows(Index).
 EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr bottomRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_bottomRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_bottomRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the bottom rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict each other. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_bottomRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_bottomRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
@@ -485,7 +519,7 @@ inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
   return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
-/** This is the const version of bottomRows<int>().*/
+/// This is the const version of bottomRows<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
@@ -495,43 +529,47 @@ inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 
 
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block
-  *
-  * Example: \include DenseBase_middleRows_int.cpp
-  * Output: \verbinclude DenseBase_middleRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of rows of *this.
+///
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block
+///
+/// Example: \include DenseBase_middleRows_int.cpp
+/// Output: \verbinclude DenseBase_middleRows_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline RowsBlockXpr middleRows(Index startRow, Index n)
 {
   return RowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/** This is the const version of middleRows(Index,Index).*/
+/// This is the const version of middleRows(Index,Index).
 EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
 {
   return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleRows.cpp
-  * Output: \verbinclude DenseBase_template_int_middleRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of rows of *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict each other. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleRows.cpp
+/// Output: \verbinclude DenseBase_template_int_middleRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
@@ -539,7 +577,7 @@ inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
   return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
-/** This is the const version of middleRows<int>().*/
+/// This is the const version of middleRows<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
@@ -549,41 +587,45 @@ inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n =
 
 
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_leftCols_int.cpp
-  * Output: \verbinclude MatrixBase_leftCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the left columns of *this.
+///
+/// \param n the number of columns in the block
+///
+/// Example: \include MatrixBase_leftCols_int.cpp
+/// Output: \verbinclude MatrixBase_leftCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline ColsBlockXpr leftCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/** This is the const version of leftCols(Index).*/
+/// This is the const version of leftCols(Index).
 EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr leftCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_leftCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_leftCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the left columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_leftCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_leftCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
@@ -591,7 +633,7 @@ inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
   return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
-/** This is the const version of leftCols<int>().*/
+/// This is the const version of leftCols<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
@@ -601,41 +643,45 @@ inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 
 
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_rightCols_int.cpp
-  * Output: \verbinclude MatrixBase_rightCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the right columns of *this.
+///
+/// \param n the number of columns in the block
+///
+/// Example: \include MatrixBase_rightCols_int.cpp
+/// Output: \verbinclude MatrixBase_rightCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline ColsBlockXpr rightCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/** This is the const version of rightCols(Index).*/
+/// This is the const version of rightCols(Index).
 EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr rightCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_rightCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_rightCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of the right columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_rightCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_rightCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
@@ -643,7 +689,7 @@ inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
   return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
-/** This is the const version of rightCols<int>().*/
+/// This is the const version of rightCols<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
@@ -653,43 +699,47 @@ inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 
 
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \param startCol the index of the first column in the block
-  * \param numCols the number of columns in the block
-  *
-  * Example: \include DenseBase_middleCols_int.cpp
-  * Output: \verbinclude DenseBase_middleCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of columns of *this.
+///
+/// \param startCol the index of the first column in the block
+/// \param numCols the number of columns in the block
+///
+/// Example: \include DenseBase_middleCols_int.cpp
+/// Output: \verbinclude DenseBase_middleCols_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline ColsBlockXpr middleCols(Index startCol, Index numCols)
 {
   return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/** This is the const version of middleCols(Index,Index).*/
+/// This is the const version of middleCols(Index,Index).
 EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
 {
   return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param startCol the index of the first column in the block
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleCols.cpp
-  * Output: \verbinclude DenseBase_template_int_middleCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a block consisting of a range of columns of *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param startCol the index of the first column in the block
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleCols.cpp
+/// Output: \verbinclude DenseBase_template_int_middleCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
@@ -697,7 +747,7 @@ inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
   return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
-/** This is the const version of middleCols<int>().*/
+/// This is the const version of middleCols<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
@@ -707,22 +757,24 @@ inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n =
 
 
 
-/** \returns a fixed-size expression of a block in *this.
-  *
-  * The template parameters \a NRows and \a NCols are the number of
-  * rows and columns in the block.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  *
-  * Example: \include MatrixBase_block_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int.out
-  *
-  * \note since block is a templated member, the keyword template has to be used
-  * if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns a fixed-size expression of a block in *this.
+///
+/// The template parameters \a NRows and \a NCols are the number of
+/// rows and columns in the block.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+///
+/// Example: \include MatrixBase_block_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int.out
+///
+/// \note since block is a templated member, the keyword template has to be used
+/// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int NRows, int NCols>
 EIGEN_DEVICE_FUNC
 inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
@@ -730,7 +782,7 @@ inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index sta
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/** This is the const version of block<>(Index, Index). */
+/// This is the const version of block<>(Index, Index).
 template<int NRows, int NCols>
 EIGEN_DEVICE_FUNC
 inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
@@ -738,25 +790,27 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/** \returns an expression of a block in *this.
-  *
-  * \tparam NRows number of rows in block as specified at compile-time
-  * \tparam NCols number of columns in block as specified at compile-time
-  * \param  startRow  the first row in the block
-  * \param  startCol  the first column in the block
-  * \param  blockRows number of rows in block as specified at run-time
-  * \param  blockCols number of columns in block as specified at run-time
-  *
-  * This function is mainly useful for blocks where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a blockRows should equal \a NRows unless
-  * \a NRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
+/// \returns an expression of a block in *this.
+///
+/// \tparam NRows number of rows in block as specified at compile-time
+/// \tparam NCols number of columns in block as specified at compile-time
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in block as specified at run-time
+/// \param  blockCols number of columns in block as specified at run-time
+///
+/// This function is mainly useful for blocks where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a blockRows should equal \a NRows unless
+/// \a NRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block(Index,Index,Index,Index)
+///
 template<int NRows, int NCols>
 inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                   Index blockRows, Index blockCols)
@@ -764,7 +818,7 @@ inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index sta
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** This is the const version of block<>(Index, Index, Index, Index). */
+/// This is the const version of block<>(Index, Index, Index, Index).
 template<int NRows, int NCols>
 inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                               Index blockRows, Index blockCols) const
@@ -772,60 +826,64 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/** \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_col.cpp
-  * Output: \verbinclude MatrixBase_col.out
-  *
-  * \sa row(), class Block */
+/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_col.cpp
+/// Output: \verbinclude MatrixBase_col.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
+///
+/// \sa row(), class Block
 EIGEN_DEVICE_FUNC
 inline ColXpr col(Index i)
 {
   return ColXpr(derived(), i);
 }
 
-/** This is the const version of col(). */
+/// This is the const version of col().
 EIGEN_DEVICE_FUNC
 inline ConstColXpr col(Index i) const
 {
   return ConstColXpr(derived(), i);
 }
 
-/** \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_row.cpp
-  * Output: \verbinclude MatrixBase_row.out
-  *
-  * \sa col(), class Block */
+/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_row.cpp
+/// Output: \verbinclude MatrixBase_row.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
+///
+/// \sa col(), class Block
 EIGEN_DEVICE_FUNC
 inline RowXpr row(Index i)
 {
   return RowXpr(derived(), i);
 }
 
-/** This is the const version of row(). */
+/// This is the const version of row().
 EIGEN_DEVICE_FUNC
 inline ConstRowXpr row(Index i) const
 {
   return ConstRowXpr(derived(), i);
 }
 
-/** \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
-  *
-  * \only_for_vectors
-  *
-  * \param start the first coefficient in the segment
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_segment_int_int.cpp
-  * Output: \verbinclude MatrixBase_segment_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, segment(Index)
-  */
+/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
+///
+/// \only_for_vectors
+///
+/// \param start the first coefficient in the segment
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_segment_int_int.cpp
+/// Output: \verbinclude MatrixBase_segment_int_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, segment(Index)
+///
 EIGEN_DEVICE_FUNC
 inline SegmentReturnType segment(Index start, Index n)
 {
@@ -834,7 +892,7 @@ inline SegmentReturnType segment(Index start, Index n)
 }
 
 
-/** This is the const version of segment(Index,Index).*/
+/// This is the const version of segment(Index,Index).
 EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType segment(Index start, Index n) const
 {
@@ -842,21 +900,21 @@ inline ConstSegmentReturnType segment(Index start, Index n) const
   return ConstSegmentReturnType(derived(), start, n);
 }
 
-/** \returns a dynamic-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_start_int.cpp
-  * Output: \verbinclude MatrixBase_start_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of the first coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_start_int.cpp
+/// Output: \verbinclude MatrixBase_start_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline SegmentReturnType head(Index n)
 {
@@ -864,7 +922,7 @@ inline SegmentReturnType head(Index n)
   return SegmentReturnType(derived(), 0, n);
 }
 
-/** This is the const version of head(Index).*/
+/// This is the const version of head(Index).
 EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType head(Index n) const
 {
@@ -872,21 +930,21 @@ inline ConstSegmentReturnType head(Index n) const
   return ConstSegmentReturnType(derived(), 0, n);
 }
 
-/** \returns a dynamic-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_end_int.cpp
-  * Output: \verbinclude MatrixBase_end_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
+/// \returns a dynamic-size expression of the last coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+///
+/// Example: \include MatrixBase_end_int.cpp
+/// Output: \verbinclude MatrixBase_end_int.out
+///
+/// \note Even though the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
 EIGEN_DEVICE_FUNC
 inline SegmentReturnType tail(Index n)
 {
@@ -894,7 +952,7 @@ inline SegmentReturnType tail(Index n)
   return SegmentReturnType(derived(), this->size() - n, n);
 }
 
-/** This is the const version of tail(Index).*/
+/// This is the const version of tail(Index).
 EIGEN_DEVICE_FUNC
 inline ConstSegmentReturnType tail(Index n) const
 {
@@ -902,22 +960,22 @@ inline ConstSegmentReturnType tail(Index n) const
   return ConstSegmentReturnType(derived(), this->size() - n, n);
 }
 
-/** \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param start the index of the first element in the segment
-  * \param n the number of coefficients in the segment as specified at compile-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_segment.cpp
-  * Output: \verbinclude MatrixBase_template_int_segment.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param start the index of the first element in the segment
+/// \param n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_segment.cpp
+/// Output: \verbinclude MatrixBase_template_int_segment.out
+///
+/// \sa class Block
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
@@ -926,7 +984,7 @@ inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N
   return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** This is the const version of segment<int>(Index).*/
+/// This is the const version of segment<int>(Index).
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
@@ -935,21 +993,21 @@ inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/** \returns a fixed-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_start.cpp
-  * Output: \verbinclude MatrixBase_template_int_start.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the first coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_start.cpp
+/// Output: \verbinclude MatrixBase_template_int_start.out
+///
+/// \sa class Block
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
@@ -958,7 +1016,7 @@ inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
   return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** This is the const version of head<int>().*/
+/// This is the const version of head<int>().
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
@@ -967,21 +1025,21 @@ inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/** \returns a fixed-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_end.cpp
-  * Output: \verbinclude MatrixBase_template_int_end.out
-  *
-  * \sa class Block
-  */
+/// \returns a fixed-size expression of the last coefficients of *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_end.cpp
+/// Output: \verbinclude MatrixBase_template_int_end.out
+///
+/// \sa class Block
+///
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
@@ -990,7 +1048,7 @@ inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
   return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
 }
 
-/** This is the const version of tail<int>.*/
+/// This is the const version of tail<int>.
 template<int N>
 EIGEN_DEVICE_FUNC
 inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 37fd2740e..e9b116d28 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1613,7 +1613,9 @@ EXPAND_AS_DEFINED      = EIGEN_MAKE_TYPEDEFS \
                          EIGEN_EULER_ANGLES_TYPEDEFS \
                          EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
                          EIGEN_EULER_SYSTEM_TYPEDEF \
-                         EIGEN_DOC_UNARY_ADDONS
+                         EIGEN_DOC_UNARY_ADDONS \
+                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
+                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
 # doxygen's preprocessor will remove all references to function-like macros

From ee62f168e6be801214c732a1e7a048a388dede47 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 16 Sep 2016 11:26:25 +0200
Subject: [PATCH 08/59] Doc: add link from block methods to respective tutorial
 section.

---
 Eigen/src/SparseCore/SparseMatrixBase.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 966d401a3..8816bcff4 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -143,8 +143,8 @@ template<typename Derived> class SparseMatrixBase
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
-#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. </p> */
-#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
 #else
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL

From 18f6e47815e2e771a0043d90ed52f853158c416c Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 16 Sep 2016 11:32:54 +0200
Subject: [PATCH 09/59] Fix order of "static inline".

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index fc75dbb5c..1ba8d6328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -190,25 +190,25 @@ struct reducer_traits<MeanReducer<T>, Device> {
 
 template <typename T, bool IsMax = true, bool IsInteger = true>
 struct MinMaxBottomValue {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::lowest();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, true, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return -Eigen::NumTraits<T>::infinity();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, true> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::highest();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::infinity();
   }
 };

From 4adeababf90f318cd2181a945211bae83fdea930 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 16 Sep 2016 11:46:46 +0200
Subject: [PATCH 10/59] Fix underflow

---
 test/svd_fill.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/svd_fill.h b/test/svd_fill.h
index a705fa011..a1f752c66 100644
--- a/test/svd_fill.h
+++ b/test/svd_fill.h
@@ -7,6 +7,14 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+template<typename T>
+Array<T,4,1> four_denorms();
+
+template<>
+Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
+template<>
+Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+
 template<typename MatrixType>
 void svd_fill_random(MatrixType &m, int Option = 0)
 {
@@ -55,7 +63,8 @@ void svd_fill_random(MatrixType &m, int Option = 0)
   }
   
   Matrix<Scalar,Dynamic,1> samples(9);
-  samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
+  samples << 0, four_denorms<RealScalar>(),
+            -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
   
   if(Option==Symmetric)
   {

From 6edd2e2851ca9080a43afa5ee64031a92750efdd Mon Sep 17 00:00:00 2001
From: Emil Fresk <emil.fresk@gmail.com>
Date: Fri, 16 Sep 2016 14:03:55 +0200
Subject: [PATCH 11/59] Made AutoDiffJacobian more intuitive to use and updated
 for C++11

Changes:
* Removed unnecessary types from the Functor by inferring from its types
* Removed inputs() function reference, replaced with .rows()
* Updated the forward constructor to use variadic templates
* Added optional parameters to the Functor for passing parameters,
  control signals, etc
* Has been tested with fixed size and dynamic matrices

Amendment by chtz: overload operator() for compatibility with not fully conforming compilers
---
 .../Eigen/src/AutoDiff/AutoDiffJacobian.h     | 51 ++++++++---
 unsupported/test/autodiff.cpp                 | 90 ++++++++++++++++++-
 2 files changed, 126 insertions(+), 15 deletions(-)

diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
index 1a61e3367..33b6c393f 100644
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
@@ -20,37 +20,60 @@ public:
   AutoDiffJacobian(const Functor& f) : Functor(f) {}
 
   // forward constructors
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  template<typename... T>
+  AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
+#else
   template<typename T0>
   AutoDiffJacobian(const T0& a0) : Functor(a0) {}
   template<typename T0, typename T1>
   AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
   template<typename T0, typename T1, typename T2>
   AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
-
-  enum {
-    InputsAtCompileTime = Functor::InputsAtCompileTime,
-    ValuesAtCompileTime = Functor::ValuesAtCompileTime
-  };
+#endif
 
   typedef typename Functor::InputType InputType;
   typedef typename Functor::ValueType ValueType;
-  typedef typename Functor::JacobianType JacobianType;
-  typedef typename JacobianType::Scalar Scalar;
+  typedef typename ValueType::Scalar Scalar;
+
+  enum {
+    InputsAtCompileTime = InputType::RowsAtCompileTime,
+    ValuesAtCompileTime = ValueType::RowsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
   typedef typename JacobianType::Index Index;
 
-  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
+  typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
   typedef AutoDiffScalar<DerivativeType> ActiveScalar;
 
-
   typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
   typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  // Some compilers don't accept variadic parameters after a default parameter,
+  // i.e., we can't just write _jac=0 but we need to overload operator():
+  EIGEN_STRONG_INLINE
+  void operator() (const InputType& x, ValueType* v) const
+  {
+      this->operator()(x, v, 0);
+  }
+  template<typename... ParamsType>
+  void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
+                   const ParamsType&... Params) const
+#else
   void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
+#endif
   {
     eigen_assert(v!=0);
+
     if (!_jac)
     {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+      Functor::operator()(x, v, Params...);
+#else
       Functor::operator()(x, v);
+#endif
       return;
     }
 
@@ -61,12 +84,16 @@ public:
 
     if(InputsAtCompileTime==Dynamic)
       for (Index j=0; j<jac.rows(); j++)
-        av[j].derivatives().resize(this->inputs());
+        av[j].derivatives().resize(x.rows());
 
     for (Index i=0; i<jac.cols(); i++)
-      ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
+      ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+    Functor::operator()(ax, &av, Params...);
+#else
     Functor::operator()(ax, &av);
+#endif
 
     for (Index i=0; i<jac.rows(); i++)
     {
@@ -74,8 +101,6 @@ public:
       jac.row(i) = av[i].derivatives();
     }
   }
-protected:
-
 };
 
 }
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 2da6dd8f3..85743137e 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -105,6 +105,89 @@ struct TestFunc1
   }
 };
 
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+/* Test functor for the C++11 features. */
+template <typename Scalar>
+struct integratorFunctor
+{
+    typedef Matrix<Scalar, 2, 1> InputType;
+    typedef Matrix<Scalar, 2, 1> ValueType;
+
+    /*
+     * Implementation starts here.
+     */
+    integratorFunctor(const Scalar gain) : _gain(gain) {}
+    integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
+    const Scalar _gain;
+
+    template <typename T1, typename T2>
+    void operator() (const T1 &input, T2 *output, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+    }
+
+    /* Only needed for the test */
+    template <typename T1, typename T2, typename T3>
+    void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+
+        if (jacobian)
+        {
+            T3 &j = *jacobian;
+
+            j(0, 0) = 1;
+            j(0, 1) = dt * _gain;
+            j(1, 0) = 0;
+            j(1, 1) = _gain;
+        }
+    }
+
+};
+
+template<typename Func> void forward_jacobian_cpp11(const Func& f)
+{
+    typedef typename Func::ValueType::Scalar Scalar;
+    typedef typename Func::ValueType ValueType;
+    typedef typename Func::InputType InputType;
+    typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;
+
+    InputType x = InputType::Random(InputType::RowsAtCompileTime);
+    ValueType y, yref;
+    JacobianType j, jref;
+
+    const Scalar dt = internal::random<double>();
+
+    jref.setZero();
+    yref.setZero();
+    f(x, &yref, &jref, dt);
+
+    //std::cerr << "y, yref, jref: " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << yref << "\n\n";
+    //std::cerr << jref << "\n\n";
+
+    AutoDiffJacobian<Func> autoj(f);
+    autoj(x, &y, &j, dt);
+
+    //std::cerr << "y j (via autodiff): " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << j << "\n\n";
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+#endif
+
 template<typename Func> void forward_jacobian(const Func& f)
 {
     typename Func::InputType x = Func::InputType::Random(f.inputs());
@@ -128,7 +211,6 @@ template<typename Func> void forward_jacobian(const Func& f)
     VERIFY_IS_APPROX(j, jref);
 }
 
-
 // TODO also check actual derivatives!
 template <int>
 void test_autodiff_scalar()
@@ -141,6 +223,7 @@ void test_autodiff_scalar()
   VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
 }
 
+
 // TODO also check actual derivatives!
 template <int>
 void test_autodiff_vector()
@@ -151,7 +234,7 @@ void test_autodiff_vector()
   VectorAD ap = p.cast<AD>();
   ap.x().derivatives() = Vector2f::UnitX();
   ap.y().derivatives() = Vector2f::UnitY();
-  
+
   AD res = foo<VectorAD>(ap);
   VERIFY_IS_APPROX(res.value(), foo(p));
 }
@@ -164,6 +247,9 @@ void test_autodiff_jacobian()
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
   CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
+#endif
 }
 
 

From ce3557ca69742af477546d031d644a6dab1ff614 Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg <chtz@informatik.uni-bremen.de>
Date: Fri, 16 Sep 2016 14:24:47 +0200
Subject: [PATCH 12/59] Make makeHouseholder more stable for cases where
 real(c0) is not very small (but the rest is).

---
 Eigen/src/Householder/Householder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 4c1f499a1..31f804a99 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -77,7 +77,7 @@ void MatrixBase<Derived>::makeHouseholder(
   Scalar c0 = coeff(0);
   const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
 
-  if(tailSqNorm <= tol && numext::abs2(numext::imag(c0))<=tol)
+  if(tailSqNorm <= tol && numext::abs2(c0)<=tol)
   {
     tau = RealScalar(0);
     beta = numext::real(c0);

From 4cc2c73e6ac9bf0a5d7ad59ad43627353c380b02 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 17 Sep 2016 12:52:27 +0200
Subject: [PATCH 13/59] Fix alignment of statically allocated temporaries in
 gemv.

---
 Eigen/src/Core/GeneralProduct.h | 18 +++++++++---------
 test/product_small.cpp          |  3 ++-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index bff322b3c..a8c83f168 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize      = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
   
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
     ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
diff --git a/test/product_small.cpp b/test/product_small.cpp
index 3e8dab01e..0db50b949 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -213,7 +213,8 @@ void test_product_small()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
-    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
+    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
+    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );

From de05a18fe0d9903b77182df41155794248fa0b09 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 17 Sep 2016 14:13:48 +0200
Subject: [PATCH 14/59] fix compilation with boost::multiprec

---
 test/svd_fill.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/svd_fill.h b/test/svd_fill.h
index a1f752c66..5c2c61f8e 100644
--- a/test/svd_fill.h
+++ b/test/svd_fill.h
@@ -8,7 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 template<typename T>
-Array<T,4,1> four_denorms();
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
 
 template<>
 Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }

From bf03820339f45f9483ddbe5bb927ca3078fda19b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 17 Sep 2016 14:14:01 +0200
Subject: [PATCH 15/59] Silence warning.

---
 test/fastmath.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/fastmath.cpp b/test/fastmath.cpp
index 438e6b2e5..cc5db0746 100644
--- a/test/fastmath.cpp
+++ b/test/fastmath.cpp
@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
     VERIFY( !m.allFinite() );
     VERIFY(  m.hasNaN() );
   }
-  m(4) /= T(0.0);
+  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
+  m(4) /= hidden_zero;
   if(dryrun)
   {
     std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";

From 5dcc6d301a822db0fb95d5b9e010c0eb39d85bbc Mon Sep 17 00:00:00 2001
From: Hongkai Dai <daihongkai@gmail.com>
Date: Mon, 19 Sep 2016 10:30:30 -0700
Subject: [PATCH 16/59] remove ternary operator in euler angles

---
 Eigen/src/Geometry/EulerAngles.h                | 14 ++++++++++++--
 unsupported/Eigen/src/EulerAngles/EulerSystem.h | 14 ++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Geometry/EulerAngles.h b/Eigen/src/Geometry/EulerAngles.h
index b875b7a13..4865e58aa 100644
--- a/Eigen/src/Geometry/EulerAngles.h
+++ b/Eigen/src/Geometry/EulerAngles.h
@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,i), coeff(k,i));
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
     {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
       res[1] = -atan2(s2, coeff(i,i));
     }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,k), coeff(k,k));
     Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       res[1] = atan2(-coeff(i,k), -c2);
     }
     else
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 82243e643..98f9f647d 100644
--- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -189,7 +189,12 @@ namespace Eigen
       res[0] = atan2(mat(J,K), mat(K,K));
       Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
       if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         res[1] = atan2(-mat(I,K), -c2);
       }
       else
@@ -212,7 +217,12 @@ namespace Eigen
       res[0] = atan2(mat(J,I), mat(K,I));
       if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
       {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
         res[1] = -atan2(s2, mat(I,I));
       }

From c3ca9b1e763ca74f953ee1cff43f2e0ba9d2edf9 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 19 Sep 2016 11:33:39 -0700
Subject: [PATCH 17/59] Deleted some unnecessary and confusing EIGEN_DEVICE_FUNC

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 93 ++++---------------
 1 file changed, 18 insertions(+), 75 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1468caa23..28c6f7626 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -168,39 +168,20 @@ struct GpuDevice {
     return stream_->stream();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return stream_->allocate(num_bytes);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     stream_->deallocate(buffer);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void* scratchpad() const {
     return stream_->scratchpad();
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
     return stream_->semaphore();
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -210,30 +191,22 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -242,21 +215,21 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE size_t numThreads() const {
     // FIXME
     return 32;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     // FIXME
     return 48*1024;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
     // We won't try to take advantage of the l2 cache for the time being, and
     // there is no l3 cache on cuda devices.
     return firstLevelCacheSize();
@@ -276,56 +249,26 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
     return stream_->deviceProperties().multiProcessorCount;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
     return stream_->deviceProperties().maxThreadsPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
     return stream_->deviceProperties().sharedMemPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     return stream_->deviceProperties().major;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
     return stream_->deviceProperties().minor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+  EIGEN_STRONG_INLINE int maxBlocks() const {
     return max_blocks_;
   }
 

From 59e9edfbf1e1a58d1e2401ec2f05b6fdd19fd87c Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 19 Sep 2016 14:13:20 -0700
Subject: [PATCH 18/59] Removed EIGEN_DEVICE_FUNC qualifiers for the lu(),
 fullPivLu(), partialPivLu(), and inverse() functions since they aren't ready
 to run on GPU

---
 Eigen/src/Core/MatrixBase.h | 4 ----
 Eigen/src/LU/FullPivLU.h    | 2 +-
 Eigen/src/LU/InverseImpl.h  | 2 +-
 Eigen/src/LU/PartialPivLU.h | 4 ++--
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 19a7483ba..d56df8249 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -330,15 +330,11 @@ template<typename Derived> class MatrixBase
 
 /////////// LU module ///////////
 
-    EIGEN_DEVICE_FUNC
     inline const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> partialPivLu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> lu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index ebcd5c208..03b6af706 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -879,7 +879,7 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_
   *
   * \sa class FullPivLU
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::fullPivLu() const
 {
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index 147f9496c..3134632e1 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -327,7 +327,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
   *
   * \sa computeInverseAndDetWithCheck()
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
 {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 13394fffa..d43961887 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -584,7 +584,7 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi
   *
   * \sa class PartialPivLU
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::partialPivLu() const
 {
@@ -599,7 +599,7 @@ MatrixBase<Derived>::partialPivLu() const
   *
   * \sa class PartialPivLU
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::lu() const
 {

From b2c6dc48d9189eb96f878aa6028aec245eadde85 Mon Sep 17 00:00:00 2001
From: RJ Ryan <rjryan@google.com>
Date: Tue, 20 Sep 2016 07:18:20 -0700
Subject: [PATCH 19/59] Add CUDA-specific std::complex<T> specializations for
 scalar_sum_op, scalar_difference_op, scalar_product_op, and
 scalar_quotient_op.

---
 Eigen/Core                                    |  1 +
 Eigen/src/Core/arch/CUDA/Complex.h            | 80 +++++++++++++++
 unsupported/test/CMakeLists.txt               |  1 +
 .../cxx11_tensor_complex_cwise_ops_cuda.cu    | 97 +++++++++++++++++++
 4 files changed, 179 insertions(+)
 create mode 100644 Eigen/src/Core/arch/CUDA/Complex.h
 create mode 100644 unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu

diff --git a/Eigen/Core b/Eigen/Core
index 3ffd6220b..bf2479585 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -359,6 +359,7 @@ using std::ptrdiff_t;
   #include "src/Core/arch/ZVector/Complex.h"
 #endif
 
+#include "src/Core/arch/CUDA/Complex.h"
 // Half float support
 #include "src/Core/arch/CUDA/Half.h"
 #include "src/Core/arch/CUDA/PacketMathHalf.h"
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
new file mode 100644
index 000000000..aa511a4b2
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+
+template<typename T> struct scalar_sum_op<std::complex<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_difference_op<std::complex<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T>> {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasMul
+  };
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T>> {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+  };
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_CUDA_H
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 714046809..9eac6ec73 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
   ei_add_test(cxx11_tensor_complex_cuda)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
   ei_add_test(cxx11_tensor_reduction_cuda)
   ei_add_test(cxx11_tensor_argmax_cuda)
   ei_add_test(cxx11_tensor_cast_float16_cuda)
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
new file mode 100644
index 000000000..54c17ca28
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -0,0 +1,97 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename T>
+void test_cuda_complex_cwise_ops() {  // Runs +, -, *, / on std::complex<T> tensors on the GPU and compares with the host-side results.
+  const int kNumItems = 2;
+  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
+
+  std::complex<T>* d_in1;  // device buffers, released with cudaFree at the end of the test
+  std::complex<T>* d_in2;
+  std::complex<T>* d_out;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out), complex_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
+      d_out, kNumItems);
+
+  const std::complex<T> a(3.14f, 2.7f);
+  const std::complex<T> b(-10.6f, 1.4f);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
+  gpu_in2.device(gpu_device) = gpu_in2.constant(b);
+
+  enum CwiseOp {
+    Add,
+    Sub,
+    Mul,
+    Div
+  };
+
+  Tensor<std::complex<T>, 1, 0, int> actual(2);
+  for (CwiseOp op : {Add, Sub, Mul, Div}) {
+    std::complex<T> expected;
+    switch (op) {
+      case Add:
+        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+        expected = a + b;
+        break;
+      case Sub:
+        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
+        expected = a - b;
+        break;
+      case Mul:
+        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
+        expected = a * b;
+        break;
+      case Div:
+        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
+        expected = a / b;
+        break;
+    }
+    VERIFY(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
+                           gpu_device.stream()) == cudaSuccess);
+    VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);  // VERIFY, not assert(): the copy and sync must still run when NDEBUG is defined.
+
+    for (int i = 0; i < kNumItems; ++i) {  // every element was filled with the same constant, so each must match `expected`
+      VERIFY_IS_APPROX(actual(i), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_complex_cwise_ops()
+{
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
+}

From 608b1acd6d43bff23b3b136c546d88c939e4d37d Mon Sep 17 00:00:00 2001
From: RJ Ryan <rjryan@google.com>
Date: Tue, 20 Sep 2016 07:49:05 -0700
Subject: [PATCH 20/59] Don't use c++11 features and fix include.

---
 .../test/cxx11_tensor_complex_cwise_ops_cuda.cu        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
index 54c17ca28..2baf5eaad 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -15,7 +15,7 @@
 #include <cuda_fp16.h>
 #endif
 #include "main.h"
-#include <Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
 
@@ -48,16 +48,16 @@ void test_cuda_complex_cwise_ops() {
   gpu_in2.device(gpu_device) = gpu_in2.constant(b);
 
   enum CwiseOp {
-    Add,
+    Add = 0,
     Sub,
     Mul,
     Div
   };
 
-  Tensor<std::complex<T>, 1, 0, int> actual(2);
-  for (CwiseOp op : {Add, Sub, Mul, Div}) {
+  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
+  for (int op = Add; op <= Div; op++) {
     std::complex<T> expected;
-    switch (op) {
+    switch (static_cast<CwiseOp>(op)) {
       case Add:
         gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
         expected = a + b;

From 26f99075425cd9bf1db31d6d76a5b08570162bd2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 20 Sep 2016 12:58:03 -0700
Subject: [PATCH 21/59] Added missing typedefs

---
 Eigen/src/Core/arch/CUDA/Complex.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index aa511a4b2..f133b2db9 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -25,6 +25,8 @@ namespace internal {
 // building for CUDA to avoid non-constexpr methods.
 
 template<typename T> struct scalar_sum_op<std::complex<T>> {
+  typedef typename std::complex<T> result_type;
+
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     return std::complex<T>(numext::real(a) + numext::real(b),
@@ -33,6 +35,8 @@ template<typename T> struct scalar_sum_op<std::complex<T>> {
 };
 
 template<typename T> struct scalar_difference_op<std::complex<T>> {
+  typedef typename std::complex<T> result_type;
+
   EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     return std::complex<T>(numext::real(a) - numext::real(b),
@@ -44,6 +48,8 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T>>
   enum {
     Vectorizable = packet_traits<std::complex<T>>::HasMul
   };
+  typedef typename std::complex<T> result_type;
+
   EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     const T a_real = numext::real(a);
@@ -59,6 +65,8 @@ template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T>>
   enum {
     Vectorizable = packet_traits<std::complex<T>>::HasDiv
   };
+  typedef typename std::complex<T> result_type;
+
   EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     const T a_real = numext::real(a);

From 5269d11935b4169647d0a410c026728fa6f3708f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 21 Sep 2016 17:08:51 +0200
Subject: [PATCH 22/59] Fix compilation if ICC.

---
 test/svd_fill.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/svd_fill.h b/test/svd_fill.h
index 5c2c61f8e..3877c0c7e 100644
--- a/test/svd_fill.h
+++ b/test/svd_fill.h
@@ -8,12 +8,14 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 template<typename T>
-Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
+Array<T,4,1> four_denorms();
 
 template<>
 Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
 template<>
 Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+template<typename T>
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
 
 template<typename MatrixType>
 void svd_fill_random(MatrixType &m, int Option = 0)

From ac5377e16186c35ae1609245838825c07d0aa79f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 21 Sep 2016 17:26:04 +0200
Subject: [PATCH 23/59] Improve cost estimation of complex division

---
 Eigen/src/Core/util/XprHelper.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index fa60008ef..088a65240 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -671,6 +671,14 @@ struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
 
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
 
 template<bool Vectorized>
 struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };

From 9fa2c8650e06c964347311aaa571c06a07dca4bd Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 21 Sep 2016 17:34:24 +0200
Subject: [PATCH 24/59] Fix alignment of statically allocated temporaries in
 symv, and trmv.

---
 Eigen/src/Core/products/SelfadjointMatrixVector.h | 2 +-
 Eigen/src/Core/products/TriangularMatrixVector.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index d8d30267e..d97f8caa7 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
     
     eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
 
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index c11a983c7..4b292e74d 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
     
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

From 1fc3a21ed0ca27aef0a900b8b49e3dcb086e5157 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 21 Sep 2016 20:09:07 +0200
Subject: [PATCH 25/59] Disable a failure test if extended double precision is
 in use (x87)

---
 test/cholesky.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index 9a1f3792c..e4af80fe2 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -417,6 +417,7 @@ void cholesky_faillure_cases()
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
     VERIFY(ldlt.info()==NumericalIssue);
   }
+#if (!EIGEN_ARCH_i386) || EIGEN_VECTORIZE_SSE2
   {
     mat.resize(3,3);
     mat << -1, -3, 3,
@@ -426,6 +427,7 @@ void cholesky_faillure_cases()
     VERIFY(ldlt.info()==NumericalIssue);
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
   }
+#endif
   {
     mat.resize(3,3);
     mat <<  1, 2, 3,

From aecc51a3e8f15ab28fb5fbfda04e027213a4c732 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 21 Sep 2016 21:53:00 +0200
Subject: [PATCH 26/59] fix typo

---
 test/cholesky.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index e4af80fe2..8ad5ac639 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -417,7 +417,7 @@ void cholesky_faillure_cases()
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
     VERIFY(ldlt.info()==NumericalIssue);
   }
-#if (!EIGEN_ARCH_i386) || EIGEN_VECTORIZE_SSE2
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
   {
     mat.resize(3,3);
     mat << -1, -3, 3,

From 4b377715d7e62ba898c0bbd25672523b14214ceb Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg <chtz@informatik.uni-bremen.de>
Date: Thu, 22 Sep 2016 00:10:47 +0200
Subject: [PATCH 27/59] Do not manually add absolute path to boost-library.
 Also set C++ standard for blaze to C++14

---
 bench/btl/libs/blaze/CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt
index f8b1b2ec3..e99a0855c 100644
--- a/bench/btl/libs/blaze/CMakeLists.txt
+++ b/bench/btl/libs/blaze/CMakeLists.txt
@@ -1,10 +1,13 @@
 
 find_package(BLAZE)
-find_package(Boost)
+find_package(Boost COMPONENTS system)
 if (BLAZE_FOUND AND Boost_FOUND)
   include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
   btl_add_bench(btl_blaze main.cpp)
+  # Note: The newest blaze version requires C++14.
+  # Ideally, we should set this depending on the version of Blaze we found
+  set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
   if(BUILD_btl_blaze)
-    target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
+    target_link_libraries(btl_blaze ${Boost_LIBRARIES})
   endif()
 endif ()

From 66cbabafed7957a7f6c03b34df854149233de596 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 22 Sep 2016 11:18:52 +0200
Subject: [PATCH 28/59] Add a note regarding gcc bug #72867

---
 Eigen/src/Core/MathFunctionsImpl.h | 8 ++++++--
 test/packetmath.cpp                | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 0c77ee003..3c9ef22fa 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -29,8 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  const T x = pmax(minus_9, pmin(plus_9, a_x));
-
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  //      step such that if a_x is nan, x will be either 9 or -9,
+  //      and tanh will return 1 or -1 instead of nan.
+  //      This is supposed to be fixed in gcc6.3,
+  //      see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 77514d8a0..1394d9f2b 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -365,6 +365,7 @@ template<typename Scalar> void packetmath_real()
   }
 
   if (PacketTraits::HasTanh) {
+    // NOTE this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
     data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
     packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
     h.store(data2, internal::ptanh(h.load(data1)));

From 8bde7da0862ca0fcd1b643170b69eba245a1750c Mon Sep 17 00:00:00 2001
From: Felix Gruber <gruber@igpm.rwth-aachen.de>
Date: Thu, 22 Sep 2016 14:50:07 +0200
Subject: [PATCH 29/59] fix documentation of LinSpaced

The index of the highest value in a LinSpace is size-1.
---
 Eigen/src/Core/CwiseNullaryOp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index e3f20894d..25c3ef3d7 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
   * and yields faster code than the random access version.
   *
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -389,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
   * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors

From ca3746c6f8f788e4cc6ad9c88d1857c85be19a3b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 22 Sep 2016 22:07:13 +0200
Subject: [PATCH 30/59] Bypass identity reflectors.

---
 Eigen/src/Householder/Householder.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 31f804a99..e4d81629c 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);

From 50e3bbfc90e6f7c90a0b3fd046e09a1bb80b28a5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 22 Sep 2016 13:17:25 -0700
Subject: [PATCH 31/59] Calls x.imag() instead of imag(x) when x is a complex
 number since the former is a constexpr while the latter isn't. This fixes
 compilation errors triggered by nvcc on Mac.

---
 Eigen/src/Core/MathFunctions.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index fa322aca7..4d8f8970e 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1049,12 +1049,12 @@ double abs(const double &x) { return ::fabs(x); }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const std::complex<float>& x) {
-  return ::hypotf(real(x), imag(x));
+  return ::hypotf(x.real(), x.imag());
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const std::complex<double>& x) {
-  return ::hypot(real(x), imag(x));
+  return ::hypot(x.real(), x.imag());
 }
 #endif
 

From 9bcdc8b75669d2a2ec3a7e1fe6ae96854a0bc2e4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 22 Sep 2016 22:27:54 +0200
Subject: [PATCH 32/59] Add a nullary-functor example performing index-based
 sub-matrices.

---
 doc/CustomizingEigen_NullaryExpr.dox | 27 ++++++++++++
 doc/examples/CMakeLists.txt          |  5 +++
 doc/examples/nullary_indexing.cpp    | 66 ++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 doc/examples/nullary_indexing.cpp

diff --git a/doc/CustomizingEigen_NullaryExpr.dox b/doc/CustomizingEigen_NullaryExpr.dox
index d70f81065..37c8dcd89 100644
--- a/doc/CustomizingEigen_NullaryExpr.dox
+++ b/doc/CustomizingEigen_NullaryExpr.dox
@@ -53,6 +53,33 @@ showing that the program works as expected:
 
 This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
 
+
+\section NullaryExpr_Indexing Example 2: indexing rows and columns
+
+The goal here is to mimic MATLAB's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
+
+\snippet nullary_indexing.out main1
+
+To this end, let us first write a nullary-functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
+
+\snippet nullary_indexing.cpp functor
+
+Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:
+
+\snippet nullary_indexing.cpp function
+
+Finally, here is an example of how this function can be used:
+
+\snippet nullary_indexing.cpp main1
+
+This straightforward implementation is already quite powerful as the row or column index arrays can also be expressions to perform offsetting, modulo, striding, reverse, etc.
+
+\snippet nullary_indexing.cpp main2
+
+and the output is:
+
+\snippet nullary_indexing.out main2
+
 */
 
 }
diff --git a/doc/examples/CMakeLists.txt b/doc/examples/CMakeLists.txt
index 08cf8efd7..f7a19055f 100644
--- a/doc/examples/CMakeLists.txt
+++ b/doc/examples/CMakeLists.txt
@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
   )
   add_dependencies(all_examples ${example})
 endforeach(example_src)
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
+endif()
\ No newline at end of file
diff --git a/doc/examples/nullary_indexing.cpp b/doc/examples/nullary_indexing.cpp
new file mode 100644
index 000000000..e27c3585a
--- /dev/null
+++ b/doc/examples/nullary_indexing.cpp
@@ -0,0 +1,66 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [functor]
+template<class ArgType, class RowIndexType, class ColIndexType>
+class indexing_functor {
+  const ArgType &m_arg;
+  const RowIndexType &m_rowIndices;
+  const ColIndexType &m_colIndices;
+public:
+  typedef Matrix<typename ArgType::Scalar,
+                 RowIndexType::SizeAtCompileTime,
+                 ColIndexType::SizeAtCompileTime,
+                 ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
+                 RowIndexType::MaxSizeAtCompileTime,
+                 ColIndexType::MaxSizeAtCompileTime> MatrixType;
+
+  indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+    : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
+  {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    return m_arg(m_rowIndices[row], m_colIndices[col]);
+  }
+};
+// [functor]
+
+// [function]
+template <class ArgType, class RowIndexType, class ColIndexType>
+CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
+indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+{
+  typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
+  typedef typename Func::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
+}
+// [function]
+
+
+int main()
+{
+  std::cout << "[main1]\n";
+  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
+  Array3i ri(1,2,1);
+  ArrayXi ci(6); ci << 3,2,1,0,0,2;
+  Eigen::MatrixXi B = indexing(A, ri, ci);
+  std::cout << "A =" << std::endl;
+  std::cout << A << std::endl << std::endl;
+  std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
+  std::cout << B << std::endl;
+  std::cout << "[main1]\n";
+
+  std::cout << "[main2]\n";
+  B =  indexing(A, ri+1, ci);
+  std::cout << "A(ri+1,ci) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#if __cplusplus >= 201103L
+  B =  indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
+  std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#endif
+  std::cout << "[main2]\n";
+}
+

From ba0f844d6b8cc26cf315311f536239dbbd330464 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 22 Sep 2016 22:28:51 +0200
Subject: [PATCH 33/59] Backout changeset
 ce3557ca69742af477546d031d644a6dab1ff614

---
 Eigen/src/Householder/Householder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 31f804a99..4c1f499a1 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -77,7 +77,7 @@ void MatrixBase<Derived>::makeHouseholder(
   Scalar c0 = coeff(0);
   const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
 
-  if(tailSqNorm <= tol && numext::abs2(c0)<=tol)
+  if(tailSqNorm <= tol && numext::abs2(numext::imag(c0))<=tol)
   {
     tau = RealScalar(0);
     beta = numext::real(c0);

From 77e27fbeee7acb289d7df809fc09a8cc8ee94eb7 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 22 Sep 2016 22:37:39 +0200
Subject: [PATCH 34/59] bump to 3.3-rc1

---
 Eigen/src/Core/util/Macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index c1049f7b2..d65f92532 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -13,7 +13,7 @@
 
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 93
+#define EIGEN_MINOR_VERSION 94
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \

From 3946768916bfd4d22017af2db999c0bf61e9614c Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 22 Sep 2016 22:38:36 +0200
Subject: [PATCH 35/59] Added tag 3.3-rc1 for changeset
 77e27fbeee7acb289d7df809fc09a8cc8ee94eb7


From 2a69290ddb165b7103c87ba8f5b257eca23f62aa Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 22 Sep 2016 15:52:23 -0700
Subject: [PATCH 36/59] Added a specialization of Eigen::numext::real and
 Eigen::numext::imag for std::complex<T> to be used when compiling a cuda
 kernel. This is unfortunately necessary to be able to process complex numbers
 from a CUDA kernel on MacOS.

---
 Eigen/src/Core/MathFunctions.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 4d8f8970e..8d47fb8a4 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -97,6 +97,19 @@ struct real_default_impl<Scalar,true>
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct real_retval
 {
@@ -132,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct imag_retval
 {

From 1301d744f8dd1215336371046bd7c68b62dac810 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 22 Sep 2016 19:04:44 -0700
Subject: [PATCH 37/59] Made the gaussian generator usable on GPU

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 1ba8d6328..760074622 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -895,7 +895,7 @@ class GaussianGenerator {
     }
   }
 
-  T operator()(const array<Index, NumDims>& coordinates) const {
+  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
     T tmp = T(0);
     for (size_t i = 0; i < NumDims; ++i) {
       T offset = coordinates[i] - m_means[i];

From b9f7a17e47b88bb2b63909e9863148b42e8d8c15 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 23 Sep 2016 10:26:08 +0200
Subject: [PATCH 38/59] Add missing file.

---
 doc/examples/make_circulant2.cpp | 52 ++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 doc/examples/make_circulant2.cpp

diff --git a/doc/examples/make_circulant2.cpp b/doc/examples/make_circulant2.cpp
new file mode 100644
index 000000000..95d3dd31a
--- /dev/null
+++ b/doc/examples/make_circulant2.cpp
@@ -0,0 +1,52 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [circulant_func]
+template<class ArgType>
+class circulant_functor {
+  const ArgType &m_vec;
+public:
+  circulant_functor(const ArgType& arg) : m_vec(arg) {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    Index index = row - col;
+    if (index < 0) index += m_vec.size();
+    return m_vec(index);
+  }
+};
+// [circulant_func]
+
+// [square]
+template<class ArgType>
+struct circulant_helper {
+  typedef Matrix<typename ArgType::Scalar,
+                 ArgType::SizeAtCompileTime,
+                 ArgType::SizeAtCompileTime,
+                 ColMajor,
+                 ArgType::MaxSizeAtCompileTime,
+                 ArgType::MaxSizeAtCompileTime> MatrixType;
+};
+// [square]
+
+// [makeCirculant]
+template <class ArgType>
+CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
+makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
+{
+  typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));
+}
+// [makeCirculant]
+
+// [main]
+int main()
+{
+  Eigen::VectorXd vec(4);
+  vec << 1, 2, 4, 8;
+  Eigen::MatrixXd mat;
+  mat = makeCirculant(vec);
+  std::cout << mat << std::endl;
+}
+// [main]

From 86caba838db00ed977ed160a893d0a749856a215 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 23 Sep 2016 13:41:21 +0200
Subject: [PATCH 39/59] bug #1304: fix Projective * scaling and Projective *=
 scaling

---
 Eigen/src/Geometry/Transform.h   |  6 +--
 Eigen/src/Geometry/Translation.h |  6 ++-
 test/geo_transformations.cpp     | 79 ++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index db5fd07c3..8f6c62d63 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -464,7 +464,7 @@ public:
     operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
 
@@ -578,7 +578,7 @@ public:
     return res;
   }
 
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
 
   template<typename Derived>
   inline Transform& operator=(const RotationBase<Derived,Dim>& r);
@@ -853,7 +853,7 @@ Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &oth
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }
 
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 82d7777f0..b9b9a590c 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -130,8 +130,10 @@ public:
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template<typename Derived>
+  inline typename internal::enable_if<Derived::IsVectorAtCompileTime,VectorType>::type
+  operator* (const MatrixBase<Derived>& vec) const
+  { return m_coeffs + vec.derived(); }
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index 12a9aece1..278e527c2 100644
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -334,6 +334,9 @@ template<typename Scalar, int Mode, int Options> void transformations()
   t0.scale(v0);
   t1 *= AlignedScaling3(v0);
   VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
+  t1 = t1 * v0.asDiagonal();
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
   // transformation * translation
   t0.translate(v0);
   t1 = t1 * Translation3(v0);
@@ -482,6 +485,79 @@ template<typename Scalar, int Mode, int Options> void transformations()
     Rotation2D<Scalar> r2(r1);       // copy ctor
     VERIFY_IS_APPROX(r2.angle(),s0);
   }
+
+  {
+    Transform3 t32(Matrix4::Random()), t33, t34;
+    t34 = t33 = t32;
+    t32.scale(v0);
+    t33*=AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+    t33 = t34 * AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+  }
+
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity_left(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( q*(a1*v), (q*a1)*v );
+  VERIFY_IS_APPROX( q*(a2*v), (q*a2)*v );
+  VERIFY_IS_APPROX( q*(p*h).hnormalized(),  ((q*p)*h).hnormalized() );
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity2(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( a1*(q*v), (a1*q)*v );
+  VERIFY_IS_APPROX( a2*(q*v), (a2*q)*v );
+  VERIFY_IS_APPROX( p *(q*v).homogeneous(), (p *q)*v.homogeneous() );
+
+  transform_associativity_left(a1, a2,p, q, v, h);
+}
+
+template<typename Scalar, int Dim, int Options,typename RotationType>
+void transform_associativity(const RotationType& R)
+{
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  typedef Matrix<Scalar,Dim+1,1> HVectorType;
+  typedef Matrix<Scalar,Dim,Dim> LinearType;
+  typedef Matrix<Scalar,Dim+1,Dim+1> MatrixType;
+  typedef Transform<Scalar,Dim,AffineCompact,Options> AffineCompactType;
+  typedef Transform<Scalar,Dim,Affine,Options> AffineType;
+  typedef Transform<Scalar,Dim,Projective,Options> ProjectiveType;
+  typedef DiagonalMatrix<Scalar,Dim> ScalingType;
+  typedef Translation<Scalar,Dim> TranslationType;
+
+  AffineCompactType A1c; A1c.matrix().setRandom();
+  AffineCompactType A2c; A2c.matrix().setRandom();
+  AffineType A1(A1c);
+  AffineType A2(A2c);
+  ProjectiveType P1; P1.matrix().setRandom();
+  VectorType v1 = VectorType::Random();
+  VectorType v2 = VectorType::Random();
+  HVectorType h1 = HVectorType::Random();
+  Scalar s1 = internal::random<Scalar>();
+  LinearType L = LinearType::Random();
+  MatrixType M = MatrixType::Random();
+
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2c, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, v1.asDiagonal(), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, ScalingType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(s1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, TranslationType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity_left(A1c, A1, P1, L, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, R, v2, h1) );
+
+  VERIFY_IS_APPROX( A1*(M*h1), (A1*M)*h1 );
+  VERIFY_IS_APPROX( A1c*(M*h1), (A1c*M)*h1 );
+  VERIFY_IS_APPROX( P1*(M*h1), (P1*M)*h1 );
+
+  VERIFY_IS_APPROX( M*(A1*h1), (M*A1)*h1 );
+  VERIFY_IS_APPROX( M*(A1c*h1), (M*A1c)*h1 );
+  VERIFY_IS_APPROX( M*(P1*h1),  ((M*P1)*h1) );
 }
 
 template<typename Scalar> void transform_alignment()
@@ -562,5 +638,8 @@ void test_geo_transformations()
 
     CALL_SUBTEST_7(( transform_products<double,3,RowMajor|AutoAlign>() ));
     CALL_SUBTEST_7(( transform_products<float,2,AutoAlign>() ));
+
+    CALL_SUBTEST_8(( transform_associativity<double,2,ColMajor>(Rotation2D<double>(internal::random<double>()*double(EIGEN_PI))) ));
+    CALL_SUBTEST_8(( transform_associativity<double,3,ColMajor>(Quaterniond::UnitRandom()) ));
   }
 }

From fe29157d02ad524fddf36fee92102f74a38cad40 Mon Sep 17 00:00:00 2001
From: Sergiu Deitsch <sergiu.deitsch@gmail.com>
Date: Sun, 25 Sep 2016 14:25:47 +0200
Subject: [PATCH 40/59] disabled MSVC level 4 warning C4714

The level 4 warning (/W4) warns about functions marked as __forceinline not
inlined, and generates a lot of noise.
---
 Eigen/src/Core/util/DisableStupidWarnings.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index b13e5da25..970bbabc7 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -14,12 +14,13 @@
   // 4512 - assignment operator could not be generated
   // 4522 - 'class' : multiple assignment operators specified
   // 4700 - uninitialized local variable 'xyz' used
+  // 4714 - function marked as __forceinline not inlined
   // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
   // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)

From 48dfe98abd00f2cb9b62d157f805f69d01b7892b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sun, 25 Sep 2016 14:54:35 +0200
Subject: [PATCH 41/59] bug #1308: fix compilation of vector *
 rowvector::nullary.

---
 Eigen/src/Core/ProductEvaluators.h |  4 ++--
 test/product_extra.cpp             | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index b8f92a3dc..8a079fed1 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -265,7 +265,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
   for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }
 
 // Row major result
@@ -278,7 +278,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
   for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }
 
 template<typename Lhs, typename Rhs>
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index d253fd7ed..39abe82bb 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -256,6 +256,20 @@ Index compute_block_size()
   return ret;
 }
 
+
+
+template<int>
+void bug_1308()
+{
+  int n = 10;
+  MatrixXd r(n,n);
+  VectorXd v = VectorXd::Random(n);
+  r = v * RowVectorXd::Ones(n);
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
+  r = VectorXd::Ones(n) * v.transpose();
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
+}
+
 void test_product_extra()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -268,8 +282,10 @@ void test_product_extra()
   }
   CALL_SUBTEST_5( bug_127<0>() );
   CALL_SUBTEST_5( bug_817<0>() );
+  CALL_SUBTEST_5( bug_1308<0>() );
   CALL_SUBTEST_6( unaligned_objects<0>() );
   CALL_SUBTEST_7( compute_block_size<float>() );
   CALL_SUBTEST_7( compute_block_size<double>() );
   CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
+
 }

From 6565f8d60fc0cadd1935ab2dee522ec52829e615 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 26 Sep 2016 11:00:32 -0700
Subject: [PATCH 42/59] Made the initialization of a CUDA device thread safe.

---
 unsupported/Eigen/CXX11/Tensor                |  4 +++
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 28 ++++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index f7b94cee1..1d9f89864 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -64,6 +64,10 @@ typedef unsigned __int64 uint64_t;
 #if defined(__CUDACC__)
 #include <curand_kernel.h>
 #endif
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
+#endif
 #endif
 
 #include "src/Tensor/TensorMacros.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 28c6f7626..4f5767bc7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -42,7 +42,21 @@ static bool m_devicePropInitialized = false;
 
 static void initializeDeviceProp() {
   if (!m_devicePropInitialized) {
-    if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutexes don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support C++11 we
+    // can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+    static std::atomic<bool> first(true);
+    if (first.exchange(false)) {
+#else
+    static bool first = true;
+    if (first) {
+      first = false;
+#endif
+      // We're the first thread to reach this point.
       int num_devices;
       cudaError_t status = cudaGetDeviceCount(&num_devices);
       if (status != cudaSuccess) {
@@ -63,7 +77,19 @@ static void initializeDeviceProp() {
           assert(status == cudaSuccess);
         }
       }
+
+#if __cplusplus >= 201103L
+      std::atomic_thread_fence(std::memory_order_release);
+#endif
       m_devicePropInitialized = true;
+    } else {
+      // Wait for the other thread to initialize the properties.
+      while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        sleep(1);
+      }
     }
   }
 }

From 779774f98cea76a0f44b4b53b89e5195582363bd Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 26 Sep 2016 23:53:40 +0200
Subject: [PATCH 43/59] bug #1311: fix alignment logic in some cases of
 (scalar*small).lazyProduct(small)

---
 Eigen/src/Core/ProductEvaluators.h |  4 ++--
 test/product_small.cpp             | 24 +++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 8a079fed1..942e3d832 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -503,8 +503,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
     RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
 
-    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
               : 0,
 
     /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
diff --git a/test/product_small.cpp b/test/product_small.cpp
index 0db50b949..fdfdd9f6c 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -12,6 +12,7 @@
 #include <Eigen/LU>
 
 // regression test for bug 447
+template<int>
 void product1x1()
 {
   Matrix<float,1,3> matAstatic;
@@ -209,6 +210,24 @@ void test_linear_but_not_vectorizable()
   }
 }
 
+template<int Rows>
+void bug_1311()
+{
+  Matrix< double, Rows, 2 > A;  A.setRandom();
+  Vector2d b = Vector2d::Random() ;
+  Matrix<double,Rows,1> res;
+  res.noalias() = 1. * (A * b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = 1.*A * b;
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+}
+
 void test_product_small()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -218,7 +237,7 @@ void test_product_small()
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );
-    CALL_SUBTEST_6( product1x1() );
+    CALL_SUBTEST_6( product1x1<0>() );
 
     CALL_SUBTEST_11( test_lazy_l1<float>() );
     CALL_SUBTEST_12( test_lazy_l2<float>() );
@@ -239,6 +258,9 @@ void test_product_small()
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));
+
+    CALL_SUBTEST_6( bug_1311<3>() );
+    CALL_SUBTEST_6( bug_1311<5>() );
   }
 
 #ifdef EIGEN_TEST_PART_6

From 892afb9416927bb837015fe01c5fd8588bfe49c0 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 26 Sep 2016 23:53:57 +0200
Subject: [PATCH 44/59] Add debug info.

---
 Eigen/src/Core/ProductEvaluators.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 942e3d832..9e1862c5a 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -437,6 +437,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment=            " << Alignment << "\n";
+    std::cerr << "Flags=                " << Flags << "\n";
+#endif
   }
 
   // Everything below here is taken from CoeffBasedProduct.h

From 2bda1b0d93fb627d0c500ec48b20302d44c32cb7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 28 Sep 2016 17:08:41 -0700
Subject: [PATCH 45/59] Updated the tensor sum and mean reducer to enable them
 to process complex numbers on cuda gpus.

---
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   |  6 ++-
 unsupported/test/cxx11_tensor_complex_cuda.cu | 37 +++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 760074622..eddb86597 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -99,7 +99,8 @@ template <typename T> struct SumReducer
   static const bool IsStateful = false;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -145,7 +146,8 @@ template <typename T> struct MeanReducer
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
     scalarCount_++;
   }
   template <typename Packet>
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
index 74befe670..f895efd01 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -71,8 +71,45 @@ void test_cuda_nullary() {
 }
 
 
+static void test_cuda_sum_reductions() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.sum();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
 
 void test_cxx11_tensor_complex()
 {
   CALL_SUBTEST(test_cuda_nullary());
+  CALL_SUBTEST(test_cuda_sum_reductions());
 }

From 27d7628f16d704c8ebc8b834ac13ec27c4cc7a79 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 28 Sep 2016 17:42:59 -0700
Subject: [PATCH 46/59] Updated the list of warnings to reflect the new message
 ids introduced in cuda 8.0

---
 Eigen/src/Core/util/DisableStupidWarnings.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 970bbabc7..7559e129c 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -68,6 +68,8 @@
   #pragma diag_suppress 2669
   #pragma diag_suppress 2670
   #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
 #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED

From 33500050c369bd5ecb4167870a8205619642e0c9 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 29 Sep 2016 09:40:44 +0200
Subject: [PATCH 47/59] bug #1308: fix compilation of some small products
 involving nullary-expressions.

---
 Eigen/src/Core/ProductEvaluators.h |  8 ++++----
 test/product_extra.cpp             | 31 ++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 9e1862c5a..468e531cb 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -602,7 +602,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
   }
 };
 
@@ -612,7 +612,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
   }
 };
 
@@ -621,7 +621,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
   }
 };
 
@@ -630,7 +630,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
 
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index 39abe82bb..e4990ac8c 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -268,6 +268,37 @@ void bug_1308()
   VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
   r = VectorXd::Ones(n) * v.transpose();
   VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
+
+  Matrix4d ones44 = Matrix4d::Ones();
+  Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+  typedef Matrix<double,4,4,RowMajor> RMatrix4d;
+  RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+//   RowVector4d r4;
+  m44.setOnes();
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
 }
 
 void test_product_extra()

From 3860a0bc8f6af25d9115a66db1bdd3d7cad9b99a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 29 Sep 2016 23:23:35 +0200
Subject: [PATCH 48/59] bug #1312: Quaternion to AxisAngle conversion now
 ensures the angle will be in the range [-pi,pi]. This also increases accuracy
 when q.w is negative.

---
 Eigen/src/Geometry/AngleAxis.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 7fdb8ae83..99f3c3a66 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -158,7 +158,8 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The resulting axis is normalized.
+  *
+  * The resulting axis is normalized, and the the computed angle is in the [-pi,pi] range.
   * 
   * This function implicitly normalizes the quaternion \a q.
   */
@@ -167,12 +168,16 @@ template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
   using std::atan2;
+  using std::abs;
   Scalar n = q.vec().norm();
   if(n<NumTraits<Scalar>::epsilon())
     n = q.vec().stableNorm();
-  if (n > Scalar(0))
+
+  if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, q.w());
+    m_angle = Scalar(2)*atan2(n, std::abs(q.w()));
+    if(q.w() < 0)
+      n = -n;
     m_axis  = q.vec() / n;
   }
   else

From 27f3970453889391f21f1a3f3e90b2d98b63b123 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 30 Sep 2016 12:40:41 +0200
Subject: [PATCH 49/59] Remove std:: prefix

---
 Eigen/src/Geometry/AngleAxis.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 99f3c3a66..882df856d 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -175,7 +175,7 @@ AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived
 
   if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, std::abs(q.w()));
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
     if(q.w() < 0)
       n = -n;
     m_axis  = q.vec() / n;

From 67b4f458365a5813e1f5eeedb6ba7a9e157a5dd4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 30 Sep 2016 12:46:33 +0200
Subject: [PATCH 50/59] Fix angle range

---
 Eigen/src/Geometry/AngleAxis.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 882df856d..571062d00 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -159,7 +159,7 @@ typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
   *
-  * The resulting axis is normalized, and the the computed angle is in the [-pi,pi] range.
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   * 
   * This function implicitly normalizes the quaternion \a q.
   */

From 8b84801f7f5e66d4a81f10d75ff70a3526d2fefc Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 30 Sep 2016 22:49:59 +0200
Subject: [PATCH 51/59] bug #1310: workaround a compilation regression from 3.2
 regarding triangular * homogeneous

---
 Eigen/src/Geometry/Homogeneous.h | 12 ++++++++++++
 test/geo_homogeneous.cpp         |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 4e2213b33..a23068c8d 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -402,6 +402,18 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, Homog
   }
 };
 
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
+
 template<typename Lhs,typename Rhs>
 struct homogeneous_left_product_refactoring_helper
 {
diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp
index 305794cdf..2187c7bf9 100644
--- a/test/geo_homogeneous.cpp
+++ b/test/geo_homogeneous.cpp
@@ -111,6 +111,8 @@ template<typename Scalar,int Size> void homogeneous(void)
   
   VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
   VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );
+
+  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
 }
 
 void test_geo_homogeneous()

From 9d6d0dff8f0c1e8630996c3a4867ff0599566b33 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 1 Oct 2016 15:37:00 +0200
Subject: [PATCH 52/59] bug #1317: fix performance regression with some Block
 expressions and clang by helping it to remove dead code. The trick is to get
 rid of the nested expression in the evaluator by copying only the required
 information (here, the strides).

---
 Eigen/src/Core/CoreEvaluators.h | 52 ++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 7a5540593..00c079bd8 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -817,73 +817,79 @@ struct mapbase_evaluator : evaluator_base<Derived>
     ColsAtCompileTime = XprType::ColsAtCompileTime,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
-  
+
   EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
-    : m_data(const_cast<PointerType>(map.data())),  
-      m_xpr(map)
+    : m_data(const_cast<PointerType>(map.data())),
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
   {
     EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
- 
+
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index row, Index col) const 
+  PacketType packet(Index row, Index col) const
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::ploadt<PacketType, LoadMode>(ptr);
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index index) const 
+  PacketType packet(Index index) const
   {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index row, Index col, const PacketType& x) 
+  void writePacket(Index row, Index col, const PacketType& x)
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketType& x) 
+  void writePacket(Index index, const PacketType& x)
   {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
- 
 protected:
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+
   PointerType m_data;
-  const XprType& m_xpr;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
 template<typename PlainObjectType, int MapOptions, typename StrideType> 

From 409e887d785012afdc4a4e661b9b78e8990e2623 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 3 Oct 2016 11:06:24 -0700
Subject: [PATCH 53/59] Added support for constant std::complex numbers on GPU

---
 Eigen/Core                         |  5 ++++-
 Eigen/src/Core/arch/CUDA/Complex.h | 31 ++++++++++++++++++++++--------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index bf2479585..fcc107721 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -359,7 +359,6 @@ using std::ptrdiff_t;
   #include "src/Core/arch/ZVector/Complex.h"
 #endif
 
-#include "src/Core/arch/CUDA/Complex.h"
 // Half float support
 #include "src/Core/arch/CUDA/Half.h"
 #include "src/Core/arch/CUDA/PacketMathHalf.h"
@@ -379,6 +378,10 @@ using std::ptrdiff_t;
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"
 
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#include "src/Core/arch/CUDA/Complex.h"
+
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index f133b2db9..9c2536509 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -24,34 +24,43 @@ namespace internal {
 // compile. Here, we manually specialize these functors for complex types when
 // building for CUDA to avoid non-constexpr methods.
 
-template<typename T> struct scalar_sum_op<std::complex<T>> {
+// Sum
+template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
   typedef typename std::complex<T> result_type;
 
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     return std::complex<T>(numext::real(a) + numext::real(b),
                            numext::imag(a) + numext::imag(b));
   }
 };
 
-template<typename T> struct scalar_difference_op<std::complex<T>> {
+template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Difference
+template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
   typedef typename std::complex<T> result_type;
 
   EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     return std::complex<T>(numext::real(a) - numext::real(b),
                            numext::imag(a) - numext::imag(b));
   }
 };
 
-template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T>> {
+template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Product
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
   enum {
     Vectorizable = packet_traits<std::complex<T>>::HasMul
   };
   typedef typename std::complex<T> result_type;
 
   EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     const T a_real = numext::real(a);
     const T a_imag = numext::imag(a);
     const T b_real = numext::real(b);
@@ -61,14 +70,18 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T>>
   }
 };
 
-template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T>> {
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Quotient
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
   enum {
     Vectorizable = packet_traits<std::complex<T>>::HasDiv
   };
   typedef typename std::complex<T> result_type;
 
   EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
     const T a_real = numext::real(a);
     const T a_imag = numext::imag(a);
     const T b_real = numext::real(b);
@@ -79,6 +92,8 @@ template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T>>
   }
 };
 
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+
 #endif
 
 } // end namespace internal

From 616a7a19129dfe1b64a4e578cd35e84861d8046b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Mon, 3 Oct 2016 17:09:33 -0700
Subject: [PATCH 54/59] Improved support for compiling CUDA code with clang as
 the host compiler

---
 test/CMakeLists.txt             | 2 +-
 unsupported/test/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 926b284e6..e17985107 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -355,7 +355,7 @@ if(CUDA_FOUND)
   
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 
-    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 9eac6ec73..a1823beaa 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -203,7 +203,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
 
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")

From 881b90e98483c8ec8dcb900efb2c681854e46d48 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 4 Oct 2016 08:23:38 -0700
Subject: [PATCH 55/59] Use explicit type casting to generate packets of zeros.

---
 Eigen/src/Core/ProductEvaluators.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 468e531cb..63faca822 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -639,7 +639,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -648,7 +648,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -657,7 +657,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
   }
@@ -668,7 +668,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }

From 2f6d1607c84bd828e77a44465e0dccfd3524d7a6 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 4 Oct 2016 08:38:23 -0700
Subject: [PATCH 56/59] Cleaned up the random number generation code.

---
 unsupported/Eigen/CXX11/Tensor                |   4 +-
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   | 442 ------------------
 .../Eigen/CXX11/src/Tensor/TensorRandom.h     | 276 +++++++++++
 3 files changed, 277 insertions(+), 445 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 1d9f89864..4976a1254 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -61,9 +61,6 @@ typedef unsigned __int64 uint64_t;
 #ifdef EIGEN_USE_GPU
 #include <iostream>
 #include <cuda_runtime.h>
-#if defined(__CUDACC__)
-#include <curand_kernel.h>
-#endif
 #if __cplusplus >= 201103L
 #include <atomic>
 #include <unistd.h>
@@ -83,6 +80,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorDimensions.h"
 #include "src/Tensor/TensorInitializer.h"
 #include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
 #include "src/Tensor/TensorUInt128.h"
 #include "src/Tensor/TensorIntDiv.h"
 #include "src/Tensor/TensorGlobalFunctions.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index eddb86597..7164e8d60 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -441,448 +441,6 @@ struct reducer_traits<ArgMinTupleReducer<T>, Device> {
 };
 
 
-// Random number generation
-namespace {
-#ifdef __CUDA_ARCH__
-__device__ int get_random_seed() {
-    return clock();
-}
-#else
-static inline int get_random_seed() {
-#ifdef _WIN32
-    SYSTEMTIME st;
-    GetSystemTime(&st);
-    return st.wSecond + 1000 * st.wMilliseconds;
-#elif defined __APPLE__
-    return static_cast<int>(mach_absolute_time());
-#else
-    timespec ts;
-    clock_gettime(CLOCK_REALTIME, &ts);
-    return static_cast<int>(ts.tv_nsec);
-#endif
-}
-#endif
-}
-
-#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
-// We're not compiling a cuda kernel
-template <typename T> class UniformRandomGenerator {
-
- public:
-  static const bool PacketAccess = true;
-
-  UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    if (!deterministic) {
-      srand(get_random_seed());
-    }
-  }
-  UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-  }
-
-  T operator()() const {
-    return random<T>();
-  }
-  template<typename PacketType>
-  PacketType packetOp() const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX T values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = random<T>();
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  bool m_deterministic;
-};
-
-#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
-template <> class UniformRandomGenerator<float> {
- public:
-  static const bool PacketAccess = true;
-
-  UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
-    if (!deterministic) {
-      m_generator->seed(get_random_seed());
-    }
-  }
-  UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
-    m_generator = new std::mt19937();
-    m_generator->seed(other() * UINT_MAX);
-    m_deterministic = other.m_deterministic;
-  }
-  ~UniformRandomGenerator() {
-    delete m_generator;
-  }
-
-  float operator()() const {
-    return m_distribution(*m_generator);
-  }
-  template<typename PacketType>
-  PacketType packetOp() const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX float values[packetSize];
-    for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()();
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  UniformRandomGenerator& operator = (const UniformRandomGenerator&);
-  // Make sure m_deterministic comes first to match the layout of the cpu
-  // version of the code.
-  bool m_deterministic;
-  std::mt19937* m_generator;
-  mutable std::uniform_real_distribution<float> m_distribution;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
-  static const bool PacketAccess = true;
-
-  UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
-    if (!deterministic) {
-      m_generator->seed(get_random_seed());
-    }
-  }
-  UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
-    m_generator = new std::mt19937();
-    m_generator->seed(other() * UINT_MAX);
-    m_deterministic = other.m_deterministic;
-  }
-  ~UniformRandomGenerator() {
-    delete m_generator;
-  }
-
-  double operator()() const {
-    return m_distribution(*m_generator);
-  }
-  template<typename PacketType>
-  PacketType packetOp() const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX double values[packetSize];
-    for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()();
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  UniformRandomGenerator& operator = (const UniformRandomGenerator&);
-  // Make sure m_deterministic comes first to match the layout of the cpu
-  // version of the code.
-  bool m_deterministic;
-  std::mt19937* m_generator;
-  mutable std::uniform_real_distribution<double> m_distribution;
-};
-#endif
-
-#else
-
-// We're compiling a cuda kernel
-template <typename T> class UniformRandomGenerator;
-
-template <> class UniformRandomGenerator<float> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-     curand_init(seed, tid, 0, &m_state);
-  }
-
-  __device__ float operator()() const {
-    return curand_uniform(&m_state);
-  }
-  template<typename PacketType>
-  __device__ float4 packetOp() const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_uniform4(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ double operator()() const {
-    return curand_uniform_double(&m_state);
-  }
-  template<typename PacketType>
-  __device__ double2 packetOp() const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_uniform2_double(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<float> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ std::complex<float> operator()() const {
-    float4 vals = curand_uniform4(&m_state);
-    return std::complex<float>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<double> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ std::complex<double> operator()() const {
-    double2 vals = curand_uniform2_double(&m_state);
-    return std::complex<double>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<UniformRandomGenerator<Scalar> > {
-  enum {
-    // Rough estimate.
-    Cost = 100 * NumTraits<Scalar>::MulCost,
-    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
-  };
-};
-
-
-
-#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
-// We're not compiling a cuda kernel
-template <typename T> class NormalRandomGenerator {
- public:
-  static const bool PacketAccess = true;
-
-  NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
-    if (!deterministic) {
-      m_generator->seed(get_random_seed());
-    }
-  }
-  NormalRandomGenerator(const NormalRandomGenerator& other)
-      : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
-    m_generator->seed(other() * UINT_MAX);
-  }
-  ~NormalRandomGenerator() {
-    delete m_generator;
-  }
-  T operator()() const {
-    return m_distribution(*m_generator);
-  }
-  template<typename PacketType>
-  PacketType packetOp() const {
-    const int packetSize = internal::unpacket_traits<PacketType>::size;
-    EIGEN_ALIGN_MAX T values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = m_distribution(*m_generator);
-    }
-    return internal::pload<PacketType>(values);
-  }
-
- private:
-  // No assignment
-  NormalRandomGenerator& operator = (const NormalRandomGenerator&);
-
-  bool m_deterministic;
-  mutable std::normal_distribution<T> m_distribution;
-  std::mt19937* m_generator;
-};
-
-#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
-
-// We're compiling a cuda kernel
-template <typename T> class NormalRandomGenerator;
-
-template <> class NormalRandomGenerator<float> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ float operator()() const {
-    return curand_normal(&m_state);
-  }
-  template<typename PacketType>
-   __device__ float4 packetOp() const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_normal4(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<double> {
- public:
-  static const bool PacketAccess = true;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ double operator()() const {
-    return curand_normal_double(&m_state);
-  }
-  template<typename PacketType>
-  __device__ double2 packetOp() const {
-    EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return curand_normal2_double(&m_state);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<float> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ std::complex<float> operator()() const {
-    float4 vals = curand_normal4(&m_state);
-    return std::complex<float>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<double> > {
- public:
-  static const bool PacketAccess = false;
-
-  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
-    m_deterministic = other.m_deterministic;
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int seed = m_deterministic ? 0 : get_random_seed();
-    curand_init(seed, tid, 0, &m_state);
-  }
-  __device__ std::complex<double> operator()() const {
-    double2 vals = curand_normal2_double(&m_state);
-    return std::complex<double>(vals.x, vals.y);
-  }
-
- private:
-  bool m_deterministic;
-  mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#else
-
-template <typename T> class NormalRandomGenerator {
- public:
-  static const bool PacketAccess = false;
-  NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
-
- private:
-  bool m_deterministic;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<NormalRandomGenerator<Scalar> > {
-  enum {
-    // Rough estimate.
-    Cost = 100 * NumTraits<Scalar>::MulCost,
-    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
-  };
-};
-
-
 template <typename T, typename Index, size_t NumDims>
 class GaussianGenerator {
  public:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 000000000..9b16e68f5
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+// Builds a seed from a platform clock plus the thread index (CUDA) or a rand()/random() draw.
+EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
+#ifdef __CUDA_ARCH__
+  // We don't support 3d kernels since we currently only use 1d and 2d kernels.
+  assert(threadIdx.z == 0);
+  return clock64() +
+      blockIdx.x * blockDim.x + threadIdx.x +
+      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+
+#elif defined _WIN32
+  // Use the current time as a baseline.
+  SYSTEMTIME st;  // has to be declared before GetSystemTime() can fill it in
+  GetSystemTime(&st);
+  int time = st.wSecond + 1000 * st.wMilliseconds;
+  // Mix in a random number to make sure that we get different seeds if
+  // we try to generate seeds faster than the clock resolution.
+  // We need 2 random values since the generator only generates 16 bits at
+  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
+  unsigned rnd1 = ::rand();
+  unsigned rnd2 = ::rand();
+  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
+  return rnd;
+
+#elif defined __APPLE__
+  // Same approach as for win32, except that the random number generator
+  // is better (https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
+  uint64_t rnd = ::random() ^ mach_absolute_time();
+  return rnd;
+
+#else
+  // Augment the current time with pseudo random number generation
+  // to ensure that we get different seeds if we try to generate seeds
+  // faster than the clock resolution.
+  timespec ts;
+  clock_gettime(CLOCK_REALTIME, &ts);
+  uint64_t rnd = ::random() ^ ts.tv_nsec;
+  return rnd;
+#endif
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+  // TODO: Unify with the implementation in the non blocking thread pool.
+  uint64_t current = *state;  // snapshot: the output below is computed from this pre-update value
+  // Update the internal state
+  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+  // Generate the random output (using the PCG-XSH-RS scheme)
+  return (current ^ (current >> 22)) >> (22 + (current >> 61));
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+  // A zero seed is replaced by get_random_seed(); the chosen seed then gets one multiply-add step.
+  return (seed ? seed : get_random_seed()) * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+}  // namespace
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeUniform(uint64_t* state) {
+  // Generic fallback: take one generator output and convert it to T as is.
+  return static_cast<T>(PCG_XSH_RS_generator(state));
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+  // Keep the low 10 generator bits for the mantissa field.
+  const unsigned mantissa_bits = PCG_XSH_RS_generator(state) & 0x3ffu;
+  // The exponent field starts at bit 10 and is set to 15.
+  const uint16_t exponent_bits = static_cast<uint16_t>(15) << 10;
+  Eigen::half result;
+  result.x = static_cast<uint16_t>(mantissa_bits) | exponent_bits;
+  // Subtract one from the assembled value (presumably mapping [1, 2) onto [0, 1)).
+  return result - Eigen::half(1.0f);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float RandomToTypeUniform<float>(uint64_t* state) {
+  typedef union {
+    uint32_t raw;
+    float fp;
+  } internal;
+  internal result;
+  // Generate 23 random bits for the mantissa
+  const unsigned rnd = PCG_XSH_RS_generator(state);
+  result.raw = rnd & 0x7fffffu;
+  // Set the exponent field to 127; on IEEE-754 binary32 this puts the value in [1, 2)
+  result.raw |= (static_cast<uint32_t>(127) << 23);
+  // Read the bits back as a float through the union and subtract one
+  return result.fp - 1.0f;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double RandomToTypeUniform<double>(uint64_t* state) {
+  typedef union {
+    uint64_t raw;
+    double dp;
+  } internal;
+  internal result;
+  result.raw = 0;
+  // Generate 52 random bits for the mantissa
+  // First generate the upper 20 bits
+  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+  // Then generate the lower 32 bits
+  unsigned rnd2 = PCG_XSH_RS_generator(state);
+  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+  // Set the exponent field to 1023; on IEEE-754 binary64 this puts the value in [1, 2)
+  result.raw |= (static_cast<uint64_t>(1023) << 52);
+  // Read the bits back as a double through the union and subtract one
+  return result.dp - 1.0;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeUniform<float>(state),   // real part
+                             RandomToTypeUniform<float>(state));  // imaginary part, drawn from the same state
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeUniform<double>(state),   // real part
+                              RandomToTypeUniform<double>(state));  // imaginary part, drawn from the same state
+}
+
+// Functor producing values via RandomToTypeUniform<T> from a 64-bit state it carries.
+template <typename T> class UniformRandomGenerator {
+ public:
+  static const bool PacketAccess = true;
+
+  // A non-zero "seed" is used as given; zero lets PCG_XSH_RS_state pick a random one.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0)
+      : m_state(PCG_XSH_RS_state(seed)) {}
+  // A copy starts out with the same state as its source.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+      const UniformRandomGenerator& other) : m_state(other.m_state) {}
+
+  // Draws one value; the index is added to the stored state before drawing.
+  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T operator()(Index i) const {
+    uint64_t cursor = m_state + i;
+    T value = RandomToTypeUniform<T>(&cursor);
+    m_state = cursor;  // keep the advanced state for the next call
+    return value;
+  }
+
+  // Draws unpacket_traits<Packet>::size values back to back and loads them as one packet.
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+    uint64_t cursor = m_state + i;
+    for (int lane = 0; lane < packetSize; ++lane) {
+      values[lane] = RandomToTypeUniform<T>(&cursor);
+    }
+    m_state = cursor;
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+  enum {
+    // Rough estimate for floating point, multiplied by ceil(sizeof(Scalar) / sizeof(float)).
+    Cost = 12 * NumTraits<Scalar>::AddCost *
+           ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess  // forwarded from the generator class
+  };
+};
+
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeNormal(uint64_t* state) {
+  // Ratio-of-uniforms method for normally distributed numbers (see for example
+  // Numerical Recipes chapter 7.3.9). A draw of u == 0 is rejected up front:
+  // it would otherwise reach log(0) and the final division by u.
+  T u, v, q;
+  do {
+    u = RandomToTypeUniform<T>(state);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+    const T x = u - T(0.449871);
+    const T y = numext::abs(v) + T(0.386595);
+    q = x*x + y * (T(0.196)*y - T(0.25472)*x);
+  } while (u == T(0) || (q > T(0.27597) &&
+           (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)));
+
+  return v/u;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeNormal<float>(state),   // real part
+                             RandomToTypeNormal<float>(state));  // imaginary part, drawn from the same state
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeNormal<double>(state),   // real part
+                              RandomToTypeNormal<double>(state));  // imaginary part, drawn from the same state
+}
+
+
+// Functor producing values via RandomToTypeNormal<T> from a 64-bit state it carries.
+template <typename T> class NormalRandomGenerator {
+ public:
+  static const bool PacketAccess = true;
+
+  // A non-zero "seed" is used as given; zero lets PCG_XSH_RS_state pick a random one.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0)
+      : m_state(PCG_XSH_RS_state(seed)) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
+      const NormalRandomGenerator& other) : m_state(other.m_state) {}
+
+  // Draws one value; the index is added to the stored state before drawing.
+  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  T operator()(Index i) const {
+    uint64_t cursor = m_state + i;
+    T sample = RandomToTypeNormal<T>(&cursor);
+    m_state = cursor;  // keep the advanced state for the next call
+    return sample;
+  }
+
+  // Draws unpacket_traits<Packet>::size values back to back and loads them as one packet.
+  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+    uint64_t cursor = m_state + i;
+    for (int lane = 0; lane < packetSize; ++lane) {
+      values[lane] = RandomToTypeNormal<T>(&cursor);
+    }
+    m_state = cursor;
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+};
+
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+  enum {
+    // Rough estimate: on average, we need to generate about 3 random numbers,
+    // plus 15 mul, 8 add, 1.5 logs (written as 3 * log cost / 2).
+    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
+           15 * NumTraits<Scalar>::MulCost + 8 * NumTraits<Scalar>::AddCost +
+           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

From 6af5ac7e2749bdea7a31323855ef3b4333b91c3e Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 4 Oct 2016 08:52:13 -0700
Subject: [PATCH 57/59] Cleanup the cuda executor code.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index a116bf17f..0cac7b179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -234,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
   const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const Index step_size = blockDim.x * gridDim.x;
 
-  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
-  // complex types such as evaluators we should really conform to the C++
-  // standard and call a proper copy constructor.
-  Evaluator eval(memcopied_eval);
-
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
   EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
 }

From 698ff694504b5938e56268707d275cffd122d331 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 4 Oct 2016 16:53:30 -0700
Subject: [PATCH 58/59] Properly characterize the CUDA packet primitives for
 fp16 as device only

---
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 70 +++++++++++------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 84ddcea2a..82dfc12c9 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -41,15 +41,15 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
   return *reinterpret_cast<const half2*>(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[1]);
 }
 
@@ -57,17 +57,17 @@ template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
   return __halves2half2(from[0], from[0]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 
 template<>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
    return __ldg((const half2*)from);
 #else
@@ -76,7 +76,7 @@ template<>
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
    return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,27 +84,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Ei
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
   return __halves2half2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
   to[stride*0] = __low2half(from);
   to[stride*1] = __high2half(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
   return __low2half(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
   result.x = a.x & 0x7FFF7FFF;
   return result;
 }
 
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+__device__ EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
   __half a1 = __low2half(kernel.packet[0]);
   __half a2 = __high2half(kernel.packet[0]);
@@ -114,7 +114,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -123,7 +123,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
 #else
@@ -137,7 +137,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
 #else
@@ -151,7 +151,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hneg2(a);
 #else
@@ -161,9 +161,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
 #else
@@ -177,7 +177,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
    return __hfma2(a, b, c);
 #else
@@ -193,7 +193,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -203,7 +203,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -213,7 +213,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -223,7 +223,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
 #else
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -245,7 +245,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -257,7 +257,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
 #else
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = log1pf(a1);
@@ -277,29 +277,29 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2
 
 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
 
-template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<>  __device__ EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
   return h2log(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
   return h2exp(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
   return h2sqrt(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
   return h2rsqrt(a);
 }
 
 #else
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = logf(a1);
@@ -307,7 +307,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = expf(a1);
@@ -315,7 +315,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = sqrtf(a1);
@@ -323,7 +323,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = rsqrtf(a1);

From ceee1c008b6d618a48846283e1f18ba1b4cc171a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 4 Oct 2016 18:47:53 -0700
Subject: [PATCH 59/59] Silenced compilation warning

---
 unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 9b16e68f5..dd369fb35 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -61,7 +61,7 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint6
   // Update the internal state
   *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
   // Generate the random output (using the PCG-XSH-RS scheme)
-  return (current ^ (current >> 22)) >> (22 + (current >> 61));
+  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
 }
 
 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {