Pull the latest updates from trunk

Benoit Steiner 2016-10-05 14:54:36 -07:00
commit ae1385c7e4
59 changed files with 1930 additions and 1250 deletions

View File

@@ -411,6 +411,10 @@ using std::ptrdiff_t;
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#include "src/Core/arch/CUDA/Complex.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"

View File

@@ -87,6 +87,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 # include "../plugins/CommonCwiseUnaryOps.h"
 # include "../plugins/MatrixCwiseUnaryOps.h"
 # include "../plugins/ArrayCwiseUnaryOps.h"
@@ -97,6 +98,7 @@ template<typename Derived> class ArrayBase
 # include EIGEN_ARRAYBASE_PLUGIN
 # endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 /** Special case of the template operator=, in order to prevent the compiler
   * from generating a default operator= (issue hit with g++ 4.1)

View File

@@ -820,7 +820,8 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
     : m_data(const_cast<PointerType>(map.data())),
-      m_xpr(map)
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
   {
     EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
@@ -830,32 +831,32 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::ploadt<PacketType, LoadMode>(ptr);
   }
@@ -863,14 +864,14 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index row, Index col, const PacketType& x)
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
@@ -878,12 +879,17 @@ struct mapbase_evaluator : evaluator_base<Derived>
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
 protected:
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
   PointerType m_data;
-  const XprType& m_xpr;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 template<typename PlainObjectType, int MapOptions, typename StrideType>
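Note: the evaluator now caches the inner and outer strides in internal::variable_if_dynamic members instead of keeping a reference to the whole Map expression. A minimal stand-alone sketch of the idea behind that helper (hypothetical names, not Eigen's actual implementation): when the value is fixed at compile time nothing is stored and value() folds to a constant; only the Dynamic case pays for a data member.

    #include <cassert>

    enum { Dynamic = -1 };

    // Compile-time case: no data member, value() is a constant.
    template <typename T, int Value>
    struct variable_if_dynamic_sketch {
      explicit variable_if_dynamic_sketch(T v) { assert(v == T(Value)); (void)v; }
      static T value() { return T(Value); }
    };

    // Runtime (Dynamic) case: the value is actually stored.
    template <typename T>
    struct variable_if_dynamic_sketch<T, Dynamic> {
      explicit variable_if_dynamic_sketch(T v) : m_value(v) {}
      T value() const { return m_value; }
      T m_value;
    };

    int main() {
      variable_if_dynamic_sketch<long, 1> fixed(1);      // stride known statically
      variable_if_dynamic_sketch<long, Dynamic> dyn(7);  // stride known at runtime
      return int(fixed.value() + dyn.value()) == 8 ? 0 : 1;
    }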

View File

@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
   * and yields faster code than the random access version.
   *
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -389,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
   * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
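The corrected comment is easy to check against LinSpaced itself; a small host-side example of the documented semantics (sequential values a(0) .. a(size-1), and the size==1 special case returning 'high'):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd a = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);
      std::cout << a.transpose() << "\n";  // 0 0.25 0.5 0.75 1
      Eigen::VectorXd b = Eigen::VectorXd::LinSpaced(1, 0.0, 1.0);
      std::cout << b << "\n";              // size 1 => contains 'high', i.e. 1
      return 0;
    }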

View File

@@ -558,12 +558,15 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC void reverseInPlace();
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 # include "../plugins/BlockMethods.h"
 # ifdef EIGEN_DENSEBASE_PLUGIN
 # include EIGEN_DENSEBASE_PLUGIN
 # endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 // disable the use of evalTo for dense objects with a nice compilation error
 template<typename Dest>

View File

@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
   typedef internal::blas_traits<Rhs> RhsBlasTraits;
   typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+  typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
   ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
   ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
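Note on the fallback branch above: when the stack cannot be aligned, the buffer is over-allocated and the returned pointer is rounded to the next EIGEN_MAX_ALIGN_BYTES boundary, exactly as in data(). A self-contained sketch of that pointer arithmetic, with ALIGN standing in for EIGEN_MAX_ALIGN_BYTES:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const std::uintptr_t ALIGN = 16;  // must be a power of two
      unsigned char raw[64 + ALIGN];    // extra bytes so an aligned address fits
      // Round down to a multiple of ALIGN, then step past the original address:
      unsigned char* aligned = reinterpret_cast<unsigned char*>(
          (reinterpret_cast<std::uintptr_t>(raw) & ~(ALIGN - 1)) + ALIGN);
      std::printf("offset into raw: %d, misalignment: %d\n",
                  int(aligned - raw),
                  int(reinterpret_cast<std::uintptr_t>(aligned) % ALIGN));  // misalignment is 0
      return 0;
    }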

View File

@@ -50,7 +50,7 @@ public:
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}

View File

@@ -97,6 +97,19 @@ struct real_default_impl<Scalar,true>
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
 template<typename Scalar>
 struct real_retval
 {
@@ -132,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
 template<typename Scalar>
 struct imag_retval
 {
@@ -1049,12 +1075,12 @@ double abs(const double &x) { return ::fabs(x); }
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const std::complex<float>& x) {
-  return ::hypotf(real(x), imag(x));
+  return ::hypotf(x.real(), x.imag());
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const std::complex<double>& x) {
-  return ::hypot(real(x), imag(x));
+  return ::hypot(x.real(), x.imag());
 }
 #endif
@@ -1312,11 +1338,12 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
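The abs overloads above now call hypot on the real and imaginary parts directly; a quick host-side check that this agrees with std::abs on complex numbers:

    #include <cassert>
    #include <cmath>
    #include <complex>

    int main() {
      std::complex<double> z(3.0, -4.0);
      // |3 - 4i| = 5, whether computed via hypot or std::abs
      assert(std::abs(std::hypot(z.real(), z.imag()) - std::abs(z)) < 1e-12);
      return 0;
    }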

View File

@@ -29,8 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  const T x = pmax(minus_9, pmin(plus_9, a_x));
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  // step such that if a_x is nan, x will be either 9 or -9,
+  // and tanh will return 1 or -1 instead of nan.
+  // This is supposed to be fixed in gcc6.3,
+  // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
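The new comment is about NaN propagation through the clamp. A scalar illustration of the failure mode it describes: fmin/fmax are specified to return the non-NaN operand, so a NaN input leaves the clamp as +/-9 and tanh then produces +/-1 instead of NaN. The packet code relies on pmax/pmin propagating the NaN here, which the cited GCC bug broke.

    #include <cmath>
    #include <cstdio>

    int main() {
      float nan = std::nanf("");
      // fmin(9, NaN) == 9 and fmax(-9, 9) == 9: the NaN silently disappears.
      float clamped = std::fmax(-9.f, std::fmin(9.f, nan));
      std::printf("clamped = %f, tanh = %f\n", clamped, std::tanh(clamped));
      return 0;
    }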

View File

@@ -98,7 +98,7 @@ template<typename Derived> class MatrixBase
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
     EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
     typedef typename Base::PlainObject PlainObject;
@@ -121,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 # include "../plugins/CommonCwiseUnaryOps.h"
 # include "../plugins/CommonCwiseBinaryOps.h"
 # include "../plugins/MatrixCwiseUnaryOps.h"
@@ -129,6 +130,7 @@ template<typename Derived> class MatrixBase
 # include EIGEN_MATRIXBASE_PLUGIN
 # endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -328,15 +330,11 @@ template<typename Derived> class MatrixBase
 /////////// LU module ///////////
-    EIGEN_DEVICE_FUNC
     inline const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> partialPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> lu() const;
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
     template<typename ResultType>
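diagonalSize() returns min(rows(), cols()); numext::mini is the device-compatible stand-in for std::min, which cannot be called from CUDA device code. Usage is unchanged:

    #include <Eigen/Dense>
    #include <cassert>

    int main() {
      Eigen::MatrixXd m(4, 7);
      assert(m.diagonalSize() == 4);     // min(4, 7)
      assert(m.diagonal().size() == 4);  // the main diagonal has that length
      return 0;
    }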

View File

@@ -265,7 +265,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
   for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }
 // Row major result
@@ -278,7 +278,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
   for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }
 template<typename Lhs, typename Rhs>
@@ -437,6 +437,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes= " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes= " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment= " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment= " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs= " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs= " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner= " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor= " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment= " << Alignment << "\n";
+    std::cerr << "Flags= " << Flags << "\n";
+#endif
   }
   // Everything below here is taken from CoeffBasedProduct.h
@@ -503,8 +515,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
     RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
-    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
               : 0,
     /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
@@ -590,7 +602,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
   }
 };
@@ -600,7 +612,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res = pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
   }
 };
@@ -609,7 +621,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
   }
 };
@@ -618,7 +630,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
@@ -627,7 +639,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
@@ -636,7 +648,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
@@ -645,7 +657,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
   }
@@ -656,7 +668,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }

View File

@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+// clang-format off
+namespace Eigen {
+namespace internal {
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+// Sum
+template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
+// Difference
+template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
+// Product
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasMul
+  };
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+// Quotient
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+  };
+  typedef typename std::complex<T> result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+#endif
+} // end namespace internal
+} // end namespace Eigen
+#endif // EIGEN_COMPLEX_CUDA_H
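These functors expand the complex arithmetic into real operations so no non-constexpr std::complex operator is called in device code. A host-side sanity check that the manual product expansion matches operator*:

    #include <cassert>
    #include <complex>

    int main() {
      std::complex<float> a(1.f, 2.f), b(3.f, -4.f);
      // (a_re + a_im i)(b_re + b_im i), expanded as in scalar_product_op above:
      float re = a.real()*b.real() - a.imag()*b.imag();  // 11
      float im = a.real()*b.imag() + a.imag()*b.real();  // 2
      assert(std::complex<float>(re, im) == a * b);
      return 0;
    }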

View File

@@ -41,15 +41,15 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
   return *reinterpret_cast<const half2*>(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[1]);
 }
@@ -57,17 +57,17 @@ template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[0]);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __ldg((const half2*)from);
 #else
@@ -76,7 +76,7 @@ template<>
 }
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,27 +84,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Ei
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
   return __halves2half2(from[0*stride], from[1*stride]);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
   to[stride*0] = __low2half(from);
   to[stride*1] = __high2half(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
   return __low2half(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
   result.x = a.x & 0x7FFF7FFF;
   return result;
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+__device__ EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
   __half a1 = __low2half(kernel.packet[0]);
   __half a2 = __high2half(kernel.packet[0]);
@@ -114,7 +114,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -123,7 +123,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
 #else
@@ -137,7 +137,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
 #else
@@ -151,7 +151,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hneg2(a);
 #else
@@ -161,9 +161,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
 #else
@@ -177,7 +177,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
@@ -193,7 +193,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2&
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -203,7 +203,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -213,7 +213,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
   return __halves2half2(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -223,7 +223,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
   return __halves2half2(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
 #else
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -245,7 +245,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(c
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -257,7 +257,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(c
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
 #else
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(c
 #endif
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = log1pf(a1);
@@ -277,29 +277,29 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2
 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
   return h2log(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
   return h2exp(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
   return h2sqrt(a);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
   return h2rsqrt(a);
 }
 #else
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = logf(a1);
@@ -307,7 +307,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = expf(a1);
@@ -315,7 +315,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = sqrtf(a1);
@@ -323,7 +323,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = rsqrtf(a1);

View File

@@ -434,15 +434,16 @@ public:
   template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
     // let gcc allocate the register in which to store the result of the pmul
     // (in the case where there is no FMA) gcc fails to figure out how to avoid
     // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
   }
@@ -457,9 +458,6 @@ public:
     r = pmadd(c,alpha,r);
   }
-protected:
-  // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-  // conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 template<typename RealScalar, bool _ConjLhs>

View File

@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
 {
   typedef typename Dest::Scalar ResScalar;
   typedef typename Rhs::Scalar RhsScalar;
-  typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+  typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
   eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());

View File

@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
   typedef internal::blas_traits<Rhs> RhsBlasTraits;
   typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-  typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+  typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
   typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
   typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

View File

@@ -14,12 +14,13 @@
 // 4512 - assignment operator could not be generated
 // 4522 - 'class' : multiple assignment operators specified
 // 4700 - uninitialized local variable 'xyz' used
+// 4714 - function marked as __forceinline not inlined
 // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
 // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
 #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
 #pragma warning( push )
 #endif
-#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 #elif defined __INTEL_COMPILER
 // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -67,6 +68,8 @@
   #pragma diag_suppress 2669
   #pragma diag_suppress 2670
   #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
   #endif
 #endif // not EIGEN_WARNINGS_DISABLED

View File

@@ -13,7 +13,7 @@
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 93
+#define EIGEN_MINOR_VERSION 94
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -954,8 +954,8 @@ namespace Eigen {
 # define EIGEN_CATCH(X) catch (X)
 #else
 # ifdef __CUDA_ARCH__
-# define EIGEN_THROW_X(X) asm("trap;") return {}
-# define EIGEN_THROW asm("trap;"); return {}
+# define EIGEN_THROW_X(X) asm("trap;")
+# define EIGEN_THROW asm("trap;")
 # else
 # define EIGEN_THROW_X(X) std::abort()
 # define EIGEN_THROW std::abort()

View File

@@ -275,6 +275,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
     destruct_elements_of_array(ptr, i);
     EIGEN_THROW;
   }
+  return NULL;
 }
 /*****************************************************************************
@@ -305,6 +306,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
     aligned_free(result);
     EIGEN_THROW;
   }
+  return result;
 }
 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
@@ -320,6 +322,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
   }
+  return result;
 }
 /** \internal Deletes objects constructed with aligned_new
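These added returns pair with the Macros.h change above: EIGEN_THROW no longer smuggles a `return {}` inside the macro, so each helper must now return on every control path itself. A host-only sketch of the resulting pattern, with the hypothetical SKETCH_THROW standing in for EIGEN_THROW:

    #include <cstdlib>

    #define SKETCH_THROW std::abort()   // a device build would use asm("trap;")

    int* make_buffer(bool fail) {
      int* result = new int[4];
      if (fail) {
        delete[] result;
        SKETCH_THROW;
      }
      return result;  // explicit return on the normal path, as in the patch
    }

    int main() { delete[] make_buffer(false); return 0; }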

View File

@@ -671,6 +671,14 @@ struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
 template<bool Vectorized>
 struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
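A rough sanity check of the arithmetic in the new specialization, assuming NumTraits<float>::MulCost == NumTraits<float>::AddCost == 1, so the generic template above gives scalar_div_cost<float>::value == 8:

    #include <cassert>

    int main() {
      const int MulCost = 1, AddCost = 1;
      const int float_div_cost = 8 * MulCost;          // generic scalar_div_cost<T>
      const int complex_div_cost = 2 * float_div_cost  // the 2*scalar_div_cost<T> term
                                 + 6 * MulCost         // the 6*MulCost term
                                 + 3 * AddCost;        // the 3*AddCost term
      assert(complex_div_cost == 25);
      return 0;
    }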

View File

@@ -158,7 +158,8 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 /** Set \c *this from a \b unit quaternion.
-  * The resulting axis is normalized.
+  *
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   *
   * This function implicitly normalizes the quaternion \a q.
   */
@@ -167,12 +168,16 @@ template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
   using std::atan2;
+  using std::abs;
   Scalar n = q.vec().norm();
   if(n<NumTraits<Scalar>::epsilon())
     n = q.vec().stableNorm();
-  if (n > Scalar(0))
+  if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, q.w());
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
+    if(q.w() < 0)
+      n = -n;
     m_axis = q.vec() / n;
   }
   else
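With this change, q and -q (which encode the same rotation) should recover the same angle in [0, pi], since abs(q.w()) is used and the axis sign is flipped instead. A small check:

    #include <Eigen/Geometry>
    #include <iostream>

    int main() {
      Eigen::Quaterniond q(Eigen::AngleAxisd(3.0, Eigen::Vector3d::UnitZ()));
      Eigen::Quaterniond mq(-q.w(), -q.x(), -q.y(), -q.z());  // same rotation as q
      Eigen::AngleAxisd a1(q), a2(mq);
      std::cout << a1.angle() << " vs " << a2.angle() << "\n";  // both 3, in [0, pi]
      return 0;
    }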

View File

@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,i), coeff(k,i));
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
       {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
         res[1] = -atan2(s2, coeff(i,i));
       }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,k), coeff(k,k));
       Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         res[1] = atan2(-coeff(i,k), -c2);
       }
       else
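The rewritten branch keeps res[0] in its documented half-range; a quick round trip through eulerAngles (the first recovered angle lies in [0, pi], the other two in [-pi, pi]):

    #include <Eigen/Geometry>
    #include <iostream>

    int main() {
      Eigen::Matrix3d R;
      R = Eigen::AngleAxisd(0.5,  Eigen::Vector3d::UnitX())
        * Eigen::AngleAxisd(1.0,  Eigen::Vector3d::UnitY())
        * Eigen::AngleAxisd(-0.7, Eigen::Vector3d::UnitZ());
      Eigen::Vector3d ea = R.eulerAngles(0, 1, 2);  // recover the X-Y-Z angles
      std::cout << ea.transpose() << "\n";          // 0.5 1 -0.7
      return 0;
    }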

View File

@@ -402,6 +402,18 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, Homog
   }
 };
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
 template<typename Lhs,typename Rhs>
 struct homogeneous_left_product_refactoring_helper
 {

View File

@@ -464,7 +464,7 @@ public:
   operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
@@ -578,7 +578,7 @@ public:
     return res;
   }
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
   template<typename Derived>
   inline Transform& operator=(const RotationBase<Derived,Dim>& r);
@@ -853,7 +853,7 @@ Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &oth
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }

View File

@@ -130,8 +130,10 @@ public:
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template<typename Derived>
+  inline typename internal::enable_if<Derived::IsVectorAtCompileTime,VectorType>::type
+  operator* (const MatrixBase<Derived>& vec) const
+  { return m_coeffs + vec.derived(); }
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
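
Since the operator is now a constrained template, any fixed-size vector expression binds to it directly instead of first being materialized into a plain VectorType temporary. A minimal usage sketch (ours):

#include <Eigen/Geometry>
using namespace Eigen;

int main()
{
  Translation3d t(Vector3d(1.0, 2.0, 3.0));
  Vector3d v(4.0, 5.0, 6.0);
  // The right-hand side is a CwiseBinaryOp expression, not a Vector3d; the
  // SFINAE constraint (Derived::IsVectorAtCompileTime) admits it as-is.
  Vector3d w = t * (2.0 * v + Vector3d::Ones());
  return w == Vector3d(10.0, 13.0, 16.0) ? 0 : 1;
}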

View File

@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
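
The guard works because a Householder transformation H = I - tau * v * v^* degenerates to the identity when tau is zero, so the rank-one update can be skipped outright instead of being applied with a zero coefficient. A minimal sketch (ours) of the now-cheap no-op case:

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXd A = MatrixXd::Random(4, 4), B = A;
  VectorXd essential = VectorXd::Random(3);  // v = [1; essential]
  VectorXd workspace(4);
  // tau == 0 means H = I; the update is now skipped entirely.
  B.applyHouseholderOnTheLeft(essential, 0.0, workspace.data());
  return B.isApprox(A) ? 0 : 1;
}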

View File

@@ -156,7 +156,7 @@ template<typename _MatrixType> class FullPivLU
      *
      * \sa permutationQ()
      */
-    inline const PermutationPType& permutationP() const
+    EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       return m_p;
@@ -406,8 +406,8 @@ template<typename _MatrixType> class FullPivLU
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>

View File

@@ -665,10 +665,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  // FIXME What about considerering any denormal numbers as zero, using:
-  // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().maxCoeff();
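
For scale (our numbers, double precision): the two thresholds differ by roughly fifteen orders of magnitude, so every subnormal input is now flushed to zero, which is what breaks the infinite-loop scenario of bug 286.

#include <iostream>
#include <limits>

int main()
{
  // Old threshold: twice the smallest subnormal double (~9.9e-324).
  std::cout << 2 * std::numeric_limits<double>::denorm_min() << "\n";
  // New threshold: the smallest normalized double (~2.2e-308).
  std::cout << (std::numeric_limits<double>::min)() << "\n";
  return 0;
}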

View File

@@ -141,6 +141,15 @@ template<typename Derived> class SparseMatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
 # include "../plugins/CommonCwiseUnaryOps.h"
 # include "../plugins/CommonCwiseBinaryOps.h"
 # include "../plugins/MatrixCwiseUnaryOps.h"
@@ -149,8 +158,10 @@ template<typename Derived> class SparseMatrixBase
 # ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #  include EIGEN_SPARSEMATRIXBASE_PLUGIN
 # endif
-#  undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }

File diff suppressed because it is too large

View File

@@ -36,8 +36,10 @@ typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> Negati
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/** \returns an expression of the opposite of \c *this
-  */
+/// \returns an expression of the opposite of \c *this
+///
+EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
+///
 EIGEN_DEVICE_FUNC
 inline const NegativeReturnType
 operator-() const { return NegativeReturnType(derived()); }
@@ -45,13 +47,15 @@ operator-() const { return NegativeReturnType(derived()); }
 template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
 
-/** \returns an expression of *this with the \a Scalar type casted to
-  * \a NewScalar.
-  *
-  * The template parameter \a NewScalar is the type we are casting the scalars to.
-  *
-  * \sa class CwiseUnaryOp
-  */
+/// \returns an expression of \c *this with the \a Scalar type casted to
+/// \a NewScalar.
+///
+/// The template parameter \a NewScalar is the type we are casting the scalars to.
+///
+EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
+///
+/// \sa class CwiseUnaryOp
+///
 template<typename NewType>
 EIGEN_DEVICE_FUNC
 typename CastXpr<NewType>::Type
@@ -60,9 +64,11 @@ cast() const
   return typename CastXpr<NewType>::Type(derived());
 }
 
-/** \returns an expression of the complex conjugate of \c *this.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint() */
+/// \returns an expression of the complex conjugate of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
 EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
@@ -70,39 +76,45 @@ conjugate() const
   return ConjugateReturnType(derived());
 }
 
-/** \returns a read-only expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a read-only expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline RealReturnType
 real() const { return RealReturnType(derived()); }
 
-/** \returns an read-only expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns an read-only expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline const ImagReturnType
 imag() const { return ImagReturnType(derived()); }
 
-/** \brief Apply a unary operator coefficient-wise
-  * \param[in] func Functor implementing the unary operator
-  * \tparam CustomUnaryOp Type of \a func
-  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp_ptrfun.cpp
-  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
-  *
-  * Genuine functors allow for more possibilities, for instance it may contain a state.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \brief Apply a unary operator coefficient-wise
+/// \param[in] func Functor implementing the unary operator
+/// \tparam CustomUnaryOp Type of \a func
+/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+///
+/// Example:
+/// \include class_CwiseUnaryOp_ptrfun.cpp
+/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+///
+/// Genuine functors allow for more possibilities, for instance it may contain a state.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
+///
+/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
+///
 template<typename CustomUnaryOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
@@ -111,17 +123,19 @@ unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
   return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
 }
 
-/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The template parameter \a CustomUnaryOp is the type of the functor
-  * of the custom unary operator.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
+/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
+///
+/// \sa unaryExpr, binaryExpr class CwiseUnaryOp
+///
 template<typename CustomViewOp>
 EIGEN_DEVICE_FUNC
 inline const CwiseUnaryView<CustomViewOp, const Derived>
@@ -130,16 +144,20 @@ unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
   return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
 }
 
-/** \returns a non const expression of the real part of \c *this.
-  *
-  * \sa imag() */
+/// \returns a non const expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real,real part function)
+///
+/// \sa imag()
 EIGEN_DEVICE_FUNC
 inline NonConstRealReturnType
 real() { return NonConstRealReturnType(derived()); }
 
-/** \returns a non const expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
+/// \returns a non const expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
+///
+/// \sa real()
 EIGEN_DEVICE_FUNC
 inline NonConstImagReturnType
 imag() { return NonConstImagReturnType(derived()); }

View File

@@ -11,63 +11,75 @@
 // This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
 // This include MatrixBase and SparseMatrixBase.
 
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
 
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs.out
-  *
-  * \sa cwiseAbs2()
-  */
+/// \returns an expression of the coefficient-wise absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
+///
+/// \sa cwiseAbs2()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbsReturnType
 cwiseAbs() const { return CwiseAbsReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs2.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs2.out
-  *
-  * \sa cwiseAbs()
-  */
+/// \returns an expression of the coefficient-wise squared absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs2.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs2.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
+///
+/// \sa cwiseAbs()
+///
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
 cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include MatrixBase_cwiseSqrt.cpp
-  * Output: \verbinclude MatrixBase_cwiseSqrt.out
-  *
-  * \sa cwisePow(), cwiseSquare()
-  */
+/// \returns an expression of the coefficient-wise square root of *this.
+///
+/// Example: \include MatrixBase_cwiseSqrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseSqrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
+///
+/// \sa cwisePow(), cwiseSquare()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSqrtReturnType
 cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise signum of *this.
-  *
-  * Example: \include MatrixBase_cwiseSign.cpp
-  * Output: \verbinclude MatrixBase_cwiseSign.out
-  *
-  */
+/// \returns an expression of the coefficient-wise signum of *this.
+///
+/// Example: \include MatrixBase_cwiseSign.cpp
+/// Output: \verbinclude MatrixBase_cwiseSign.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseSignReturnType
 cwiseSign() const { return CwiseSignReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
-  *
-  * \sa cwiseProduct()
-  */
+/// \returns an expression of the coefficient-wise inverse of *this.
+///
+/// Example: \include MatrixBase_cwiseInverse.cpp
+/// Output: \verbinclude MatrixBase_cwiseInverse.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
+///
+/// \sa cwiseProduct()
+///
 EIGEN_DEVICE_FUNC
 inline const CwiseInverseReturnType
 cwiseInverse() const { return CwiseInverseReturnType(derived()); }

View File

@ -1,10 +1,13 @@
find_package(BLAZE) find_package(BLAZE)
find_package(Boost) find_package(Boost COMPONENTS system)
if (BLAZE_FOUND AND Boost_FOUND) if (BLAZE_FOUND AND Boost_FOUND)
include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
btl_add_bench(btl_blaze main.cpp) btl_add_bench(btl_blaze main.cpp)
# Note: The newest blaze version requires C++14.
# Ideally, we should set this depending on the version of Blaze we found
set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
if(BUILD_btl_blaze) if(BUILD_btl_blaze)
target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a ) target_link_libraries(btl_blaze ${Boost_LIBRARIES})
endif() endif()
endif () endif ()

View File

@@ -53,6 +53,33 @@ showing that the program works as expected:
 
 This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
 
+
+\section NullaryExpr_Indexing Example 2: indexing rows and columns
+
+The goal here is to mimic MatLab's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
+
+\snippet nullary_indexing.out main1
+
+To this end, let us first write a nullary-functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
+
+\snippet nullary_indexing.cpp functor
+
+Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:
+
+\snippet nullary_indexing.cpp function
+
+Finally, here is an example of how this function can be used:
+
+\snippet nullary_indexing.cpp main1
+
+This straightforward implementation is already quite powerful as the row or column index arrays can also be expressions to perform offsetting, modulo, striding, reverse, etc.
+
+\snippet nullary_indexing.cpp main2
+
+and the output is:
+
+\snippet nullary_indexing.out main2
+
 */
 
 }

View File

@@ -1612,7 +1612,10 @@ EXPAND_AS_DEFINED = EIGEN_MAKE_TYPEDEFS \
                          EIGEN_EMPTY \
                          EIGEN_EULER_ANGLES_TYPEDEFS \
                          EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
-                         EIGEN_EULER_SYSTEM_TYPEDEF
+                         EIGEN_EULER_SYSTEM_TYPEDEF \
+                         EIGEN_DOC_UNARY_ADDONS \
+                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
+                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
 # doxygen's preprocessor will remove all references to function-like macros

View File

@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
     )
   add_dependencies(all_examples ${example})
 endforeach(example_src)
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+  ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
+endif()

View File

@@ -0,0 +1,52 @@
#include <Eigen/Core>
#include <iostream>
using namespace Eigen;
// [circulant_func]
template<class ArgType>
class circulant_functor {
const ArgType &m_vec;
public:
circulant_functor(const ArgType& arg) : m_vec(arg) {}
const typename ArgType::Scalar& operator() (Index row, Index col) const {
Index index = row - col;
if (index < 0) index += m_vec.size();
return m_vec(index);
}
};
// [circulant_func]
// [square]
template<class ArgType>
struct circulant_helper {
typedef Matrix<typename ArgType::Scalar,
ArgType::SizeAtCompileTime,
ArgType::SizeAtCompileTime,
ColMajor,
ArgType::MaxSizeAtCompileTime,
ArgType::MaxSizeAtCompileTime> MatrixType;
};
// [square]
// [makeCirculant]
template <class ArgType>
CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
{
typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));
}
// [makeCirculant]
// [main]
int main()
{
Eigen::VectorXd vec(4);
vec << 1, 2, 4, 8;
Eigen::MatrixXd mat;
mat = makeCirculant(vec);
std::cout << mat << std::endl;
}
// [main]

View File

@@ -0,0 +1,66 @@
#include <Eigen/Core>
#include <iostream>
using namespace Eigen;
// [functor]
template<class ArgType, class RowIndexType, class ColIndexType>
class indexing_functor {
const ArgType &m_arg;
const RowIndexType &m_rowIndices;
const ColIndexType &m_colIndices;
public:
typedef Matrix<typename ArgType::Scalar,
RowIndexType::SizeAtCompileTime,
ColIndexType::SizeAtCompileTime,
ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
RowIndexType::MaxSizeAtCompileTime,
ColIndexType::MaxSizeAtCompileTime> MatrixType;
indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
: m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
{}
const typename ArgType::Scalar& operator() (Index row, Index col) const {
return m_arg(m_rowIndices[row], m_colIndices[col]);
}
};
// [functor]
// [function]
template <class ArgType, class RowIndexType, class ColIndexType>
CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
{
typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
typedef typename Func::MatrixType MatrixType;
return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
}
// [function]
int main()
{
std::cout << "[main1]\n";
Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
Array3i ri(1,2,1);
ArrayXi ci(6); ci << 3,2,1,0,0,2;
Eigen::MatrixXi B = indexing(A, ri, ci);
std::cout << "A =" << std::endl;
std::cout << A << std::endl << std::endl;
std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
std::cout << B << std::endl;
std::cout << "[main1]\n";
std::cout << "[main2]\n";
B = indexing(A, ri+1, ci);
std::cout << "A(ri+1,ci) =" << std::endl;
std::cout << B << std::endl << std::endl;
#if __cplusplus >= 201103L
B = indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
std::cout << B << std::endl << std::endl;
#endif
std::cout << "[main2]\n";
}

View File

@@ -355,7 +355,7 @@ if(CUDA_FOUND)
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")

View File

@@ -417,6 +417,7 @@ void cholesky_faillure_cases()
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
     VERIFY(ldlt.info()==NumericalIssue);
   }
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
   {
     mat.resize(3,3);
     mat << -1, -3, 3,
@@ -426,6 +427,7 @@ void cholesky_faillure_cases()
     VERIFY(ldlt.info()==NumericalIssue);
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
   }
+#endif
   {
     mat.resize(3,3);
     mat << 1, 2, 3,

View File

@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
     VERIFY( !m.allFinite() );
     VERIFY( m.hasNaN() );
   }
-  m(4) /= T(0.0);
+  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
+  m(4) /= hidden_zero;
   if(dryrun)
   {
     std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
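
Our reading of the hidden_zero trick: the product of two smallest normalized values underflows to +0.0 at run time, so no literal division by zero appears in the source for a compiler to warn about or constant-fold, yet m(4) still ends up infinite. A standalone sketch:

#include <iostream>
#include <limits>

int main()
{
  // Underflows to +0.0 at run time; no literal zero divisor in the source.
  double hidden_zero = (std::numeric_limits<double>::min)()
                     * (std::numeric_limits<double>::min)();
  std::cout << 1.0 / hidden_zero << "\n";  // prints: inf
  return 0;
}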

View File

@@ -111,6 +111,8 @@ template<typename Scalar,int Size> void homogeneous(void)
   VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
   VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );
+
+  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
 }
 
 void test_geo_homogeneous()

View File

@@ -334,6 +334,9 @@ template<typename Scalar, int Mode, int Options> void transformations()
   t0.scale(v0);
   t1 *= AlignedScaling3(v0);
   VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
+  t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
+  t1 = t1 * v0.asDiagonal();
+  VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
 
   // transformation * translation
   t0.translate(v0);
   t1 = t1 * Translation3(v0);
@@ -482,6 +485,79 @@ template<typename Scalar, int Mode, int Options> void transformations()
     Rotation2D<Scalar> r2(r1); // copy ctor
     VERIFY_IS_APPROX(r2.angle(),s0);
   }
+
+  {
+    Transform3 t32(Matrix4::Random()), t33, t34;
+    t34 = t33 = t32;
+    t32.scale(v0);
+    t33*=AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+    t33 = t34 * AlignedScaling3(v0);
+    VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
+  }
 }
 
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity_left(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( q*(a1*v), (q*a1)*v );
+  VERIFY_IS_APPROX( q*(a2*v), (q*a2)*v );
+  VERIFY_IS_APPROX( q*(p*h).hnormalized(), ((q*p)*h).hnormalized() );
+}
+
+template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
+void transform_associativity2(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
+{
+  VERIFY_IS_APPROX( a1*(q*v), (a1*q)*v );
+  VERIFY_IS_APPROX( a2*(q*v), (a2*q)*v );
+  VERIFY_IS_APPROX( p *(q*v).homogeneous(), (p *q)*v.homogeneous() );
+
+  transform_associativity_left(a1, a2,p, q, v, h);
+}
+
+template<typename Scalar, int Dim, int Options,typename RotationType>
+void transform_associativity(const RotationType& R)
+{
+  typedef Matrix<Scalar,Dim,1> VectorType;
+  typedef Matrix<Scalar,Dim+1,1> HVectorType;
+  typedef Matrix<Scalar,Dim,Dim> LinearType;
+  typedef Matrix<Scalar,Dim+1,Dim+1> MatrixType;
+  typedef Transform<Scalar,Dim,AffineCompact,Options> AffineCompactType;
+  typedef Transform<Scalar,Dim,Affine,Options> AffineType;
+  typedef Transform<Scalar,Dim,Projective,Options> ProjectiveType;
+  typedef DiagonalMatrix<Scalar,Dim> ScalingType;
+  typedef Translation<Scalar,Dim> TranslationType;
+
+  AffineCompactType A1c; A1c.matrix().setRandom();
+  AffineCompactType A2c; A2c.matrix().setRandom();
+  AffineType A1(A1c);
+  AffineType A2(A2c);
+  ProjectiveType P1; P1.matrix().setRandom();
+  VectorType v1 = VectorType::Random();
+  VectorType v2 = VectorType::Random();
+  HVectorType h1 = HVectorType::Random();
+  Scalar s1 = internal::random<Scalar>();
+  LinearType L = LinearType::Random();
+  MatrixType M = MatrixType::Random();
+
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2c, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, v1.asDiagonal(), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, ScalingType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(s1), v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, TranslationType(v1), v2, h1) );
+  CALL_SUBTEST( transform_associativity_left(A1c, A1, P1, L, v2, h1) );
+  CALL_SUBTEST( transform_associativity2(A1c, A1, P1, R, v2, h1) );
+
+  VERIFY_IS_APPROX( A1*(M*h1), (A1*M)*h1 );
+  VERIFY_IS_APPROX( A1c*(M*h1), (A1c*M)*h1 );
+  VERIFY_IS_APPROX( P1*(M*h1), (P1*M)*h1 );
+
+  VERIFY_IS_APPROX( M*(A1*h1), (M*A1)*h1 );
+  VERIFY_IS_APPROX( M*(A1c*h1), (M*A1c)*h1 );
+  VERIFY_IS_APPROX( M*(P1*h1), ((M*P1)*h1) );
+}
+
 template<typename Scalar> void transform_alignment()
@@ -562,5 +638,8 @@ void test_geo_transformations()
     CALL_SUBTEST_7(( transform_products<double,3,RowMajor|AutoAlign>() ));
     CALL_SUBTEST_7(( transform_products<float,2,AutoAlign>() ));
+
+    CALL_SUBTEST_8(( transform_associativity<double,2,ColMajor>(Rotation2D<double>(internal::random<double>()*double(EIGEN_PI))) ));
+    CALL_SUBTEST_8(( transform_associativity<double,3,ColMajor>(Quaterniond::UnitRandom()) ));
   }
 }

View File

@@ -365,6 +365,7 @@ template<typename Scalar> void packetmath_real()
   }
 
   if (PacketTraits::HasTanh) {
+    // NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
     packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
     h.store(data2, internal::ptanh(h.load(data1)));

View File

@@ -256,6 +256,51 @@ Index compute_block_size()
   return ret;
 }
 
+template<int>
+void bug_1308()
+{
+  int n = 10;
+  MatrixXd r(n,n);
+  VectorXd v = VectorXd::Random(n);
+  r = v * RowVectorXd::Ones(n);
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
+  r = VectorXd::Ones(n) * v.transpose();
+  VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
+
+  Matrix4d ones44 = Matrix4d::Ones();
+  Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+  typedef Matrix<double,4,4,RowMajor> RMatrix4d;
+  RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
+  VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
+  VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
+
+//  RowVector4d r4;
+  m44.setOnes();
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
+  r44.setZero();
+  VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
+}
+
 void test_product_extra()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -268,8 +313,10 @@ void test_product_extra()
   }
   CALL_SUBTEST_5( bug_127<0>() );
   CALL_SUBTEST_5( bug_817<0>() );
+  CALL_SUBTEST_5( bug_1308<0>() );
   CALL_SUBTEST_6( unaligned_objects<0>() );
   CALL_SUBTEST_7( compute_block_size<float>() );
   CALL_SUBTEST_7( compute_block_size<double>() );
   CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
 }

View File

@@ -12,6 +12,7 @@
 #include <Eigen/LU>
 
 // regression test for bug 447
+template<int>
 void product1x1()
 {
   Matrix<float,1,3> matAstatic;
@@ -209,15 +210,34 @@ void test_linear_but_not_vectorizable()
   }
 }
 
+template<int Rows>
+void bug_1311()
+{
+  Matrix< double, Rows, 2 > A;  A.setRandom();
+  Vector2d b = Vector2d::Random() ;
+  Matrix<double,Rows,1> res;
+  res.noalias() = 1. * (A * b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = 1.*A * b;
+  VERIFY_IS_APPROX(res, A*b);
+
+  res.noalias() = (1.*A).lazyProduct(b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (1.*A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+  res.noalias() = (A).lazyProduct(1.*b);
+  VERIFY_IS_APPROX(res, A*b);
+}
+
 void test_product_small()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
-    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
+    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
+    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );
-    CALL_SUBTEST_6( product1x1() );
+    CALL_SUBTEST_6( product1x1<0>() );
 
     CALL_SUBTEST_11( test_lazy_l1<float>() );
     CALL_SUBTEST_12( test_lazy_l2<float>() );
@@ -238,6 +258,9 @@ void test_product_small()
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
     CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));
+
+    CALL_SUBTEST_6( bug_1311<3>() );
+    CALL_SUBTEST_6( bug_1311<5>() );
   }
 
 #ifdef EIGEN_TEST_PART_6

View File

@@ -7,6 +7,16 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+template<typename T>
+Array<T,4,1> four_denorms();
+
+template<>
+Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
+template<>
+Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+template<typename T>
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
+
 template<typename MatrixType>
 void svd_fill_random(MatrixType &m, int Option = 0)
 {
@@ -55,7 +65,8 @@ void svd_fill_random(MatrixType &m, int Option = 0)
   }
 
   Matrix<Scalar,Dynamic,1> samples(9);
-  samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
+  samples << 0, four_denorms<RealScalar>(),
+             -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
 
   if(Option==Symmetric)
   {

View File

@@ -61,8 +61,9 @@ typedef unsigned __int64 uint64_t;
 #ifdef EIGEN_USE_GPU
 #include <iostream>
 #include <cuda_runtime.h>
-#if defined(__CUDACC__)
-#include <curand_kernel.h>
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
 #endif
 #endif
 
@@ -81,6 +82,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorDimensions.h"
 #include "src/Tensor/TensorInitializer.h"
 #include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
 #include "src/Tensor/TensorUInt128.h"
 #include "src/Tensor/TensorIntDiv.h"
 #include "src/Tensor/TensorGlobalFunctions.h"

View File

@@ -51,12 +51,15 @@ class TensorOpCost {
         internal::scalar_cast_op<SrcType, TargetType> >::Cost;
   }
 
+  EIGEN_DEVICE_FUNC
   TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
       : bytes_loaded_(bytes_loaded),
         bytes_stored_(bytes_stored),
         compute_cycles_(compute_cycles) {}
+  EIGEN_DEVICE_FUNC
   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
                bool vectorized, double packet_size)
       : bytes_loaded_(bytes_loaded),

View File

@@ -42,7 +42,21 @@ static bool m_devicePropInitialized = false;
 
 static void initializeDeviceProp() {
   if (!m_devicePropInitialized) {
-    if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutex don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support c++11 we
+    // can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+    static std::atomic<bool> first(true);
+    if (first.exchange(false)) {
+#else
+    static bool first = true;
+    if (first) {
+      first = false;
+#endif
+      // We're the first thread to reach this point.
       int num_devices;
       cudaError_t status = cudaGetDeviceCount(&num_devices);
       if (status != cudaSuccess) {
@@ -63,7 +77,19 @@ static void initializeDeviceProp() {
           assert(status == cudaSuccess);
         }
       }
+#if __cplusplus >= 201103L
+      std::atomic_thread_fence(std::memory_order_release);
+#endif
       m_devicePropInitialized = true;
+    } else {
+      // Wait for the other thread to inititialize the properties.
+      while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        sleep(1);
+      }
     }
   }
 }
@@ -168,39 +194,20 @@ struct GpuDevice {
     return stream_->stream();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return stream_->allocate(num_bytes);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     stream_->deallocate(buffer);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
     return stream_->semaphore();
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return NULL;
-#endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -210,30 +217,22 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
     cudaError_t err =
         cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -242,21 +241,21 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE size_t numThreads() const {
     // FIXME
     return 32;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     // FIXME
     return 48*1024;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
     // We won't try to take advantage of the l2 cache for the time being, and
     // there is no l3 cache on cuda devices.
     return firstLevelCacheSize();
@@ -276,56 +275,26 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
     return stream_->deviceProperties().multiProcessorCount;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
   }
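
The initialization pattern above, reduced to its skeleton (our sketch, C++11 path only): exactly one thread wins the exchange and performs the work; the release fence pairs with the acquire fences of the waiting threads so the published data is visible once the flag reads true. Like the original, it leaves the flag a plain bool, which nvcc tolerates but strict C++11 would call a data race.

#include <atomic>
#include <unistd.h>

static bool resource_ready = false;  // stands in for m_devicePropInitialized

void init_once() {
  static std::atomic<bool> first(true);
  if (first.exchange(false)) {
    // ... expensive one-time initialization goes here ...
    std::atomic_thread_fence(std::memory_order_release);
    resource_ready = true;
  } else {
    // Losing threads wait until the winner publishes the result.
    while (!resource_ready) {
      std::atomic_thread_fence(std::memory_order_acquire);
      sleep(1);
    }
  }
}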

View File

@@ -234,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
   const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const Index step_size = blockDim.x * gridDim.x;
 
-  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
-  // complex types such as evaluators we should really conform to the C++
-  // standard and call a proper copy constructor.
-  Evaluator eval(memcopied_eval);
-
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
   EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
 }

View File

@@ -99,7 +99,8 @@ template <typename T> struct SumReducer
   static const bool IsStateful = false;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -145,7 +146,8 @@ template <typename T> struct MeanReducer
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
-    (*accum) += t;
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
     scalarCount_++;
   }
   template <typename Packet>
@@ -190,25 +192,25 @@ struct reducer_traits<MeanReducer<T>, Device> {
 template <typename T, bool IsMax = true, bool IsInteger = true>
 struct MinMaxBottomValue {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
     return Eigen::NumTraits<T>::lowest();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, true, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
    return -Eigen::NumTraits<T>::infinity();
   }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, true> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
    return Eigen::NumTraits<T>::highest();
  }
 };
 template <typename T>
 struct MinMaxBottomValue<T, false, false> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
    return Eigen::NumTraits<T>::infinity();
  }
 };
@@ -439,448 +441,6 @@ struct reducer_traits<ArgMinTupleReducer<T>, Device> {
 };
// Random number generation
namespace {
#ifdef __CUDA_ARCH__
__device__ int get_random_seed() {
return clock();
}
#else
static inline int get_random_seed() {
#ifdef _WIN32
SYSTEMTIME st;
GetSystemTime(&st);
return st.wSecond + 1000 * st.wMilliseconds;
#elif defined __APPLE__
return static_cast<int>(mach_absolute_time());
#else
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return static_cast<int>(ts.tv_nsec);
#endif
}
#endif
}
#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
// We're not compiling a cuda kernel
template <typename T> class UniformRandomGenerator {
public:
static const bool PacketAccess = true;
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
if (!deterministic) {
srand(get_random_seed());
}
}
UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
}
T operator()() const {
return random<T>();
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = random<T>();
}
return internal::pload<PacketType>(values);
}
private:
bool m_deterministic;
};
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
template <> class UniformRandomGenerator<float> {
public:
static const bool PacketAccess = true;
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
if (!deterministic) {
m_generator->seed(get_random_seed());
}
}
UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
m_generator = new std::mt19937();
m_generator->seed(other() * UINT_MAX);
m_deterministic = other.m_deterministic;
}
~UniformRandomGenerator() {
delete m_generator;
}
float operator()() const {
return m_distribution(*m_generator);
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX float values[packetSize];
for (int k = 0; k < packetSize; ++k) {
values[k] = this->operator()();
}
return internal::pload<PacketType>(values);
}
private:
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
// Make sure m_deterministic comes first to match the layout of the cpu
// version of the code.
bool m_deterministic;
std::mt19937* m_generator;
mutable std::uniform_real_distribution<float> m_distribution;
};
template <> class UniformRandomGenerator<double> {
public:
static const bool PacketAccess = true;
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
if (!deterministic) {
m_generator->seed(get_random_seed());
}
}
UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
m_generator = new std::mt19937();
m_generator->seed(other() * UINT_MAX);
m_deterministic = other.m_deterministic;
}
~UniformRandomGenerator() {
delete m_generator;
}
double operator()() const {
return m_distribution(*m_generator);
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX double values[packetSize];
for (int k = 0; k < packetSize; ++k) {
values[k] = this->operator()();
}
return internal::pload<PacketType>(values);
}
private:
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
// Make sure m_deterministic comes first to match the layout of the cpu
// version of the code.
bool m_deterministic;
std::mt19937* m_generator;
mutable std::uniform_real_distribution<double> m_distribution;
};
#endif
#else
// We're compiling a cuda kernel
template <typename T> class UniformRandomGenerator;
template <> class UniformRandomGenerator<float> {
public:
static const bool PacketAccess = true;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ float operator()() const {
return curand_uniform(&m_state);
}
template<typename PacketType>
__device__ float4 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_uniform4(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class UniformRandomGenerator<double> {
public:
static const bool PacketAccess = true;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ double operator()() const {
return curand_uniform_double(&m_state);
}
template<typename PacketType>
__device__ double2 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_uniform2_double(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class UniformRandomGenerator<std::complex<float> > {
public:
static const bool PacketAccess = false;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<float> operator()() const {
float4 vals = curand_uniform4(&m_state);
return std::complex<float>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class UniformRandomGenerator<std::complex<double> > {
public:
static const bool PacketAccess = false;
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<double> operator()() const {
double2 vals = curand_uniform2_double(&m_state);
return std::complex<double>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
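// Illustration only, not part of the library: a minimal hypothetical kernel
// showing how these device-side generators are meant to be driven. Each
// thread constructs its own generator; the constructor seeds a per-thread
// Philox state from blockIdx/threadIdx, so threads draw decorrelated values.
__global__ void example_fill_uniform(float* out, int n) {
  UniformRandomGenerator<float> gen(/*deterministic=*/true);
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = gen();
  }
}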
#endif
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
enum {
// Rough estimate.
Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
};
};
#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
// We're not compiling a cuda kernel
template <typename T> class NormalRandomGenerator {
public:
static const bool PacketAccess = true;
NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
if (!deterministic) {
m_generator->seed(get_random_seed());
}
}
NormalRandomGenerator(const NormalRandomGenerator& other)
: m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
m_generator->seed(other() * UINT_MAX);
}
~NormalRandomGenerator() {
delete m_generator;
}
T operator()() const {
return m_distribution(*m_generator);
}
template<typename PacketType>
PacketType packetOp() const {
const int packetSize = internal::unpacket_traits<PacketType>::size;
EIGEN_ALIGN_MAX T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = m_distribution(*m_generator);
}
return internal::pload<PacketType>(values);
}
private:
// No assignment
NormalRandomGenerator& operator = (const NormalRandomGenerator&);
bool m_deterministic;
mutable std::normal_distribution<T> m_distribution;
std::mt19937* m_generator;
};
#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
// We're compiling a cuda kernel
template <typename T> class NormalRandomGenerator;
template <> class NormalRandomGenerator<float> {
public:
static const bool PacketAccess = true;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ float operator()() const {
return curand_normal(&m_state);
}
template<typename PacketType>
__device__ float4 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_normal4(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class NormalRandomGenerator<double> {
public:
static const bool PacketAccess = true;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ double operator()() const {
return curand_normal_double(&m_state);
}
template<typename PacketType>
__device__ double2 packetOp() const {
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return curand_normal2_double(&m_state);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class NormalRandomGenerator<std::complex<float> > {
public:
static const bool PacketAccess = false;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<float> operator()() const {
float4 vals = curand_normal4(&m_state);
return std::complex<float>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
template <> class NormalRandomGenerator<std::complex<double> > {
public:
static const bool PacketAccess = false;
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
m_deterministic = other.m_deterministic;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int seed = m_deterministic ? 0 : get_random_seed();
curand_init(seed, tid, 0, &m_state);
}
__device__ std::complex<double> operator()() const {
double2 vals = curand_normal2_double(&m_state);
return std::complex<double>(vals.x, vals.y);
}
private:
bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
#else
template <typename T> class NormalRandomGenerator {
public:
static const bool PacketAccess = false;
NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
private:
bool m_deterministic;
};
#endif
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
enum {
// Rough estimate.
Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
};
};
template <typename T, typename Index, size_t NumDims>
class GaussianGenerator {
public:
@ -895,7 +455,7 @@ class GaussianGenerator {
}
}
EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
T tmp = T(0);
for (size_t i = 0; i < NumDims; ++i) {
T offset = coordinates[i] - m_means[i];

View File

@ -0,0 +1,276 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
namespace Eigen {
namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#ifdef __CUDA_ARCH__
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);
return clock64() +
blockIdx.x * blockDim.x + threadIdx.x +
gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
#elif defined _WIN32
// Use the current time as a baseline.
SYSTEMTIME st;
GetSystemTime(&st);
int time = st.wSecond + 1000 * st.wMilliseconds;
// Mix in a random number to make sure that we get different seeds if
// we try to generate seeds faster than the clock resolution.
// We need 2 random values since the generator only generates 16 bits at
// a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
uint rnd1 = ::rand();
uint rnd2 = ::rand();
uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
return rnd;
#elif defined __APPLE__
// Same approach as for win32, except that the random number generator
// is better (see https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
uint64_t rnd = ::random() ^ mach_absolute_time();
return rnd;
#else
// Augment the current time with pseudo random number generation
// to ensure that we get different seeds if we try to generate seeds
// faster than the clock resolution.
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
uint64_t rnd = ::random() ^ ts.tv_nsec;
return rnd;
#endif
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
// TODO: Unify with the implementation in the non blocking thread pool.
uint64_t current = *state;
// Update the internal state
*state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
// Generate the random output (using the PCG-XSH-RS scheme)
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
seed = seed ? seed : get_random_seed();
return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}
} // namespace
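// Illustration only, not part of the commit: a small helper exercising the
// PCG-XSH-RS scheme above. Seeding with a fixed non-zero value makes the
// 32-bit output stream fully reproducible; each draw advances the 64-bit
// state through the same LCG step used by PCG_XSH_RS_state.
inline void example_pcg_stream() {
  uint64_t state = PCG_XSH_RS_state(42);           // fixed seed -> deterministic
  unsigned first = PCG_XSH_RS_generator(&state);   // first 32-bit draw
  unsigned second = PCG_XSH_RS_generator(&state);  // differs from the first
  EIGEN_UNUSED_VARIABLE(first);
  EIGEN_UNUSED_VARIABLE(second);
}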
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeUniform(uint64_t* state) {
unsigned rnd = PCG_XSH_RS_generator(state);
return static_cast<T>(rnd);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
Eigen::half result;
// Generate 10 random bits for the mantissa
unsigned rnd = PCG_XSH_RS_generator(state);
result.x = static_cast<uint16_t>(rnd & 0x3ffu);
// Set the exponent
result.x |= (static_cast<uint16_t>(15) << 10);
// Return the final result
return result - Eigen::half(1.0f);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float RandomToTypeUniform<float>(uint64_t* state) {
typedef union {
uint32_t raw;
float fp;
} internal;
internal result;
// Generate 23 random bits for the mantissa
const unsigned rnd = PCG_XSH_RS_generator(state);
result.raw = rnd & 0x7fffffu;
// Set the exponent
result.raw |= (static_cast<uint32_t>(127) << 23);
// Return the final result
return result.fp - 1.0f;
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double RandomToTypeUniform<double>(uint64_t* state) {
typedef union {
uint64_t raw;
double dp;
} internal;
internal result;
result.raw = 0;
// Generate 52 random bits for the mantissa
// First generate the upper 20 bits
unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
// Then generate the lower 32 bits
unsigned rnd2 = PCG_XSH_RS_generator(state);
result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
// Set the exponent
result.raw |= (static_cast<uint64_t>(1023) << 52);
// Return the final result
return result.dp - 1.0;
}
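// Illustration only: the exponent/mantissa trick used by the float and double
// specializations above, spelled out as a standalone helper. Filling the 23
// mantissa bits while forcing the biased exponent to 127 builds a float in
// [1, 2); subtracting 1.0f then maps it to [0, 1) without any division.
inline float example_bits_to_unit_float(uint32_t rnd) {
  union { uint32_t raw; float fp; } u;
  u.raw = (rnd & 0x7fffffu) | (static_cast<uint32_t>(127) << 23);
  return u.fp - 1.0f;  // 0x000000 -> 0.0f, 0x7fffff -> just under 1.0f
}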
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
return std::complex<float>(RandomToTypeUniform<float>(state),
RandomToTypeUniform<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
return std::complex<double>(RandomToTypeUniform<double>(state),
RandomToTypeUniform<double>(state));
}
template <typename T> class UniformRandomGenerator {
public:
static const bool PacketAccess = true;
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
const UniformRandomGenerator& other) {
m_state = other.m_state;
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
uint64_t local_state = m_state + i;
T result = RandomToTypeUniform<T>(&local_state);
m_state = local_state;
return result;
}
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeUniform<T>(&local_state);
}
m_state = local_state;
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
};
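// Illustration only: how an evaluator-style caller drives the generator
// above. Each call derives a local state from m_state plus the coefficient
// index, then stores the advanced state back, which keeps the scalar and
// packet paths consistent without any locking.
inline void example_uniform_draws() {
  UniformRandomGenerator<float> gen(/*seed=*/123);  // non-zero -> reproducible
  float c0 = gen(0);  // draw for coefficient 0
  float c1 = gen(1);  // draw for coefficient 1
  EIGEN_UNUSED_VARIABLE(c0);
  EIGEN_UNUSED_VARIABLE(c1);
}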
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
enum {
// Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
Cost = 12 * NumTraits<Scalar>::AddCost *
((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
};
};
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeNormal(uint64_t* state) {
// Use the ratio-of-uniforms method to generate numbers following a normal
// distribution. See for example Numerical Recipes chapter 7.3.9 for the
// details.
T u, v, q;
do {
u = RandomToTypeUniform<T>(state);
v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
const T x = u - T(0.449871);
const T y = numext::abs(v) + T(0.386595);
q = x*x + y * (T(0.196)*y - T(0.25472)*x);
} while (q > T(0.27597) &&
(q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
return v/u;
}
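// Illustration only: a host-side sanity check for the ratio-of-uniforms
// sampler above. Over many draws the sample mean should approach 0 and the
// sample variance 1; the loose 0.05 bounds are an assumption for this sketch,
// not a test shipped with the library.
inline bool example_normal_moments_ok() {
  uint64_t state = PCG_XSH_RS_state(7);  // fixed seed for reproducibility
  double sum = 0.0, sum_sq = 0.0;
  const int n = 100000;
  for (int i = 0; i < n; ++i) {
    const double x = RandomToTypeNormal<double>(&state);
    sum += x;
    sum_sq += x * x;
  }
  const double mean = sum / n;
  const double var = sum_sq / n - mean * mean;
  return numext::abs(mean) < 0.05 && numext::abs(var - 1.0) < 0.05;
}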
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
return std::complex<float>(RandomToTypeNormal<float>(state),
RandomToTypeNormal<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
return std::complex<double>(RandomToTypeNormal<double>(state),
RandomToTypeNormal<double>(state));
}
template <typename T> class NormalRandomGenerator {
public:
static const bool PacketAccess = true;
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
const NormalRandomGenerator& other) {
m_state = other.m_state;
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
uint64_t local_state = m_state + i;
T result = RandomToTypeNormal<T>(&local_state);
m_state = local_state;
return result;
}
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeNormal<T>(&local_state);
}
m_state = local_state;
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
};
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
enum {
// On average, we need to generate about 3 random numbers
// 15 mul, 8 add, 1.5 logs
Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
};
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
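A minimal usage sketch, not part of the commit: user code typically reaches these generators through TensorBase::setRandom, which accepts a generator type as a template argument (assuming the unsupported Tensor module's templated setRandom overload).

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();  // uniform draws by default
  t.setRandom<Eigen::internal::NormalRandomGenerator<float> >();  // normal draws
  return 0;
}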

View File

@ -20,37 +20,60 @@ public:
AutoDiffJacobian(const Functor& f) : Functor(f) {}
// forward constructors
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... T>
AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
#else
template<typename T0>
AutoDiffJacobian(const T0& a0) : Functor(a0) {}
template<typename T0, typename T1>
AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
template<typename T0, typename T1, typename T2>
AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
#endif
typedef typename Functor::InputType InputType;
typedef typename Functor::ValueType ValueType;
typedef typename ValueType::Scalar Scalar;
enum {
InputsAtCompileTime = InputType::RowsAtCompileTime,
ValuesAtCompileTime = ValueType::RowsAtCompileTime
};
typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
typedef typename JacobianType::Index Index;
typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
typedef AutoDiffScalar<DerivativeType> ActiveScalar;
typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
#if EIGEN_HAS_VARIADIC_TEMPLATES
// Some compilers don't accept variadic parameters after a default parameter,
// i.e., we can't just write _jac=0 but we need to overload operator():
EIGEN_STRONG_INLINE
void operator() (const InputType& x, ValueType* v) const
{
this->operator()(x, v, 0);
}
template<typename... ParamsType>
void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
const ParamsType&... Params) const
#else
void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
#endif
{
eigen_assert(v!=0);
if (!_jac)
{
#if EIGEN_HAS_VARIADIC_TEMPLATES
Functor::operator()(x, v, Params...);
#else
Functor::operator()(x, v);
#endif
return;
}
@ -61,12 +84,16 @@ public:
if(InputsAtCompileTime==Dynamic)
for (Index j=0; j<jac.rows(); j++)
av[j].derivatives().resize(x.rows());
for (Index i=0; i<jac.cols(); i++)
ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
#if EIGEN_HAS_VARIADIC_TEMPLATES
Functor::operator()(ax, &av, Params...);
#else
Functor::operator()(ax, &av);
#endif
for (Index i=0; i<jac.rows(); i++)
{
@ -74,8 +101,6 @@ public:
jac.row(i) = av[i].derivatives();
}
}
};
}

View File

@ -189,7 +189,12 @@ namespace Eigen
res[0] = atan2(mat(J,K), mat(K,K));
Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
res[1] = atan2(-mat(I,K), -c2);
}
else
@ -212,7 +217,12 @@ namespace Eigen
res[0] = atan2(mat(J,I), mat(K,I));
if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
{
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
res[1] = -atan2(s2, mat(I,I));
}

View File

@ -203,7 +203,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
endif()
if(EIGEN_TEST_CUDA_CLANG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(cxx11_tensor_complex_cuda)
ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
ei_add_test(cxx11_tensor_reduction_cuda)
ei_add_test(cxx11_tensor_argmax_cuda)
ei_add_test(cxx11_tensor_cast_float16_cuda)

View File

@ -105,6 +105,89 @@ struct TestFunc1
}
};
#if EIGEN_HAS_VARIADIC_TEMPLATES
/* Test functor for the C++11 features. */
template <typename Scalar>
struct integratorFunctor
{
typedef Matrix<Scalar, 2, 1> InputType;
typedef Matrix<Scalar, 2, 1> ValueType;
/*
* Implementation starts here.
*/
integratorFunctor(const Scalar gain) : _gain(gain) {}
integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
const Scalar _gain;
template <typename T1, typename T2>
void operator() (const T1 &input, T2 *output, const Scalar dt) const
{
T2 &o = *output;
/* Integrator to test the AD. */
o[0] = input[0] + input[1] * dt * _gain;
o[1] = input[1] * _gain;
}
/* Only needed for the test */
template <typename T1, typename T2, typename T3>
void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
{
T2 &o = *output;
/* Integrator to test the AD. */
o[0] = input[0] + input[1] * dt * _gain;
o[1] = input[1] * _gain;
if (jacobian)
{
T3 &j = *jacobian;
j(0, 0) = 1;
j(0, 1) = dt * _gain;
j(1, 0) = 0;
j(1, 1) = _gain;
}
}
};
template<typename Func> void forward_jacobian_cpp11(const Func& f)
{
typedef typename Func::ValueType::Scalar Scalar;
typedef typename Func::ValueType ValueType;
typedef typename Func::InputType InputType;
typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;
InputType x = InputType::Random(InputType::RowsAtCompileTime);
ValueType y, yref;
JacobianType j, jref;
const Scalar dt = internal::random<double>();
jref.setZero();
yref.setZero();
f(x, &yref, &jref, dt);
//std::cerr << "y, yref, jref: " << "\n";
//std::cerr << y.transpose() << "\n\n";
//std::cerr << yref << "\n\n";
//std::cerr << jref << "\n\n";
AutoDiffJacobian<Func> autoj(f);
autoj(x, &y, &j, dt);
//std::cerr << "y j (via autodiff): " << "\n";
//std::cerr << y.transpose() << "\n\n";
//std::cerr << j << "\n\n";
VERIFY_IS_APPROX(y, yref);
VERIFY_IS_APPROX(j, jref);
}
#endif
template<typename Func> void forward_jacobian(const Func& f)
{
typename Func::InputType x = Func::InputType::Random(f.inputs());
@ -128,7 +211,6 @@ template<typename Func> void forward_jacobian(const Func& f)
VERIFY_IS_APPROX(j, jref);
}
// TODO also check actual derivatives!
template <int>
void test_autodiff_scalar()
@ -141,6 +223,7 @@ void test_autodiff_scalar()
VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
}
// TODO also check actual derivatives!
template <int>
void test_autodiff_vector()
@ -164,6 +247,9 @@ void test_autodiff_jacobian()
CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
#if EIGEN_HAS_VARIADIC_TEMPLATES
CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
#endif
}

View File

@ -71,8 +71,45 @@ void test_cuda_nullary() {
}
static void test_cuda_sum_reductions() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
const int num_cols = internal::random<int>(1024, 5*1024);
Tensor<std::complex<float>, 2> in(num_rows, num_cols);
in.setRandom();
Tensor<std::complex<float>, 0> full_redux;
full_redux = in.sum();
std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
out_gpu.device(gpu_device) = in_gpu.sum();
Tensor<std::complex<float>, 0> full_redux_gpu;
gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
gpu_device.synchronize();
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
gpu_device.deallocate(gpu_in_ptr);
gpu_device.deallocate(gpu_out_ptr);
}
void test_cxx11_tensor_complex()
{
CALL_SUBTEST(test_cuda_nullary());
CALL_SUBTEST(test_cuda_sum_reductions());
}

View File

@ -0,0 +1,97 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<typename T>
void test_cuda_complex_cwise_ops() {
const int kNumItems = 2;
std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
std::complex<T>* d_in1;
std::complex<T>* d_in2;
std::complex<T>* d_out;
cudaMalloc((void**)(&d_in1), complex_bytes);
cudaMalloc((void**)(&d_in2), complex_bytes);
cudaMalloc((void**)(&d_out), complex_bytes);
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
d_in1, kNumItems);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
d_in2, kNumItems);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
d_out, kNumItems);
const std::complex<T> a(3.14f, 2.7f);
const std::complex<T> b(-10.6f, 1.4f);
gpu_in1.device(gpu_device) = gpu_in1.constant(a);
gpu_in2.device(gpu_device) = gpu_in2.constant(b);
enum CwiseOp {
Add = 0,
Sub,
Mul,
Div
};
Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
for (int op = Add; op <= Div; op++) {
std::complex<T> expected;
switch (static_cast<CwiseOp>(op)) {
case Add:
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
expected = a + b;
break;
case Sub:
gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
expected = a - b;
break;
case Mul:
gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
expected = a * b;
break;
case Div:
gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
expected = a / b;
break;
}
assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < kNumItems; ++i) {
VERIFY_IS_APPROX(actual(i), expected);
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
}
void test_cxx11_tensor_complex_cwise_ops()
{
CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
}