Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-07-21)

Commit ae1385c7e4: "Pull the latest updates from trunk"
@@ -411,6 +411,10 @@ using std::ptrdiff_t;
 #include "src/Core/functors/StlFunctors.h"
 #include "src/Core/functors/AssignmentFunctors.h"
 
+// Specialized functors to enable the processing of complex numbers
+// on CUDA devices
+#include "src/Core/arch/CUDA/Complex.h"
+
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -87,6 +87,7 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
@@ -97,6 +98,7 @@ template<typename Derived> class ArrayBase
 #     include EIGEN_ARRAYBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -817,73 +817,79 @@ struct mapbase_evaluator : evaluator_base<Derived>
     ColsAtCompileTime = XprType::ColsAtCompileTime,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 
   EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
-    : m_data(const_cast<PointerType>(map.data())),
-      m_xpr(map)
+    : m_data(const_cast<PointerType>(map.data())),
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
   {
     EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+    return m_data[col * colStride() + row * rowStride()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_data[index * m_xpr.innerStride()];
+    return m_data[index * m_innerStride.value()];
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::ploadt<PacketType, LoadMode>(ptr);
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
   }
 
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index row, Index col, const PacketType& x)
   {
-    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
     return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
   }
 
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
-    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
 
 protected:
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
 
   PointerType m_data;
-  const XprType& m_xpr;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
 template<typename PlainObjectType, int MapOptions, typename StrideType>
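Note: the evaluator now caches the two strides in internal::variable_if_dynamic members instead of keeping a reference to the whole Map expression. A minimal sketch of that idea (simplified, not Eigen's actual implementation; the name variable_if_dynamic_sketch and the use of -1 for Dynamic are illustrative assumptions):

#include <cstddef>

// When the stride is a compile-time constant the wrapper stores nothing and
// value() folds to that constant; only a Dynamic stride occupies storage.
template <typename T, int Value>
struct variable_if_dynamic_sketch {
  explicit variable_if_dynamic_sketch(T) {}         // runtime value ignored
  static constexpr T value() { return T(Value); }   // compile-time constant
};

template <typename T>
struct variable_if_dynamic_sketch<T, -1> {          // -1 stands in for Eigen::Dynamic
  explicit variable_if_dynamic_sketch(T v) : m_value(v) {}
  T value() const { return m_value; }               // genuine runtime storage
  T m_value;
};

int main() {
  variable_if_dynamic_sketch<std::ptrdiff_t, 1>  fixed(1);  // stride known statically
  variable_if_dynamic_sketch<std::ptrdiff_t, -1> dyn(64);   // stride known at runtime
  return static_cast<int>(fixed.value() + dyn.value());     // 1 + 64
}

For a Map whose strides are known at compile time this makes coeff() plain pointer arithmetic, which is the point of replacing m_xpr.innerStride() with m_innerStride.value().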
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
   * and yields faster code than the random access version.
   *
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -389,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
   * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
@@ -558,12 +558,15 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC void reverseInPlace();
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
 #   include "../plugins/BlockMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     // disable the use of evalTo for dense objects with a nice compilation error
     template<typename Dest>
@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize     = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
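Note: the #else branch above enforces alignment by hand. A self-contained sketch of the same round-up trick (a hypothetical align_up helper with a fixed 16-byte alignment; Eigen uses EIGEN_MAX_ALIGN_BYTES):

#include <cassert>
#include <cstdint>

constexpr std::size_t kAlign = 16;  // assumed alignment, for illustration

inline float* align_up(float* raw) {
  std::uintptr_t p = reinterpret_cast<std::uintptr_t>(raw);
  // Clearing the low bits with & ~(kAlign-1) rounds *down*, so one full unit
  // is added first: the result is aligned, strictly above raw, and at most
  // kAlign bytes past it — which the padded buffer below accounts for.
  return reinterpret_cast<float*>((p & ~(std::uintptr_t(kAlign) - 1)) + kAlign);
}

int main() {
  float buffer[8 + kAlign / sizeof(float)];  // over-allocated, like m_data above
  float* aligned = align_up(buffer);
  assert(reinterpret_cast<std::uintptr_t>(aligned) % kAlign == 0);
  aligned[0] = 1.0f;  // safe: at least 8 floats remain past 'aligned'
  return 0;
}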
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
     ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
@@ -50,7 +50,7 @@ public:
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
 
-  explicit Inverse(const XprType &xpr)
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}
 
@@ -97,6 +97,19 @@ struct real_default_impl<Scalar,true>
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct real_retval
 {
@@ -132,6 +145,19 @@ struct imag_default_impl<Scalar,true>
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
+#ifdef __CUDA_ARCH__
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
+
 template<typename Scalar>
 struct imag_retval
 {
@@ -1049,12 +1075,12 @@ double abs(const double &x) { return ::fabs(x); }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const std::complex<float>& x) {
-  return ::hypotf(real(x), imag(x));
+  return ::hypotf(x.real(), x.imag());
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double abs(const std::complex<double>& x) {
-  return ::hypot(real(x), imag(x));
+  return ::hypot(x.real(), x.imag());
 }
 #endif
 
@@ -1312,11 +1338,12 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, true, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
     return numext::abs2(x) <= numext::abs2(y) * prec * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
@@ -29,8 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  const T x = pmax(minus_9, pmin(plus_9, a_x));
 
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  // step such that if a_x is nan, x will be either 9 or -9,
+  // and tanh will return 1 or -1 instead of nan.
+  // This is supposed to be fixed in gcc6.3,
+  // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
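Note: the clamp to [-9, 9] is precisely what the buggy gcc max/min reordering turns a NaN input into. The interval itself is chosen because single precision cannot distinguish tanh(x) from +/-1 beyond it (a small host-side check, not part of the commit):

#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
  // 1 - tanh(9) is ~3.0e-8, already well below FLT_EPSILON (~1.2e-7), so in
  // float the function is saturated outside [-9, 9] and the rational
  // approximation only needs to be accurate inside that interval.
  std::printf("1 - tanh(9) = %g\n", 1.0 - std::tanh(9.0));
  std::printf("FLT_EPSILON = %g\n", (double)FLT_EPSILON);
  return 0;
}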
@@ -98,7 +98,7 @@ template<typename Derived> class MatrixBase
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
     EIGEN_DEVICE_FUNC
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
 
     typedef typename Base::PlainObject PlainObject;
 
@@ -121,6 +121,7 @@ template<typename Derived> class MatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -129,6 +130,7 @@ template<typename Derived> class MatrixBase
 #     include EIGEN_MATRIXBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
@@ -328,15 +330,11 @@ template<typename Derived> class MatrixBase
 
 /////////// LU module ///////////
 
-    EIGEN_DEVICE_FUNC
     inline const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> partialPivLu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> lu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
@@ -265,7 +265,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
   for (Index j=0; j<cols; ++j)
-    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
 }
 
 // Row major result
@@ -278,7 +278,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
   for (Index i=0; i<rows; ++i)
-    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
 }
 
 template<typename Lhs, typename Rhs>
@@ -437,6 +437,18 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes= " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes= " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment= " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment= " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs= " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs= " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner= " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor= " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment= " << Alignment << "\n";
+    std::cerr << "Flags= " << Flags << "\n";
+#endif
   }
 
   // Everything below here is taken from CoeffBasedProduct.h
@@ -503,8 +515,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
     RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
 
-    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
-              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
               : 0,
 
     /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
@@ -590,7 +602,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
   }
 };
 
@@ -600,7 +612,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
-    res = pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+    res = pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
   }
 };
 
@@ -609,7 +621,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
   }
 };
 
@@ -618,7 +630,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
-    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
 };
 
@@ -627,7 +639,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -636,7 +648,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
 };
 
@@ -645,7 +657,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
   }
@@ -656,7 +668,7 @@ struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
-    res = pset1<Packet>(0);
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
       res = pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }
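Note: the last two pairs of hunks replace pset1<Packet>(0) by a zero constructed in the packet's own scalar type. A standalone analogue of why that matters (stand-in types and hypothetical set1/zero_packet helpers, not Eigen's code):

#include <cstdio>

struct half  { float v; };      // stand-in for Eigen::half (no ctor from int)
struct half2 { half lo, hi; };  // stand-in for the CUDA half2 packet

template <typename Packet> struct unpacket_traits;
template <> struct unpacket_traits<half2> { typedef half type; };

half2 set1(half x) { half2 p = {x, x}; return p; }  // plays the role of pset1

template <typename Packet>
Packet zero_packet() {
  typedef typename unpacket_traits<Packet>::type Scalar;
  // set1(0) would not compile here: there is no conversion from an int
  // literal to 'half'. Spelling out the scalar type fixes the overload.
  return set1(Scalar{0.0f});
}

int main() {
  half2 z = zero_packet<half2>();
  std::printf("%g %g\n", z.lo.v, z.hi.v);  // 0 0
  return 0;
}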
New file (103 lines): Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
+
+// Sum
+template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Difference
+template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Product
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasMul
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+
+template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
+
+
+// Quotient
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+
+template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_CUDA_H
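Note: the functors above spell complex arithmetic out componentwise so every operation is a plain device-compatible float/double op. A quick host-side sanity check that the manual formulas agree with std::complex (hypothetical test code, not part of the commit):

#include <cassert>
#include <cmath>
#include <complex>

int main() {
  std::complex<float> a(1.5f, -2.0f), b(0.25f, 4.0f);
  const float ar = a.real(), ai = a.imag(), br = b.real(), bi = b.imag();

  // Product, as in scalar_product_op: (ar*br - ai*bi) + i*(ar*bi + ai*br)
  std::complex<float> prod(ar*br - ai*bi, ar*bi + ai*br);
  assert(std::abs(prod - a*b) < 1e-5f);

  // Quotient, as in scalar_quotient_op: multiply by conj(b), scale by 1/|b|^2
  const float norm = 1.0f / (br*br + bi*bi);
  std::complex<float> quot((ar*br + ai*bi) * norm, (ai*br - ar*bi) * norm);
  assert(std::abs(quot - a/b) < 1e-5f);
  return 0;
}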
@@ -41,15 +41,15 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
   return *reinterpret_cast<const half2*>(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[1]);
 }
 
@@ -57,17 +57,17 @@ template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
   return __halves2half2(from[0], from[0]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __ldg((const half2*)from);
 #else
@@ -76,7 +76,7 @@ template<>
 }
 
 template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
 #if __CUDA_ARCH__ >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
@@ -84,27 +84,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Ei
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
   return __halves2half2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
   to[stride*0] = __low2half(from);
   to[stride*1] = __high2half(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
   return __low2half(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
   result.x = a.x & 0x7FFF7FFF;
   return result;
 }
 
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+__device__ EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
   __half a1 = __low2half(kernel.packet[0]);
   __half a2 = __high2half(kernel.packet[0]);
@@ -114,7 +114,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
   kernel.packet[1] = __halves2half2(a2, b2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
 #if __CUDA_ARCH__ >= 530
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
@@ -123,7 +123,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hadd2(a, b);
 #else
@@ -137,7 +137,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hsub2(a, b);
 #else
@@ -151,7 +151,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hneg2(a);
 #else
@@ -161,9 +161,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if __CUDA_ARCH__ >= 530
   return __hmul2(a, b);
 #else
@@ -177,7 +177,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
 #if __CUDA_ARCH__ >= 530
   return __hfma2(a, b, c);
 #else
@@ -193,7 +193,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2&
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -203,7 +203,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -213,7 +213,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -223,7 +223,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
   return __halves2half2(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hadd(__low2half(a), __high2half(a));
 #else
@@ -233,7 +233,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -245,7 +245,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   __half first = __low2half(a);
   __half second = __high2half(a);
@@ -257,7 +257,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
 #if __CUDA_ARCH__ >= 530
   return __hmul(__low2half(a), __high2half(a));
 #else
@@ -267,7 +267,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(c
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = log1pf(a1);
@@ -277,29 +277,29 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2
 
 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
   return h2log(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
   return h2exp(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
   return h2sqrt(a);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+template<> __device__ EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
   return h2rsqrt(a);
 }
 
 #else
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = logf(a1);
@@ -307,7 +307,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = expf(a1);
@@ -315,7 +315,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
@@ -323,7 +323,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2&
   return __floats2half2_rn(r1, r2);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float r1 = rsqrtf(a1);
@@ -434,15 +434,16 @@ public:
   template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
     // let gcc allocate the register in which to store the result of the pmul
     // (in the case where there is no FMA) gcc fails to figure out how to avoid
     // spilling register.
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
-    c = pmadd(a,b,c);
+    c = cj.pmadd(a,b,c);
 #else
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
 #endif
   }
 
@@ -457,9 +458,6 @@ public:
     r = pmadd(c,alpha,r);
   }
 
 protected:
-//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
 };
 
 template<typename RealScalar, bool _ConjLhs>
@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
 
@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
@@ -14,12 +14,13 @@
   // 4512 - assignment operator could not be generated
   // 4522 - 'class' : multiple assignment operators specified
   // 4700 - uninitialized local variable 'xyz' used
+  // 4714 - function marked as __forceinline not inlined
   // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
   // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -67,6 +68,8 @@
     #pragma diag_suppress 2669
     #pragma diag_suppress 2670
     #pragma diag_suppress 2671
+    #pragma diag_suppress 2735
+    #pragma diag_suppress 2737
   #endif
 
 #endif // not EIGEN_WARNINGS_DISABLED
@@ -13,7 +13,7 @@
 
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 93
+#define EIGEN_MINOR_VERSION 94
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -954,8 +954,8 @@ namespace Eigen {
 #  define EIGEN_CATCH(X) catch (X)
 #else
 #  ifdef __CUDA_ARCH__
-#    define EIGEN_THROW_X(X) asm("trap;") return {}
-#    define EIGEN_THROW asm("trap;"); return {}
+#    define EIGEN_THROW_X(X) asm("trap;")
+#    define EIGEN_THROW asm("trap;")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
@@ -275,6 +275,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *
     destruct_elements_of_array(ptr, i);
     EIGEN_THROW;
   }
+  return NULL;
 }
 
 /*****************************************************************************
@@ -305,6 +306,7 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
     aligned_free(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
@@ -320,6 +322,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
   }
+  return result;
 }
 
 /** \internal Deletes objects constructed with aligned_new
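Note: all three hunks add an explicit return after EIGEN_THROW. That pairs with the Macros.h change above, where the __CUDA_ARCH__ definition of EIGEN_THROW dropped its trailing "return {}". A minimal host-side illustration (hypothetical THROW_LIKE stand-in, not Eigen code):

#include <cstddef>
#include <cstdlib>

#define THROW_LIKE() std::abort()  // stand-in for EIGEN_THROW

// Once the error macro is a bare statement, a compiler that cannot prove it
// never returns sees a fall-through path and warns "control reaches end of
// non-void function" — which is what the added returns silence.
int* checked_alloc(std::size_t n) {
  int* result = static_cast<int*>(std::malloc(n * sizeof(int)));
  if (!result) {
    THROW_LIKE();
  }
  return result;  // counterpart of the 'return result;' lines added above
}

int main() {
  std::free(checked_alloc(4));
  return 0;
}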
@@ -671,6 +671,14 @@ struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
 
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
 
 template<bool Vectorized>
 struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
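Note: the operation count in the new complex specialization matches evaluating the quotient by the conjugate trick,

\[
\frac{a}{b} \;=\; \frac{(a_r b_r + a_i b_i) \;+\; i\,(a_i b_r - a_r b_i)}{b_r^2 + b_i^2},
\]

which costs two real divisions (hence 2*scalar_div_cost<T>::value), six multiplications (four in the numerators plus two for |b|^2), and three additions (one per numerator, one in the denominator) — exactly the 6*MulCost + 3*AddCost terms in the enum.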
@@ -158,7 +158,8 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The resulting axis is normalized.
+  *
+  * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
   *
   * This function implicitly normalizes the quaternion \a q.
   */
@@ -167,12 +168,16 @@ template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
   using std::atan2;
+  using std::abs;
   Scalar n = q.vec().norm();
-  if (n > Scalar(0))
+  if(n<NumTraits<Scalar>::epsilon())
+    n = q.vec().stableNorm();
+
+  if (n != Scalar(0))
   {
-    m_angle = Scalar(2)*atan2(n, q.w());
+    m_angle = Scalar(2)*atan2(n, abs(q.w()));
+    if(q.w() < 0)
+      n = -n;
     m_axis  = q.vec() / n;
   }
   else
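Note: writing a unit quaternion as q = cos(theta/2) + sin(theta/2)(u_x i + u_y j + u_z k) with unit axis u, the assignment above recovers

\[
\theta = 2\,\operatorname{atan2}\!\big(\lVert \mathbf{v}\rVert,\ \lvert w\rvert\big) \in [0,\pi],
\qquad
\mathbf{u} = \mathbf{v}/\lVert \mathbf{v}\rVert,
\]

where v = q.vec() and w = q.w(). Using |w| is what pins the angle into [0,pi]; since q and -q represent the same rotation, flipping the sign of n when w < 0 compensates without changing the represented rotation. The stableNorm() fallback guards against underflow when the vector part is tiny.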
@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,i), coeff(k,i));
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
       {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
         res[1] = -atan2(s2, coeff(i,i));
       }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
       res[0] = atan2(coeff(j,k), coeff(k,k));
       Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
       if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         res[1] = atan2(-coeff(i,k), -c2);
       }
       else
@@ -402,6 +402,18 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, Homog
   }
 };
 
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, TriangularShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
+
 template<typename Lhs,typename Rhs>
 struct homogeneous_left_product_refactoring_helper
 {
@@ -464,7 +464,7 @@ public:
   operator * (const DiagonalBase<DiagonalDerived> &b) const
   {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
 
@@ -578,7 +578,7 @@ public:
     return res;
   }
 
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linearExt() *= s; return *this; }
 
   template<typename Derived>
   inline Transform& operator=(const RotationBase<Derived,Dim>& r);
@@ -853,7 +853,7 @@ Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &oth
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }
 
@@ -130,8 +130,10 @@ public:
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template<typename Derived>
+  inline typename internal::enable_if<Derived::IsVectorAtCompileTime,VectorType>::type
+  operator* (const MatrixBase<Derived>& vec) const
+  { return m_coeffs + vec.derived(); }
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
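Note: both routines apply the Householder reflection

\[
P = I - \tau\, v v^{*}, \qquad v = \begin{pmatrix} 1 \\ \text{essential} \end{pmatrix},
\]

as a rank-one update: M <- M - tau * v (v* M) on the left, M <- M - tau * (M v) v* on the right. For tau = 0 the reflection P is exactly the identity, so the new else if(tau!=Scalar(0)) guard skips the update entirely without changing the result.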
@@ -156,7 +156,7 @@ template<typename _MatrixType> class FullPivLU
   *
   * \sa permutationQ()
   */
-  inline const PermutationPType& permutationP() const
+  EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const
   {
     eigen_assert(m_isInitialized && "LU is not initialized.");
     return m_p;
@@ -406,8 +406,8 @@ template<typename _MatrixType> class FullPivLU
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
@@ -665,10 +665,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   // only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  // FIXME What about considerering any denormal numbers as zero, using:
-  // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
   RealScalar scale = matrix.cwiseAbs().maxCoeff();
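Note: the new threshold swaps "twice the smallest denormal" for the smallest *normalized* value, so denormals are treated as zero too; their slow, reduced-precision arithmetic is what could keep the Jacobi sweeps from converging (bug 286). For float the two bounds differ by about seven orders of magnitude (a small host-side comparison, not part of the commit):

#include <cstdio>
#include <limits>

int main() {
  // Old bound: ~2.8e-45 for float; new bound: ~1.18e-38.
  std::printf("2*denorm_min = %g\n", 2.0f * std::numeric_limits<float>::denorm_min());
  std::printf("min          = %g\n", (std::numeric_limits<float>::min)());
  return 0;
}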
@@ -141,6 +141,15 @@ template<typename Derived> class SparseMatrixBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,OP)           /** <p>This method does not change the sparsity of \c *this: the OP is applied to explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL      /** <p> \warning This method returns a read-only expression for any sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
 #   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
@@ -149,8 +158,10 @@ template<typename Derived> class SparseMatrixBase
 #   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
 #     include EIGEN_SPARSEMATRIXBASE_PLUGIN
 #   endif
-#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
 
     /** \returns the number of rows. \sa cols() */
     inline Index rows() const { return derived().rows(); }
[Diff for one file suppressed because it is too large.]
@ -36,8 +36,10 @@ typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> Negati
|
||||
|
||||
#endif // not EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
/** \returns an expression of the opposite of \c *this
|
||||
*/
|
||||
/// \returns an expression of the opposite of \c *this
|
||||
///
|
||||
EIGEN_DOC_UNARY_ADDONS(operator-,opposite)
|
||||
///
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const NegativeReturnType
|
||||
operator-() const { return NegativeReturnType(derived()); }
|
||||
@ -45,13 +47,15 @@ operator-() const { return NegativeReturnType(derived()); }
|
||||
|
||||
template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
|
||||
|
||||
/** \returns an expression of *this with the \a Scalar type casted to
|
||||
* \a NewScalar.
|
||||
*
|
||||
* The template parameter \a NewScalar is the type we are casting the scalars to.
|
||||
*
|
||||
* \sa class CwiseUnaryOp
|
||||
*/
|
||||
/// \returns an expression of \c *this with the \a Scalar type casted to
|
||||
/// \a NewScalar.
|
||||
///
|
||||
/// The template parameter \a NewScalar is the type we are casting the scalars to.
|
||||
///
|
||||
EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
|
||||
///
|
||||
/// \sa class CwiseUnaryOp
|
||||
///
|
||||
template<typename NewType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename CastXpr<NewType>::Type
|
||||
@ -60,9 +64,11 @@ cast() const
|
||||
return typename CastXpr<NewType>::Type(derived());
|
||||
}
|
||||
|
||||
/** \returns an expression of the complex conjugate of \c *this.
|
||||
*
|
||||
* \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint() */
|
||||
/// \returns an expression of the complex conjugate of \c *this.
|
||||
///
|
||||
EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
|
||||
///
|
||||
/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline ConjugateReturnType
|
||||
conjugate() const
|
||||
@ -70,39 +76,45 @@ conjugate() const
|
||||
return ConjugateReturnType(derived());
|
||||
}
|
||||
|
||||
/** \returns a read-only expression of the real part of \c *this.
  *
  * \sa imag() */
/// \returns a read-only expression of the real part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(real,real part function)
///
/// \sa imag()
EIGEN_DEVICE_FUNC
inline RealReturnType
real() const { return RealReturnType(derived()); }

/** \returns a read-only expression of the imaginary part of \c *this.
  *
  * \sa real() */
/// \returns a read-only expression of the imaginary part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
///
/// \sa real()
EIGEN_DEVICE_FUNC
inline const ImagReturnType
imag() const { return ImagReturnType(derived()); }
/** \brief Apply a unary operator coefficient-wise
  * \param[in]  func  Functor implementing the unary operator
  * \tparam  CustomUnaryOp Type of \a func
  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
  *
  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
  *
  * Example:
  * \include class_CwiseUnaryOp_ptrfun.cpp
  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
  *
  * Genuine functors allow for more possibilities, for instance they may contain state.
  *
  * Example:
  * \include class_CwiseUnaryOp.cpp
  * Output: \verbinclude class_CwiseUnaryOp.out
  *
  * \sa class CwiseUnaryOp, class CwiseBinaryOp
  */
/// \brief Apply a unary operator coefficient-wise
/// \param[in]  func  Functor implementing the unary operator
/// \tparam  CustomUnaryOp Type of \a func
/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
///
/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
///
/// Example:
/// \include class_CwiseUnaryOp_ptrfun.cpp
/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
///
/// Genuine functors allow for more possibilities, for instance they may contain state.
///
/// Example:
/// \include class_CwiseUnaryOp.cpp
/// Output: \verbinclude class_CwiseUnaryOp.out
///
EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function)
///
/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
///
template<typename CustomUnaryOp>
EIGEN_DEVICE_FUNC
inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
@@ -111,17 +123,19 @@ unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
}
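Editor's illustration (a hedged sketch, not part of this patch): unaryExpr with a stateful functor, as the documentation above describes; CwiseClamp is a name invented for this example.

#include <Eigen/Core>
#include <iostream>

// A functor carrying state (the clamping bounds).
struct CwiseClamp {
  double lo, hi;
  CwiseClamp(double l, double h) : lo(l), hi(h) {}
  double operator()(double x) const { return x < lo ? lo : (x > hi ? hi : x); }
};

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3,3);   // entries in [-1,1]
  std::cout << m.unaryExpr(CwiseClamp(-0.5, 0.5)) << std::endl;
}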
/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
  *
  * The template parameter \a CustomUnaryOp is the type of the functor
  * of the custom unary operator.
  *
  * Example:
  * \include class_CwiseUnaryOp.cpp
  * Output: \verbinclude class_CwiseUnaryOp.out
  *
  * \sa class CwiseUnaryOp, class CwiseBinaryOp
  */
/// \returns an expression of a custom coefficient-wise unary operator \a func of *this
///
/// The template parameter \a CustomUnaryOp is the type of the functor
/// of the custom unary operator.
///
/// Example:
/// \include class_CwiseUnaryOp.cpp
/// Output: \verbinclude class_CwiseUnaryOp.out
///
EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function)
///
/// \sa unaryExpr, binaryExpr, class CwiseUnaryOp
///
template<typename CustomViewOp>
EIGEN_DEVICE_FUNC
inline const CwiseUnaryView<CustomViewOp, const Derived>
@@ -130,16 +144,20 @@ unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
}
/** \returns a non-const expression of the real part of \c *this.
  *
  * \sa imag() */
/// \returns a non-const expression of the real part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(real,real part function)
///
/// \sa imag()
EIGEN_DEVICE_FUNC
inline NonConstRealReturnType
real() { return NonConstRealReturnType(derived()); }

/** \returns a non-const expression of the imaginary part of \c *this.
  *
  * \sa real() */
/// \returns a non-const expression of the imaginary part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function)
///
/// \sa real()
EIGEN_DEVICE_FUNC
inline NonConstImagReturnType
imag() { return NonConstImagReturnType(derived()); }
@@ -11,63 +11,75 @@

// This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
// This includes MatrixBase and SparseMatrixBase.


typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
/** \returns an expression of the coefficient-wise absolute value of \c *this
  *
  * Example: \include MatrixBase_cwiseAbs.cpp
  * Output: \verbinclude MatrixBase_cwiseAbs.out
  *
  * \sa cwiseAbs2()
  */
/// \returns an expression of the coefficient-wise absolute value of \c *this
///
/// Example: \include MatrixBase_cwiseAbs.cpp
/// Output: \verbinclude MatrixBase_cwiseAbs.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value)
///
/// \sa cwiseAbs2()
///
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const CwiseAbsReturnType
cwiseAbs() const { return CwiseAbsReturnType(derived()); }

/** \returns an expression of the coefficient-wise squared absolute value of \c *this
  *
  * Example: \include MatrixBase_cwiseAbs2.cpp
  * Output: \verbinclude MatrixBase_cwiseAbs2.out
  *
  * \sa cwiseAbs()
  */
/// \returns an expression of the coefficient-wise squared absolute value of \c *this
///
/// Example: \include MatrixBase_cwiseAbs2.cpp
/// Output: \verbinclude MatrixBase_cwiseAbs2.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value)
///
/// \sa cwiseAbs()
///
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }

/** \returns an expression of the coefficient-wise square root of *this.
  *
  * Example: \include MatrixBase_cwiseSqrt.cpp
  * Output: \verbinclude MatrixBase_cwiseSqrt.out
  *
  * \sa cwisePow(), cwiseSquare()
  */
/// \returns an expression of the coefficient-wise square root of *this.
///
/// Example: \include MatrixBase_cwiseSqrt.cpp
/// Output: \verbinclude MatrixBase_cwiseSqrt.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root)
///
/// \sa cwisePow(), cwiseSquare()
///
EIGEN_DEVICE_FUNC
inline const CwiseSqrtReturnType
cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }

/** \returns an expression of the coefficient-wise signum of *this.
  *
  * Example: \include MatrixBase_cwiseSign.cpp
  * Output: \verbinclude MatrixBase_cwiseSign.out
  *
  */
/// \returns an expression of the coefficient-wise signum of *this.
///
/// Example: \include MatrixBase_cwiseSign.cpp
/// Output: \verbinclude MatrixBase_cwiseSign.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function)
///
EIGEN_DEVICE_FUNC
inline const CwiseSignReturnType
cwiseSign() const { return CwiseSignReturnType(derived()); }


/** \returns an expression of the coefficient-wise inverse of *this.
  *
  * Example: \include MatrixBase_cwiseInverse.cpp
  * Output: \verbinclude MatrixBase_cwiseInverse.out
  *
  * \sa cwiseProduct()
  */
/// \returns an expression of the coefficient-wise inverse of *this.
///
/// Example: \include MatrixBase_cwiseInverse.cpp
/// Output: \verbinclude MatrixBase_cwiseInverse.out
///
EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse)
///
/// \sa cwiseProduct()
///
EIGEN_DEVICE_FUNC
inline const CwiseInverseReturnType
cwiseInverse() const { return CwiseInverseReturnType(derived()); }
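Editor's illustration (a hedged sketch, not part of this patch): the cwise* members above compose lazily into a single coefficient-wise pass.

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::Matrix2d m;
  m << -1.0,  4.0,
        9.0, -16.0;
  std::cout << m.cwiseAbs().cwiseSqrt() << "\n\n";  // 1 2 / 3 4
  std::cout << m.cwiseSign()            << "\n\n";  // -1 1 / 1 -1
  std::cout << m.cwiseInverse()         << std::endl;
}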
@@ -1,10 +1,13 @@

find_package(BLAZE)
find_package(Boost)
find_package(Boost COMPONENTS system)
if (BLAZE_FOUND AND Boost_FOUND)
  include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
  btl_add_bench(btl_blaze main.cpp)
  # Note: The newest blaze version requires C++14.
  # Ideally, we should set this depending on the version of Blaze we found.
  set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
  if(BUILD_btl_blaze)
    target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
    target_link_libraries(btl_blaze ${Boost_LIBRARIES})
  endif()
endif ()
@@ -53,6 +53,33 @@ showing that the program works as expected:

This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.


\section NullaryExpr_Indexing Example 2: indexing rows and columns

The goal here is to mimic MATLAB's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked, respectively, like this:

\snippet nullary_indexing.out main1

To this end, let us first write a nullary functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):

\snippet nullary_indexing.cpp functor

Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:

\snippet nullary_indexing.cpp function

Finally, here is an example of how this function can be used:

\snippet nullary_indexing.cpp main1

This straightforward implementation is already quite powerful, as the row or column index arrays can themselves be expressions performing offsetting, modulo, striding, reversal, etc.:

\snippet nullary_indexing.cpp main2

and the output is:

\snippet nullary_indexing.out main2

*/

}
@@ -1612,7 +1612,10 @@ EXPAND_AS_DEFINED = EIGEN_MAKE_TYPEDEFS \
                         EIGEN_EMPTY \
                         EIGEN_EULER_ANGLES_TYPEDEFS \
                         EIGEN_EULER_ANGLES_SINGLE_TYPEDEF \
                         EIGEN_EULER_SYSTEM_TYPEDEF
                         EIGEN_EULER_SYSTEM_TYPEDEF \
                         EIGEN_DOC_UNARY_ADDONS \
                         EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL \
                         EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF

# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
# doxygen's preprocessor will remove all references to function-like macros
@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
  )
  add_dependencies(all_examples ${example})
endforeach(example_src)

check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
if(EIGEN_COMPILER_SUPPORT_CPP11)
  ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
endif()
doc/examples/make_circulant2.cpp (new file, 52 lines)
@@ -0,0 +1,52 @@
#include <Eigen/Core>
#include <iostream>

using namespace Eigen;

// [circulant_func]
template<class ArgType>
class circulant_functor {
  const ArgType &m_vec;
public:
  circulant_functor(const ArgType& arg) : m_vec(arg) {}

  const typename ArgType::Scalar& operator() (Index row, Index col) const {
    Index index = row - col;
    if (index < 0) index += m_vec.size();
    return m_vec(index);
  }
};
// [circulant_func]

// [square]
template<class ArgType>
struct circulant_helper {
  typedef Matrix<typename ArgType::Scalar,
                 ArgType::SizeAtCompileTime,
                 ArgType::SizeAtCompileTime,
                 ColMajor,
                 ArgType::MaxSizeAtCompileTime,
                 ArgType::MaxSizeAtCompileTime> MatrixType;
};
// [square]

// [makeCirculant]
template <class ArgType>
CwiseNullaryOp<circulant_functor<ArgType>, typename circulant_helper<ArgType>::MatrixType>
makeCirculant(const Eigen::MatrixBase<ArgType>& arg)
{
  typedef typename circulant_helper<ArgType>::MatrixType MatrixType;
  return MatrixType::NullaryExpr(arg.size(), arg.size(), circulant_functor<ArgType>(arg.derived()));
}
// [makeCirculant]

// [main]
int main()
{
  Eigen::VectorXd vec(4);
  vec << 1, 2, 4, 8;
  Eigen::MatrixXd mat;
  mat = makeCirculant(vec);
  std::cout << mat << std::endl;
}
// [main]
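Editor's addition (a hedged check, not part of this patch): the circulant structure the file above produces, written out directly; entry (i,j) is vec((i - j) mod n), so each column is the previous one rotated down by one.

#include <Eigen/Core>
#include <cassert>

int main()
{
  Eigen::VectorXd vec(4);
  vec << 1, 2, 4, 8;
  Eigen::MatrixXd mat(4,4);
  for (int j = 0; j < 4; ++j)
    for (int i = 0; i < 4; ++i)
      mat(i,j) = vec(((i - j) % 4 + 4) % 4);   // wrap negative indices
  // First row is 1 8 4 2, matching what makeCirculant(vec) prints.
  assert(mat(0,0) == 1 && mat(0,1) == 8 && mat(0,2) == 4 && mat(0,3) == 2);
  return 0;
}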
doc/examples/nullary_indexing.cpp (new file, 66 lines)
@@ -0,0 +1,66 @@
#include <Eigen/Core>
#include <iostream>

using namespace Eigen;

// [functor]
template<class ArgType, class RowIndexType, class ColIndexType>
class indexing_functor {
  const ArgType &m_arg;
  const RowIndexType &m_rowIndices;
  const ColIndexType &m_colIndices;
public:
  typedef Matrix<typename ArgType::Scalar,
                 RowIndexType::SizeAtCompileTime,
                 ColIndexType::SizeAtCompileTime,
                 ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
                 RowIndexType::MaxSizeAtCompileTime,
                 ColIndexType::MaxSizeAtCompileTime> MatrixType;

  indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
    : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
  {}

  const typename ArgType::Scalar& operator() (Index row, Index col) const {
    return m_arg(m_rowIndices[row], m_colIndices[col]);
  }
};
// [functor]

// [function]
template <class ArgType, class RowIndexType, class ColIndexType>
CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
{
  typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
  typedef typename Func::MatrixType MatrixType;
  return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
}
// [function]


int main()
{
  std::cout << "[main1]\n";
  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
  Array3i ri(1,2,1);
  ArrayXi ci(6); ci << 3,2,1,0,0,2;
  Eigen::MatrixXi B = indexing(A, ri, ci);
  std::cout << "A =" << std::endl;
  std::cout << A << std::endl << std::endl;
  std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
  std::cout << B << std::endl;
  std::cout << "[main1]\n";

  std::cout << "[main2]\n";
  B = indexing(A, ri+1, ci);
  std::cout << "A(ri+1,ci) =" << std::endl;
  std::cout << B << std::endl << std::endl;
#if __cplusplus >= 201103L
  B = indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
  std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
  std::cout << B << std::endl << std::endl;
#endif
  std::cout << "[main2]\n";
}
@@ -355,7 +355,7 @@ if(CUDA_FOUND)

  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
  endif()
  if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
@@ -417,6 +417,7 @@ void cholesky_faillure_cases()
    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
    VERIFY(ldlt.info()==NumericalIssue);
  }
#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
  {
    mat.resize(3,3);
    mat << -1, -3, 3,
@@ -426,6 +427,7 @@ void cholesky_faillure_cases()
    VERIFY(ldlt.info()==NumericalIssue);
    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
  }
#endif
  {
    mat.resize(3,3);
    mat << 1, 2, 3,
@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
    VERIFY( !m.allFinite() );
    VERIFY( m.hasNaN() );
  }
  m(4) /= T(0.0);
  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
  m(4) /= hidden_zero;
  if(dryrun)
  {
    std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
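Editor's note on the hunk above (hedged): squaring the smallest normal number underflows to +0.0 at run time (assuming IEEE-754 semantics without flush-to-zero), so the division really executes instead of letting the compiler fold a literal division by zero.

#include <cassert>
#include <limits>

int main()
{
  double hidden_zero = (std::numeric_limits<double>::min)()
                     * (std::numeric_limits<double>::min)();   // underflows to +0.0
  assert(hidden_zero == 0.0);
  double inf = 1.0 / hidden_zero;    // genuine run-time division by zero
  assert(inf > (std::numeric_limits<double>::max)());
  return 0;
}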
@@ -111,6 +111,8 @@ template<typename Scalar,int Size> void homogeneous(void)

  VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
  VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );

  VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
}

void test_geo_homogeneous()
@ -334,6 +334,9 @@ template<typename Scalar, int Mode, int Options> void transformations()
|
||||
t0.scale(v0);
|
||||
t1 *= AlignedScaling3(v0);
|
||||
VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
|
||||
t1 = AlignedScaling3(v0) * (Translation3(v0) * Transform3(q1));
|
||||
t1 = t1 * v0.asDiagonal();
|
||||
VERIFY_IS_APPROX(t0.matrix(), t1.matrix());
|
||||
// transformation * translation
|
||||
t0.translate(v0);
|
||||
t1 = t1 * Translation3(v0);
|
||||
@ -482,6 +485,79 @@ template<typename Scalar, int Mode, int Options> void transformations()
|
||||
Rotation2D<Scalar> r2(r1); // copy ctor
|
||||
VERIFY_IS_APPROX(r2.angle(),s0);
|
||||
}
|
||||
|
||||
{
|
||||
Transform3 t32(Matrix4::Random()), t33, t34;
|
||||
t34 = t33 = t32;
|
||||
t32.scale(v0);
|
||||
t33*=AlignedScaling3(v0);
|
||||
VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
|
||||
t33 = t34 * AlignedScaling3(v0);
|
||||
VERIFY_IS_APPROX(t32.matrix(), t33.matrix());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
|
||||
void transform_associativity_left(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
|
||||
{
|
||||
VERIFY_IS_APPROX( q*(a1*v), (q*a1)*v );
|
||||
VERIFY_IS_APPROX( q*(a2*v), (q*a2)*v );
|
||||
VERIFY_IS_APPROX( q*(p*h).hnormalized(), ((q*p)*h).hnormalized() );
|
||||
}
|
||||
|
||||
template<typename A1, typename A2, typename P, typename Q, typename V, typename H>
|
||||
void transform_associativity2(const A1& a1, const A2& a2, const P& p, const Q& q, const V& v, const H& h)
|
||||
{
|
||||
VERIFY_IS_APPROX( a1*(q*v), (a1*q)*v );
|
||||
VERIFY_IS_APPROX( a2*(q*v), (a2*q)*v );
|
||||
VERIFY_IS_APPROX( p *(q*v).homogeneous(), (p *q)*v.homogeneous() );
|
||||
|
||||
transform_associativity_left(a1, a2,p, q, v, h);
|
||||
}
|
||||
|
||||
template<typename Scalar, int Dim, int Options,typename RotationType>
|
||||
void transform_associativity(const RotationType& R)
|
||||
{
|
||||
typedef Matrix<Scalar,Dim,1> VectorType;
|
||||
typedef Matrix<Scalar,Dim+1,1> HVectorType;
|
||||
typedef Matrix<Scalar,Dim,Dim> LinearType;
|
||||
typedef Matrix<Scalar,Dim+1,Dim+1> MatrixType;
|
||||
typedef Transform<Scalar,Dim,AffineCompact,Options> AffineCompactType;
|
||||
typedef Transform<Scalar,Dim,Affine,Options> AffineType;
|
||||
typedef Transform<Scalar,Dim,Projective,Options> ProjectiveType;
|
||||
typedef DiagonalMatrix<Scalar,Dim> ScalingType;
|
||||
typedef Translation<Scalar,Dim> TranslationType;
|
||||
|
||||
AffineCompactType A1c; A1c.matrix().setRandom();
|
||||
AffineCompactType A2c; A2c.matrix().setRandom();
|
||||
AffineType A1(A1c);
|
||||
AffineType A2(A2c);
|
||||
ProjectiveType P1; P1.matrix().setRandom();
|
||||
VectorType v1 = VectorType::Random();
|
||||
VectorType v2 = VectorType::Random();
|
||||
HVectorType h1 = HVectorType::Random();
|
||||
Scalar s1 = internal::random<Scalar>();
|
||||
LinearType L = LinearType::Random();
|
||||
MatrixType M = MatrixType::Random();
|
||||
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2, v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, A2c, v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, v1.asDiagonal(), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, ScalingType(v1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(v1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, Scaling(s1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, TranslationType(v1), v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity_left(A1c, A1, P1, L, v2, h1) );
|
||||
CALL_SUBTEST( transform_associativity2(A1c, A1, P1, R, v2, h1) );
|
||||
|
||||
VERIFY_IS_APPROX( A1*(M*h1), (A1*M)*h1 );
|
||||
VERIFY_IS_APPROX( A1c*(M*h1), (A1c*M)*h1 );
|
||||
VERIFY_IS_APPROX( P1*(M*h1), (P1*M)*h1 );
|
||||
|
||||
VERIFY_IS_APPROX( M*(A1*h1), (M*A1)*h1 );
|
||||
VERIFY_IS_APPROX( M*(A1c*h1), (M*A1c)*h1 );
|
||||
VERIFY_IS_APPROX( M*(P1*h1), ((M*P1)*h1) );
|
||||
}
|
||||
|
||||
template<typename Scalar> void transform_alignment()
|
||||
@ -562,5 +638,8 @@ void test_geo_transformations()
|
||||
|
||||
CALL_SUBTEST_7(( transform_products<double,3,RowMajor|AutoAlign>() ));
|
||||
CALL_SUBTEST_7(( transform_products<float,2,AutoAlign>() ));
|
||||
|
||||
CALL_SUBTEST_8(( transform_associativity<double,2,ColMajor>(Rotation2D<double>(internal::random<double>()*double(EIGEN_PI))) ));
|
||||
CALL_SUBTEST_8(( transform_associativity<double,3,ColMajor>(Quaterniond::UnitRandom()) ));
|
||||
}
|
||||
}
|
||||
|
@@ -365,6 +365,7 @@ template<typename Scalar> void packetmath_real()
  }

  if (PacketTraits::HasTanh) {
    // NOTE this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
    packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
    h.store(data2, internal::ptanh(h.load(data1)));
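Editor's sketch (not part of this patch): the scalar property the packet test above exercises, namely that tanh must propagate NaN.

#include <cassert>
#include <cmath>
#include <limits>

int main()
{
  float x = std::numeric_limits<float>::quiet_NaN();
  assert(std::isnan(std::tanh(x)));   // the vectorized ptanh is expected to match this
  return 0;
}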
@ -256,6 +256,51 @@ Index compute_block_size()
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<int>
|
||||
void bug_1308()
|
||||
{
|
||||
int n = 10;
|
||||
MatrixXd r(n,n);
|
||||
VectorXd v = VectorXd::Random(n);
|
||||
r = v * RowVectorXd::Ones(n);
|
||||
VERIFY_IS_APPROX(r, v.rowwise().replicate(n));
|
||||
r = VectorXd::Ones(n) * v.transpose();
|
||||
VERIFY_IS_APPROX(r, v.rowwise().replicate(n).transpose());
|
||||
|
||||
Matrix4d ones44 = Matrix4d::Ones();
|
||||
Matrix4d m44 = Matrix4d::Ones() * Matrix4d::Ones();
|
||||
VERIFY_IS_APPROX(m44,Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(m44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
|
||||
|
||||
typedef Matrix<double,4,4,RowMajor> RMatrix4d;
|
||||
RMatrix4d r44 = Matrix4d::Ones() * Matrix4d::Ones();
|
||||
VERIFY_IS_APPROX(r44,Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*Matrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44, Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=Matrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44*RMatrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=ones44.transpose()*RMatrix4d::Ones(), Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44, Matrix4d::Constant(4));
|
||||
VERIFY_IS_APPROX(r44.noalias()=RMatrix4d::Ones()*ones44.transpose(), Matrix4d::Constant(4));
|
||||
|
||||
// RowVector4d r4;
|
||||
m44.setOnes();
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += m44.row(0).transpose() * RowVector4d::Ones(), ones44);
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += m44.col(0) * RowVector4d::Ones(), ones44);
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.row(0), ones44);
|
||||
r44.setZero();
|
||||
VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
|
||||
}
|
||||
|
||||
void test_product_extra()
|
||||
{
|
||||
for(int i = 0; i < g_repeat; i++) {
|
||||
@ -268,8 +313,10 @@ void test_product_extra()
|
||||
}
|
||||
CALL_SUBTEST_5( bug_127<0>() );
|
||||
CALL_SUBTEST_5( bug_817<0>() );
|
||||
CALL_SUBTEST_5( bug_1308<0>() );
|
||||
CALL_SUBTEST_6( unaligned_objects<0>() );
|
||||
CALL_SUBTEST_7( compute_block_size<float>() );
|
||||
CALL_SUBTEST_7( compute_block_size<double>() );
|
||||
CALL_SUBTEST_7( compute_block_size<std::complex<double> >() );
|
||||
|
||||
}
|
||||
|
@@ -12,6 +12,7 @@
#include <Eigen/LU>

// regression test for bug 447
template<int>
void product1x1()
{
  Matrix<float,1,3> matAstatic;
@@ -209,15 +210,34 @@ void test_linear_but_not_vectorizable()
  }
}

template<int Rows>
void bug_1311()
{
  Matrix< double, Rows, 2 > A;  A.setRandom();
  Vector2d b = Vector2d::Random();
  Matrix<double,Rows,1> res;
  res.noalias() = 1. * (A * b);
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = 1.*A * b;
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = (1.*A).lazyProduct(b);
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = (1.*A).lazyProduct(1.*b);
  VERIFY_IS_APPROX(res, A*b);
  res.noalias() = (A).lazyProduct(1.*b);
  VERIFY_IS_APPROX(res, A*b);
}

void test_product_small()
{
  for(int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
    CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
    CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
    CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
    CALL_SUBTEST_3( product(Matrix3d()) );
    CALL_SUBTEST_4( product(Matrix4d()) );
    CALL_SUBTEST_5( product(Matrix4f()) );
    CALL_SUBTEST_6( product1x1() );
    CALL_SUBTEST_6( product1x1<0>() );

    CALL_SUBTEST_11( test_lazy_l1<float>() );
    CALL_SUBTEST_12( test_lazy_l2<float>() );
@@ -238,6 +258,9 @@ void test_product_small()
    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,Dynamic>() ));
    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,3,1,Dynamic>() ));
    CALL_SUBTEST_7(( test_linear_but_not_vectorizable<float,2,1,16>() ));

    CALL_SUBTEST_6( bug_1311<3>() );
    CALL_SUBTEST_6( bug_1311<5>() );
  }

#ifdef EIGEN_TEST_PART_6
@@ -7,6 +7,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

template<typename T>
Array<T,4,1> four_denorms();

template<>
Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
template<>
Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
template<typename T>
Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }

template<typename MatrixType>
void svd_fill_random(MatrixType &m, int Option = 0)
{
@@ -55,7 +65,8 @@ void svd_fill_random(MatrixType &m, int Option = 0)
  }

  Matrix<Scalar,Dynamic,1> samples(9);
  samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);
  samples << 0, four_denorms<RealScalar>(),
             -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest(), (std::numeric_limits<RealScalar>::min)(), pow((std::numeric_limits<RealScalar>::min)(),0.8);

  if(Option==Symmetric)
  {
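Editor's sketch (not part of this patch): the constants packaged by four_denorms() above are subnormal values; 4.94e-324 parses to the smallest positive denormal double.

#include <cassert>
#include <cmath>
#include <limits>

int main()
{
  double d = 4.94e-324;
  assert(d > 0.0);
  assert(d < (std::numeric_limits<double>::min)());   // below the smallest normal
  assert(std::fpclassify(d) == FP_SUBNORMAL);
  return 0;
}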
@@ -61,8 +61,9 @@ typedef unsigned __int64 uint64_t;
#ifdef EIGEN_USE_GPU
#include <iostream>
#include <cuda_runtime.h>
#if defined(__CUDACC__)
#include <curand_kernel.h>
#if __cplusplus >= 201103L
#include <atomic>
#include <unistd.h>
#endif
#endif

@@ -81,6 +82,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorDimensions.h"
#include "src/Tensor/TensorInitializer.h"
#include "src/Tensor/TensorTraits.h"
#include "src/Tensor/TensorRandom.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
#include "src/Tensor/TensorGlobalFunctions.h"
@@ -51,12 +51,15 @@ class TensorOpCost {
                  internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }

  EIGEN_DEVICE_FUNC
  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
@@ -42,7 +42,21 @@ static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
  if (!m_devicePropInitialized) {
    // Attempts to ensure proper behavior in the case of multiple threads
    // calling this function simultaneously. This would be trivial to
    // implement if we could use std::mutex, but unfortunately mutexes don't
    // compile with nvcc, so we resort to atomics and thread fences instead.
    // Note that if the caller uses a compiler that doesn't support C++11 we
    // can't ensure that the initialization is thread safe.
#if __cplusplus >= 201103L
    static std::atomic<bool> first(true);
    if (first.exchange(false)) {
#else
    static bool first = true;
    if (first) {
      first = false;
#endif
      // We're the first thread to reach this point.
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
@@ -63,7 +77,19 @@ static void initializeDeviceProp() {
        assert(status == cudaSuccess);
      }
    }

#if __cplusplus >= 201103L
      std::atomic_thread_fence(std::memory_order_release);
#endif
      m_devicePropInitialized = true;
    } else {
      // Wait for the other thread to initialize the properties.
      while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L
        std::atomic_thread_fence(std::memory_order_acquire);
#endif
        sleep(1);
      }
    }
  }
}
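Editor's contrast (a hedged sketch, not part of this patch): in host-only C++11 code the same one-shot initialization is usually written with std::call_once; the hand-rolled atomics above exist because, per the comment in the hunk, std::mutex did not compile under nvcc at the time.

#include <mutex>

static std::once_flag device_prop_once;   // hypothetical host-only variant

void initializeDevicePropHostOnly() {
  std::call_once(device_prop_once, [] {
    // query and cache the device properties here, exactly once
  });
}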
@ -168,39 +194,20 @@ struct GpuDevice {
|
||||
return stream_->stream();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
return stream_->allocate(num_bytes);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
stream_->deallocate(buffer);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void* scratchpad() const {
|
||||
return stream_->scratchpad();
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE unsigned int* semaphore() const {
|
||||
return stream_->semaphore();
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
@ -210,30 +217,22 @@ struct GpuDevice {
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
@ -242,21 +241,21 @@ struct GpuDevice {
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
// FIXME
|
||||
return 32;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
// FIXME
|
||||
return 48*1024;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
// We won't try to take advantage of the L2 cache for the time being, and
// there is no L3 cache on CUDA devices.
|
||||
return firstLevelCacheSize();
|
||||
@ -276,56 +275,26 @@ struct GpuDevice {
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
|
||||
return stream_->deviceProperties().multiProcessorCount;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
|
||||
return stream_->deviceProperties().maxThreadsPerBlock;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
|
||||
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
return stream_->deviceProperties().sharedMemPerBlock;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
return stream_->deviceProperties().major;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
EIGEN_STRONG_INLINE int minorDeviceVersion() const {
|
||||
return stream_->deviceProperties().minor;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
return max_blocks_;
|
||||
}
|
||||
|
||||
|
@@ -234,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator memcopied_eval, Index size) {
EigenMetaKernel(Evaluator eval, Index size) {

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}

@@ -99,7 +99,8 @@ template <typename T> struct SumReducer
  static const bool IsStateful = false;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
    (*accum) += t;
    internal::scalar_sum_op<T> sum_op;
    *accum = sum_op(*accum, t);
  }
  template <typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -145,7 +146,8 @@ template <typename T> struct MeanReducer
  MeanReducer() : scalarCount_(0), packetCount_(0) { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
    (*accum) += t;
    internal::scalar_sum_op<T> sum_op;
    *accum = sum_op(*accum, t);
    scalarCount_++;
  }
  template <typename Packet>
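Editor's note on the hunk above (hedged): routing the accumulation through scalar_sum_op instead of operator+= gives a single customization point; a scalar type whose += is unavailable or wrong on the device only needs a functor specialization. A generic sketch of the pattern, with invented names (sum_functor and reduce_step are illustrative, not Eigen's definitions):

template <typename T>
struct sum_functor {
  T operator()(const T& a, const T& b) const { return a + b; }
};

template <typename T, typename Op>
void reduce_step(const T& t, T* accum, Op op) {
  *accum = op(*accum, t);   // specializing sum_functor<T> changes every caller at once
}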
@ -190,25 +192,25 @@ struct reducer_traits<MeanReducer<T>, Device> {
|
||||
|
||||
template <typename T, bool IsMax = true, bool IsInteger = true>
|
||||
struct MinMaxBottomValue {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::lowest();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, true, false> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return -Eigen::NumTraits<T>::infinity();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, false, true> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::highest();
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct MinMaxBottomValue<T, false, false> {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
|
||||
return Eigen::NumTraits<T>::infinity();
|
||||
}
|
||||
};
|
||||
@ -439,448 +441,6 @@ struct reducer_traits<ArgMinTupleReducer<T>, Device> {
|
||||
};
|
||||
|
||||
|
||||
// Random number generation
|
||||
namespace {
|
||||
#ifdef __CUDA_ARCH__
|
||||
__device__ int get_random_seed() {
|
||||
return clock();
|
||||
}
|
||||
#else
|
||||
static inline int get_random_seed() {
|
||||
#ifdef _WIN32
|
||||
SYSTEMTIME st;
|
||||
GetSystemTime(&st);
|
||||
return st.wSecond + 1000 * st.wMilliseconds;
|
||||
#elif defined __APPLE__
|
||||
return static_cast<int>(mach_absolute_time());
|
||||
#else
|
||||
timespec ts;
|
||||
clock_gettime(CLOCK_REALTIME, &ts);
|
||||
return static_cast<int>(ts.tv_nsec);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
|
||||
// We're not compiling a cuda kernel
|
||||
template <typename T> class UniformRandomGenerator {
|
||||
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
if (!deterministic) {
|
||||
srand(get_random_seed());
|
||||
}
|
||||
}
|
||||
UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
}
|
||||
|
||||
T operator()() const {
|
||||
return random<T>();
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX T values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
values[i] = random<T>();
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
};
|
||||
|
||||
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
|
||||
template <> class UniformRandomGenerator<float> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
|
||||
if (!deterministic) {
|
||||
m_generator->seed(get_random_seed());
|
||||
}
|
||||
}
|
||||
UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
|
||||
m_generator = new std::mt19937();
|
||||
m_generator->seed(other() * UINT_MAX);
|
||||
m_deterministic = other.m_deterministic;
|
||||
}
|
||||
~UniformRandomGenerator() {
|
||||
delete m_generator;
|
||||
}
|
||||
|
||||
float operator()() const {
|
||||
return m_distribution(*m_generator);
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX float values[packetSize];
|
||||
for (int k = 0; k < packetSize; ++k) {
|
||||
values[k] = this->operator()();
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
|
||||
// Make sure m_deterministic comes first to match the layout of the cpu
|
||||
// version of the code.
|
||||
bool m_deterministic;
|
||||
std::mt19937* m_generator;
|
||||
mutable std::uniform_real_distribution<float> m_distribution;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<double> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
|
||||
if (!deterministic) {
|
||||
m_generator->seed(get_random_seed());
|
||||
}
|
||||
}
|
||||
UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
|
||||
m_generator = new std::mt19937();
|
||||
m_generator->seed(other() * UINT_MAX);
|
||||
m_deterministic = other.m_deterministic;
|
||||
}
|
||||
~UniformRandomGenerator() {
|
||||
delete m_generator;
|
||||
}
|
||||
|
||||
double operator()() const {
|
||||
return m_distribution(*m_generator);
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX double values[packetSize];
|
||||
for (int k = 0; k < packetSize; ++k) {
|
||||
values[k] = this->operator()();
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
UniformRandomGenerator& operator = (const UniformRandomGenerator&);
|
||||
// Make sure m_deterministic comes first to match the layout of the cpu
|
||||
// version of the code.
|
||||
bool m_deterministic;
|
||||
std::mt19937* m_generator;
|
||||
mutable std::uniform_real_distribution<double> m_distribution;
|
||||
};
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
// We're compiling a cuda kernel
|
||||
template <typename T> class UniformRandomGenerator;
|
||||
|
||||
template <> class UniformRandomGenerator<float> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
|
||||
__device__ float operator()() const {
|
||||
return curand_uniform(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ float4 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_uniform4(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<double> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ double operator()() const {
|
||||
return curand_uniform_double(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ double2 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_uniform2_double(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<std::complex<float> > {
|
||||
public:
|
||||
static const bool PacketAccess = false;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ std::complex<float> operator()() const {
|
||||
float4 vals = curand_uniform4(&m_state);
|
||||
return std::complex<float>(vals.x, vals.y);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class UniformRandomGenerator<std::complex<double> > {
|
||||
public:
|
||||
static const bool PacketAccess = false;
|
||||
|
||||
__device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ std::complex<double> operator()() const {
|
||||
double2 vals = curand_uniform2_double(&m_state);
|
||||
return std::complex<double>(vals.x, vals.y);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
template <typename Scalar>
|
||||
struct functor_traits<UniformRandomGenerator<Scalar> > {
|
||||
enum {
|
||||
// Rough estimate.
|
||||
Cost = 100 * NumTraits<Scalar>::MulCost,
|
||||
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
|
||||
// We're not compiling a cuda kernel
|
||||
template <typename T> class NormalRandomGenerator {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
|
||||
if (!deterministic) {
|
||||
m_generator->seed(get_random_seed());
|
||||
}
|
||||
}
|
||||
NormalRandomGenerator(const NormalRandomGenerator& other)
|
||||
: m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
|
||||
m_generator->seed(other() * UINT_MAX);
|
||||
}
|
||||
~NormalRandomGenerator() {
|
||||
delete m_generator;
|
||||
}
|
||||
T operator()() const {
|
||||
return m_distribution(*m_generator);
|
||||
}
|
||||
template<typename PacketType>
|
||||
PacketType packetOp() const {
|
||||
const int packetSize = internal::unpacket_traits<PacketType>::size;
|
||||
EIGEN_ALIGN_MAX T values[packetSize];
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
values[i] = m_distribution(*m_generator);
|
||||
}
|
||||
return internal::pload<PacketType>(values);
|
||||
}
|
||||
|
||||
private:
|
||||
// No assignment
|
||||
NormalRandomGenerator& operator = (const NormalRandomGenerator&);
|
||||
|
||||
bool m_deterministic;
|
||||
mutable std::normal_distribution<T> m_distribution;
|
||||
std::mt19937* m_generator;
|
||||
};
|
||||
|
||||
#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
|
||||
|
||||
// We're compiling a cuda kernel
|
||||
template <typename T> class NormalRandomGenerator;
|
||||
|
||||
template <> class NormalRandomGenerator<float> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ float operator()() const {
|
||||
return curand_normal(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ float4 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_normal4(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class NormalRandomGenerator<double> {
|
||||
public:
|
||||
static const bool PacketAccess = true;
|
||||
|
||||
__device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
|
||||
m_deterministic = other.m_deterministic;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int seed = m_deterministic ? 0 : get_random_seed();
|
||||
curand_init(seed, tid, 0, &m_state);
|
||||
}
|
||||
__device__ double operator()() const {
|
||||
return curand_normal_double(&m_state);
|
||||
}
|
||||
template<typename PacketType>
|
||||
__device__ double2 packetOp() const {
|
||||
EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
return curand_normal2_double(&m_state);
|
||||
}
|
||||
|
||||
private:
|
||||
bool m_deterministic;
|
||||
mutable curandStatePhilox4_32_10_t m_state;
|
||||
};
|
||||
|
||||
template <> class NormalRandomGenerator<std::complex<float> > {
 public:
  static const bool PacketAccess = false;

  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
    m_deterministic = other.m_deterministic;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = m_deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ std::complex<float> operator()() const {
    float4 vals = curand_normal4(&m_state);
    return std::complex<float>(vals.x, vals.y);
  }

 private:
  bool m_deterministic;
  mutable curandStatePhilox4_32_10_t m_state;
};

template <> class NormalRandomGenerator<std::complex<double> > {
 public:
  static const bool PacketAccess = false;

  __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
    m_deterministic = other.m_deterministic;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int seed = m_deterministic ? 0 : get_random_seed();
    curand_init(seed, tid, 0, &m_state);
  }
  __device__ std::complex<double> operator()() const {
    double2 vals = curand_normal2_double(&m_state);
    return std::complex<double>(vals.x, vals.y);
  }

 private:
  bool m_deterministic;
  mutable curandStatePhilox4_32_10_t m_state;
};

#else

template <typename T> class NormalRandomGenerator {
 public:
  static const bool PacketAccess = false;
  NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}

 private:
  bool m_deterministic;
};

#endif

template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // Rough estimate.
    Cost = 100 * NumTraits<Scalar>::MulCost,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};

template <typename T, typename Index, size_t NumDims>
class GaussianGenerator {
 public:
@@ -895,7 +455,7 @@ class GaussianGenerator {
    }
  }

  T operator()(const array<Index, NumDims>& coordinates) const {
  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
    T tmp = T(0);
    for (size_t i = 0; i < NumDims; ++i) {
      T offset = coordinates[i] - m_means[i];
unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h (new file, 276 lines)
@@ -0,0 +1,276 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

namespace Eigen {
namespace internal {

namespace {

EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#ifdef __CUDA_ARCH__
  // We don't support 3d kernels since we currently only use 1 and
  // 2d kernels.
  assert(threadIdx.z == 0);
  return clock64() +
      blockIdx.x * blockDim.x + threadIdx.x +
      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);

#elif defined _WIN32
  // Use the current time as a baseline.
  SYSTEMTIME st;
  GetSystemTime(&st);
  int time = st.wSecond + 1000 * st.wMilliseconds;
  // Mix in a random number to make sure that we get different seeds if
  // we try to generate seeds faster than the clock resolution.
  // We need 2 random values since the generator only generates 16 bits at
  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
  uint rnd1 = ::rand();
  uint rnd2 = ::rand();
  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
  return rnd;

#elif defined __APPLE__
  // Same approach as for win32, except that the random number generator
  // is better (https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
  uint64_t rnd = ::random() ^ mach_absolute_time();
  return rnd;

#else
  // Augment the current time with pseudo random number generation
  // to ensure that we get different seeds if we try to generate seeds
  // faster than the clock resolution.
  timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  uint64_t rnd = ::random() ^ ts.tv_nsec;
  return rnd;
#endif
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
  // TODO: Unify with the implementation in the non blocking thread pool.
  uint64_t current = *state;
  // Update the internal state
  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
  // Generate the random output (using the PCG-XSH-RS scheme)
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
  seed = seed ? seed : get_random_seed();
  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}
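
The two helpers above are the whole generator: `PCG_XSH_RS_state` maps a seed onto a 64-bit linear congruential state, and `PCG_XSH_RS_generator` advances that state while permuting the previous state into a 32-bit output (an xorshift of the high bits followed by a shift whose size is picked from the top three bits of the state). A minimal standalone sketch of that step, with hypothetical names and not part of this patch:

```cpp
// Hypothetical demo of the PCG-XSH-RS step used above (not Eigen code).
#include <cstdint>
#include <cstdio>

static uint64_t lcg_advance(uint64_t state) {
  // Same multiplier/increment as PCG_XSH_RS_generator.
  return state * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}

static unsigned pcg_xsh_rs_output(uint64_t current) {
  // XSH: xor the high bits down; RS: random shift taken from the top 3 bits.
  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

int main() {
  uint64_t state = 42;  // any fixed seed gives a reproducible stream
  for (int i = 0; i < 4; ++i) {
    uint64_t current = state;   // output is derived from the *previous* state
    state = lcg_advance(state);
    std::printf("%u\n", pcg_xsh_rs_output(current));
  }
  return 0;
}
```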

} // namespace

template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeUniform(uint64_t* state) {
  unsigned rnd = PCG_XSH_RS_generator(state);
  return static_cast<T>(rnd);
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
  Eigen::half result;
  // Generate 10 random bits for the mantissa
  unsigned rnd = PCG_XSH_RS_generator(state);
  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
  // Set the exponent
  result.x |= (static_cast<uint16_t>(15) << 10);
  // Return the final result
  return result - Eigen::half(1.0f);
}
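
The bit trick used here (and in the float and double overloads below) exploits the IEEE layout: forcing the exponent field to the bias while randomizing the mantissa yields a value in [1, 2), so subtracting one gives a uniform draw in [0, 1). For half precision (5 exponent bits, 10 mantissa bits, bias 15), in LaTeX notation:

```latex
x = \left(1 + \frac{m}{2^{10}}\right) 2^{\,e-15},
\qquad e = 15 \;\Rightarrow\; x = 1 + \frac{m}{1024} \in [1, 2),
\qquad x - 1 \in [0, 1).
```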

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float RandomToTypeUniform<float>(uint64_t* state) {
  typedef union {
    uint32_t raw;
    float fp;
  } internal;
  internal result;
  // Generate 23 random bits for the mantissa
  const unsigned rnd = PCG_XSH_RS_generator(state);
  result.raw = rnd & 0x7fffffu;
  // Set the exponent
  result.raw |= (static_cast<uint32_t>(127) << 23);
  // Return the final result
  return result.fp - 1.0f;
}

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double RandomToTypeUniform<double>(uint64_t* state) {
  typedef union {
    uint64_t raw;
    double dp;
  } internal;
  internal result;
  result.raw = 0;
  // Generate 52 random bits for the mantissa
  // First generate the upper 20 bits
  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
  // Then generate the lower 32 bits
  unsigned rnd2 = PCG_XSH_RS_generator(state);
  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
  // Set the exponent
  result.raw |= (static_cast<uint64_t>(1023) << 52);
  // Return the final result
  return result.dp - 1.0;
}
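
For doubles the trick needs two 32-bit draws to fill the 52-bit mantissa. A hypothetical host-side check of the mapping (not part of the patch):

```cpp
// Standalone check: forcing the exponent field to the bias (1023) puts the
// assembled double in [1, 2), so the maximal mantissa maps just below 1.0.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint64_t mantissa = 0xfffffffffffffULL;  // all 52 mantissa bits set
  uint64_t raw = mantissa | (static_cast<uint64_t>(1023) << 52);
  double d;
  std::memcpy(&d, &raw, sizeof(d));  // well-defined type pun
  std::printf("%.17g\n", d - 1.0);   // prints a value just below 1.0
  return 0;
}
```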

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
  return std::complex<float>(RandomToTypeUniform<float>(state),
                             RandomToTypeUniform<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
  return std::complex<double>(RandomToTypeUniform<double>(state),
                              RandomToTypeUniform<double>(state));
}

template <typename T> class UniformRandomGenerator {
 public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
      const UniformRandomGenerator& other) {
    m_state = other.m_state;
  }

  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    uint64_t local_state = m_state + i;
    T result = RandomToTypeUniform<T>(&local_state);
    m_state = local_state;
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
    uint64_t local_state = m_state + i;
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeUniform<T>(&local_state);
    }
    m_state = local_state;
    return internal::pload<Packet>(values);
  }

 private:
  mutable uint64_t m_state;
};
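
The generator is an ordinary nullary functor, so it plugs straight into the tensor expression machinery. A usage sketch, assuming the Tensor module's templated `setRandom` hook (the `NormalRandomGenerator` defined further down works the same way):

```cpp
// Sketch only: fill a tensor through UniformRandomGenerator::operator().
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(64, 64);
  // Each coefficient (or packet, via packetOp) is drawn from the generator.
  t.setRandom<Eigen::internal::UniformRandomGenerator<float> >();
  return 0;
}
```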

template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
  enum {
    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
    Cost = 12 * NumTraits<Scalar>::AddCost *
           ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
  };
};

template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeNormal(uint64_t* state) {
  // Use the ratio-of-uniforms method to generate numbers following a normal
  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
  // details.
  T u, v, q;
  do {
    u = RandomToTypeUniform<T>(state);
    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
    const T x = u - T(0.449871);
    const T y = numext::abs(v) + T(0.386595);
    q = x*x + y * (T(0.196)*y - T(0.25472)*x);
  } while (q > T(0.27597) &&
           (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));

  return v/u;
}
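
The loop is the ratio-of-uniforms method with Leva's quadratic quick accept/reject bounds (Numerical Recipes 7.3.9): draw (u, v) with u uniform on [0, 1) and v uniform on roughly [-0.858, 0.858], accept v/u when the point falls inside the acceptance region, and only evaluate the exact log test near the boundary. A hypothetical standalone sanity check (not part of the patch) that the accepted samples have mean near 0 and variance near 1:

```cpp
// Sanity-check sketch of the ratio-of-uniforms loop, using std::mt19937_64
// in place of the PCG helpers; same constants as RandomToTypeNormal.
#include <cmath>
#include <cstdio>
#include <random>

int main() {
  std::mt19937_64 rng(0);
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  double sum = 0.0, sumsq = 0.0;
  const int n = 1000000;
  for (int i = 0; i < n; ++i) {
    double u, v, q;
    do {
      u = unif(rng);
      v = 1.7156 * (unif(rng) - 0.5);
      const double x = u - 0.449871;
      const double y = std::abs(v) + 0.386595;
      q = x * x + y * (0.196 * y - 0.25472 * x);
    } while (q > 0.27597 &&
             (q > 0.27846 || v * v > -4.0 * std::log(u) * u * u));
    const double s = v / u;  // accepted ratio is a standard normal sample
    sum += s;
    sumsq += s * s;
  }
  const double mean = sum / n;
  std::printf("mean=%g var=%g\n", mean, sumsq / n - mean * mean);
  return 0;
}
```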

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
  return std::complex<float>(RandomToTypeNormal<float>(state),
                             RandomToTypeNormal<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
  return std::complex<double>(RandomToTypeNormal<double>(state),
                              RandomToTypeNormal<double>(state));
}

template <typename T> class NormalRandomGenerator {
 public:
  static const bool PacketAccess = true;

  // Uses the given "seed" if non-zero, otherwise uses a random seed.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
    m_state = PCG_XSH_RS_state(seed);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
      const NormalRandomGenerator& other) {
    m_state = other.m_state;
  }

  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  T operator()(Index i) const {
    uint64_t local_state = m_state + i;
    T result = RandomToTypeNormal<T>(&local_state);
    m_state = local_state;
    return result;
  }

  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Packet packetOp(Index i) const {
    const int packetSize = internal::unpacket_traits<Packet>::size;
    EIGEN_ALIGN_MAX T values[packetSize];
    uint64_t local_state = m_state + i;
    for (int j = 0; j < packetSize; ++j) {
      values[j] = RandomToTypeNormal<T>(&local_state);
    }
    m_state = local_state;
    return internal::pload<Packet>(values);
  }

 private:
  mutable uint64_t m_state;
};

template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
  enum {
    // On average, we need to generate about 3 random numbers
    // 15 muls, 8 adds, 1.5 logs
    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
           15 * NumTraits<Scalar>::MulCost + 8 * NumTraits<Scalar>::AddCost +
           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
  };
};

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

@@ -20,37 +20,60 @@ public:
  AutoDiffJacobian(const Functor& f) : Functor(f) {}

  // forward constructors
#if EIGEN_HAS_VARIADIC_TEMPLATES
  template<typename... T>
  AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
#else
  template<typename T0>
  AutoDiffJacobian(const T0& a0) : Functor(a0) {}
  template<typename T0, typename T1>
  AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
  template<typename T0, typename T1, typename T2>
  AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}

  enum {
    InputsAtCompileTime = Functor::InputsAtCompileTime,
    ValuesAtCompileTime = Functor::ValuesAtCompileTime
  };
#endif

  typedef typename Functor::InputType InputType;
  typedef typename Functor::ValueType ValueType;
  typedef typename Functor::JacobianType JacobianType;
  typedef typename JacobianType::Scalar Scalar;
  typedef typename ValueType::Scalar Scalar;

  enum {
    InputsAtCompileTime = InputType::RowsAtCompileTime,
    ValuesAtCompileTime = ValueType::RowsAtCompileTime
  };

  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
  typedef typename JacobianType::Index Index;

  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
  typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
  typedef AutoDiffScalar<DerivativeType> ActiveScalar;

  typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
  typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;

#if EIGEN_HAS_VARIADIC_TEMPLATES
  // Some compilers don't accept variadic parameters after a default parameter,
  // i.e., we can't just write _jac=0 but we need to overload operator():
  EIGEN_STRONG_INLINE
  void operator() (const InputType& x, ValueType* v) const
  {
    this->operator()(x, v, 0);
  }
  template<typename... ParamsType>
  void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
                   const ParamsType&... Params) const
#else
  void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
#endif
  {
    eigen_assert(v!=0);

    if (!_jac)
    {
#if EIGEN_HAS_VARIADIC_TEMPLATES
      Functor::operator()(x, v, Params...);
#else
      Functor::operator()(x, v);
#endif
      return;
    }

@@ -61,12 +84,16 @@ public:

    if(InputsAtCompileTime==Dynamic)
      for (Index j=0; j<jac.rows(); j++)
        av[j].derivatives().resize(this->inputs());
        av[j].derivatives().resize(x.rows());

    for (Index i=0; i<jac.cols(); i++)
      ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
      ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);

#if EIGEN_HAS_VARIADIC_TEMPLATES
    Functor::operator()(ax, &av, Params...);
#else
    Functor::operator()(ax, &av);
#endif

    for (Index i=0; i<jac.rows(); i++)
    {
@@ -74,8 +101,6 @@ public:
      jac.row(i) = av[i].derivatives();
    }
  }
protected:

};

}
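
The net effect of this hunk: AutoDiffJacobian now derives Scalar, the compile-time sizes, and JacobianType from the functor's InputType/ValueType instead of requiring the functor to provide them, sizes dynamic inputs from x.rows() rather than a this->inputs() method, and forwards extra runtime parameters when variadic templates are available. A sketch of the resulting interface with a hypothetical two-output functor (not from the patch; the committed integratorFunctor test below exercises the same API):

```cpp
// Sketch of the reworked AutoDiffJacobian interface (hypothetical functor).
#include <unsupported/Eigen/AutoDiff>
#include <iostream>

struct Squares {
  // Only InputType/ValueType are declared; JacobianType is now derived.
  typedef Eigen::Matrix<double, 2, 1> InputType;
  typedef Eigen::Matrix<double, 2, 1> ValueType;

  template <typename T1, typename T2>
  void operator()(const T1& x, T2* out) const {
    (*out)[0] = x[0] * x[0];  // y0 = x0^2
    (*out)[1] = x[0] * x[1];  // y1 = x0 * x1
  }
};

int main() {
  Squares f;
  Eigen::AutoDiffJacobian<Squares> adjac(f);
  Squares::InputType x(3.0, 4.0);
  Squares::ValueType y;
  Eigen::AutoDiffJacobian<Squares>::JacobianType J;  // 2x2, deduced
  adjac(x, &y, &J);
  std::cout << J << std::endl;  // rows: [2*x0, 0] = [6, 0]; [x1, x0] = [4, 3]
  return 0;
}
```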
@@ -189,7 +189,12 @@ namespace Eigen
      res[0] = atan2(mat(J,K), mat(K,K));
      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
        if(res[0] > Scalar(0)) {
          res[0] -= Scalar(EIGEN_PI);
        }
        else {
          res[0] += Scalar(EIGEN_PI);
        }
        res[1] = atan2(-mat(I,K), -c2);
      }
      else
@@ -212,7 +217,12 @@ namespace Eigen
      res[0] = atan2(mat(J,I), mat(K,I));
      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
      {
        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
        if(res[0] > Scalar(0)) {
          res[0] -= Scalar(EIGEN_PI);
        }
        else {
          res[0] += Scalar(EIGEN_PI);
        }
        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
        res[1] = -atan2(s2, mat(I,I));
      }
@@ -203,7 +203,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})

  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
  endif()
  if(EIGEN_TEST_CUDA_CLANG)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
@@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")

  ei_add_test(cxx11_tensor_complex_cuda)
  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
  ei_add_test(cxx11_tensor_reduction_cuda)
  ei_add_test(cxx11_tensor_argmax_cuda)
  ei_add_test(cxx11_tensor_cast_float16_cuda)
@@ -105,6 +105,89 @@ struct TestFunc1
  }
};


#if EIGEN_HAS_VARIADIC_TEMPLATES
/* Test functor for the C++11 features. */
template <typename Scalar>
struct integratorFunctor
{
  typedef Matrix<Scalar, 2, 1> InputType;
  typedef Matrix<Scalar, 2, 1> ValueType;

  /*
   * Implementation starts here.
   */
  integratorFunctor(const Scalar gain) : _gain(gain) {}
  integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
  const Scalar _gain;

  template <typename T1, typename T2>
  void operator() (const T1 &input, T2 *output, const Scalar dt) const
  {
    T2 &o = *output;

    /* Integrator to test the AD. */
    o[0] = input[0] + input[1] * dt * _gain;
    o[1] = input[1] * _gain;
  }

  /* Only needed for the test */
  template <typename T1, typename T2, typename T3>
  void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
  {
    T2 &o = *output;

    /* Integrator to test the AD. */
    o[0] = input[0] + input[1] * dt * _gain;
    o[1] = input[1] * _gain;

    if (jacobian)
    {
      T3 &j = *jacobian;

      j(0, 0) = 1;
      j(0, 1) = dt * _gain;
      j(1, 0) = 0;
      j(1, 1) = _gain;
    }
  }
};

template<typename Func> void forward_jacobian_cpp11(const Func& f)
{
  typedef typename Func::ValueType::Scalar Scalar;
  typedef typename Func::ValueType ValueType;
  typedef typename Func::InputType InputType;
  typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;

  InputType x = InputType::Random(InputType::RowsAtCompileTime);
  ValueType y, yref;
  JacobianType j, jref;

  const Scalar dt = internal::random<double>();

  jref.setZero();
  yref.setZero();
  f(x, &yref, &jref, dt);

  //std::cerr << "y, yref, jref: " << "\n";
  //std::cerr << y.transpose() << "\n\n";
  //std::cerr << yref << "\n\n";
  //std::cerr << jref << "\n\n";

  AutoDiffJacobian<Func> autoj(f);
  autoj(x, &y, &j, dt);

  //std::cerr << "y j (via autodiff): " << "\n";
  //std::cerr << y.transpose() << "\n\n";
  //std::cerr << j << "\n\n";

  VERIFY_IS_APPROX(y, yref);
  VERIFY_IS_APPROX(j, jref);
}
#endif

template<typename Func> void forward_jacobian(const Func& f)
{
  typename Func::InputType x = Func::InputType::Random(f.inputs());
@@ -128,7 +211,6 @@ template<typename Func> void forward_jacobian(const Func& f)
  VERIFY_IS_APPROX(j, jref);
}


// TODO also check actual derivatives!
template <int>
void test_autodiff_scalar()
@@ -141,6 +223,7 @@ void test_autodiff_scalar()
  VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y()));
}


// TODO also check actual derivatives!
template <int>
void test_autodiff_vector()
@@ -151,7 +234,7 @@ void test_autodiff_vector()
  VectorAD ap = p.cast<AD>();
  ap.x().derivatives() = Vector2f::UnitX();
  ap.y().derivatives() = Vector2f::UnitY();


  AD res = foo<VectorAD>(ap);
  VERIFY_IS_APPROX(res.value(), foo(p));
}
@@ -164,6 +247,9 @@ void test_autodiff_jacobian()
  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
  CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));
#if EIGEN_HAS_VARIADIC_TEMPLATES
  CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));
#endif
}

@@ -71,8 +71,45 @@ void test_cuda_nullary() {
}


static void test_cuda_sum_reductions() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
  in.setRandom();

  Tensor<std::complex<float>, 0> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<std::complex<float>, 0> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}


void test_cxx11_tensor_complex()
{
  CALL_SUBTEST(test_cuda_nullary());
  CALL_SUBTEST(test_cuda_sum_reductions());
}

unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu (new file, 97 lines)
@@ -0,0 +1,97 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
#define EIGEN_USE_GPU

#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

template<typename T>
void test_cuda_complex_cwise_ops() {
  const int kNumItems = 2;
  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);

  std::complex<T>* d_in1;
  std::complex<T>* d_in2;
  std::complex<T>* d_out;
  cudaMalloc((void**)(&d_in1), complex_bytes);
  cudaMalloc((void**)(&d_in2), complex_bytes);
  cudaMalloc((void**)(&d_out), complex_bytes);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, kNumItems);
  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, kNumItems);
  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
      d_out, kNumItems);

  const std::complex<T> a(3.14f, 2.7f);
  const std::complex<T> b(-10.6f, 1.4f);

  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
  gpu_in2.device(gpu_device) = gpu_in2.constant(b);

  enum CwiseOp {
    Add = 0,
    Sub,
    Mul,
    Div
  };

  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
  for (int op = Add; op <= Div; op++) {
    std::complex<T> expected;
    switch (static_cast<CwiseOp>(op)) {
      case Add:
        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
        expected = a + b;
        break;
      case Sub:
        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
        expected = a - b;
        break;
      case Mul:
        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
        expected = a * b;
        break;
      case Div:
        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
        expected = a / b;
        break;
    }
    assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
                           gpu_device.stream()) == cudaSuccess);
    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

    for (int i = 0; i < kNumItems; ++i) {
      VERIFY_IS_APPROX(actual(i), expected);
    }
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
}


void test_cxx11_tensor_complex_cwise_ops()
{
  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
}