From abdbe8562e889a0ca0877d607cfd5c4cbf937e3a Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 24 Mar 2015 10:45:46 -0700
Subject: [PATCH 1/4] Fixed the CUDA packet primitives

---
 Eigen/src/Core/arch/CUDA/PacketMath.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 19749c832..ceed1d1ef 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(cons
 }
 #endif
 
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
   return make_double2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
   to[stride*2] = from.z;
   to[stride*3] = from.w;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
 }
@@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a)
 }
 
 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
-  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
-  return make_double2(abs(a.x), abs(a.y));
+  return make_double2(fabs(a.x), fabs(a.y));
 }
 
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
@@ -279,7 +279,7 @@ ptranspose(PacketBlock<float4,4>& kernel) {
   kernel.packet[3].z = tmp;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<double2,2>& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;

From ccf290a65cda00bfe12bbd5f4647aca5b371b6fb Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 25 Mar 2015 12:37:38 -0700
Subject: [PATCH 2/4] Cleaned up the TensorDevice code a little bit.

---
 unsupported/Eigen/CXX11/Tensor                |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorDevice.h     | 25 +++++++------------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 34107ae71..200bcf966 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -80,8 +80,8 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
 
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 7a67c56b3..b6ea655f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -32,8 +32,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
     EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
       typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
       Assign assign(m_expression, other);
-      static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
-      internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
       return *this;
     }
 
@@ -44,8 +43,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
       Sum sum(m_expression, other);
       typedef TensorAssignOp<ExpressionType, const Sum> Assign;
       Assign assign(m_expression, sum);
-      static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
-      internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
       return *this;
     }
 
@@ -56,8 +54,7 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
       Difference difference(m_expression, other);
       typedef TensorAssignOp<ExpressionType, const Difference> Assign;
       Assign assign(m_expression, difference);
-      static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
-      internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
       return *this;
     }
 
@@ -76,8 +73,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
     EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
       typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
       Assign assign(m_expression, other);
-      static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
-      internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
       return *this;
     }
 
@@ -88,8 +84,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
       Sum sum(m_expression, other);
       typedef TensorAssignOp<ExpressionType, const Sum> Assign;
       Assign assign(m_expression, sum);
-      static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
-      internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
       return *this;
     }
 
@@ -100,8 +95,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
       Difference difference(m_expression, other);
       typedef TensorAssignOp<ExpressionType, const Difference> Assign;
       Assign assign(m_expression, difference);
-      static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
-      internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, ThreadPoolDevice>::run(assign, m_device);
       return *this;
     }
 
@@ -122,7 +116,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
     EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
       typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
       Assign assign(m_expression, other);
-      internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
       return *this;
     }
 
@@ -133,7 +127,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
       Sum sum(m_expression, other);
       typedef TensorAssignOp<ExpressionType, const Sum> Assign;
       Assign assign(m_expression, sum);
-      internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
       return *this;
     }
 
@@ -144,8 +138,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
       Difference difference(m_expression, other);
       typedef TensorAssignOp<ExpressionType, const Difference> Assign;
       Assign assign(m_expression, difference);
-      static const bool Vectorize = TensorEvaluator<const Assign, GpuDevice>::PacketAccess;
-      internal::TensorExecutor<const Assign, GpuDevice, Vectorize>::run(assign, m_device);
+      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
       return *this;
     }
 

From b3343bfdae40815ae9e01ad2bd8fa226925248c8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 25 Mar 2015 13:25:53 -0700
Subject: [PATCH 3/4] Fixed the vectorized implementation of the Tensor
 select() method

---
 unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d084880de..9198c17ef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -352,11 +352,12 @@ template<typename IfArgType, typename ThenArgType, typename ElseArgType, typenam
 struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
 {
   typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
+  typedef typename XprType::Scalar Scalar;
 
   enum {
     IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* &
-                                                                                                             TensorEvaluator<IfArgType>::PacketAccess*/,
+    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
+                   internal::packet_traits<Scalar>::HasBlend,
     Layout = TensorEvaluator<IfArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
   };
@@ -373,7 +374,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   }
 
   typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename internal::traits<XprType>::Packet PacketReturnType;
   typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
@@ -403,7 +403,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
-    static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     internal::Selector<PacketSize> select;
     for (Index i = 0; i < PacketSize; ++i) {
       select.select[i] = m_condImpl.coeff(index+i);

From 4df8b5a75e76a2f99e623da2a59cb9d6f591b914 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 25 Mar 2015 14:36:07 -0700
Subject: [PATCH 4/4] Avoid making an unnecessary copy of the tensor expression
 when evaluating it on a GPU device

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index b6ea655f3..17f10c07b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -144,7 +144,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
 
   protected:
     const GpuDevice& m_device;
-    ExpressionType m_expression;
+    ExpressionType& m_expression;
 };
 #endif