From abdbe8562e889a0ca0877d607cfd5c4cbf937e3a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 24 Mar 2015 10:45:46 -0700 Subject: [PATCH 1/4] Fixed the CUDA packet primitives --- Eigen/src/Core/arch/CUDA/PacketMath.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 19749c832..ceed1d1ef 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(cons } #endif -template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, int stride) { +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); } -template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, int stride) { +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { return make_double2(from[0*stride], from[1*stride]); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, int stride) { +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { to[stride*0] = from.x; to[stride*1] = from.y; to[stride*2] = from.z; to[stride*3] = from.w; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, int stride) { +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { to[stride*0] = from.x; to[stride*1] = from.y; } @@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) } template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { - return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); } template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { - 
return make_double2(abs(a.x), abs(a.y)); + return make_double2(fabs(a.x), fabs(a.y)); } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; @@ -279,7 +279,7 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3].z = tmp; } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; From ccf290a65cda00bfe12bbd5f4647aca5b371b6fb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Mar 2015 12:37:38 -0700 Subject: [PATCH 2/4] Cleaned up the TensorDevice code a little bit. --- unsupported/Eigen/CXX11/Tensor | 2 +- .../Eigen/CXX11/src/Tensor/TensorDevice.h | 25 +++++++------------ 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 34107ae71..200bcf966 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -80,8 +80,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 7a67c56b3..b6ea655f3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -32,8 +32,7 @@ template class TensorDevice { EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp Assign; Assign assign(m_expression, other); - static const bool 
Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -44,8 +43,7 @@ template class TensorDevice { Sum sum(m_expression, other); typedef TensorAssignOp Assign; Assign assign(m_expression, sum); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -56,8 +54,7 @@ template class TensorDevice { Difference difference(m_expression, other); typedef TensorAssignOp Assign; Assign assign(m_expression, difference); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -76,8 +73,7 @@ template class TensorDevice Assign; Assign assign(m_expression, other); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -88,8 +84,7 @@ template class TensorDevice Assign; Assign assign(m_expression, sum); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -100,8 +95,7 @@ template class TensorDevice Assign; Assign assign(m_expression, difference); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -122,7 +116,7 @@ template class TensorDevice EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { typedef TensorAssignOp Assign; Assign assign(m_expression, other); - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -133,7 +127,7 @@ template class 
TensorDevice Sum sum(m_expression, other); typedef TensorAssignOp Assign; Assign assign(m_expression, sum); - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -144,8 +138,7 @@ template class TensorDevice Difference difference(m_expression, other); typedef TensorAssignOp Assign; Assign assign(m_expression, difference); - static const bool Vectorize = TensorEvaluator::PacketAccess; - internal::TensorExecutor::run(assign, m_device); + internal::TensorExecutor::run(assign, m_device); return *this; } From b3343bfdae40815ae9e01ad2bd8fa226925248c8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Mar 2015 13:25:53 -0700 Subject: [PATCH 3/4] Fixed the vectorized implementation of the Tensor select() method --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d084880de..9198c17ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -352,11 +352,12 @@ template, Device> { typedef TensorSelectOp XprType; + typedef typename XprType::Scalar Scalar; enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & - TensorEvaluator::PacketAccess*/, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::packet_traits::HasBlend, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented }; @@ -373,7 +374,6 @@ struct TensorEvaluator } typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; @@ -403,7 
+403,7 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - static const int PacketSize = internal::unpacket_traits::size; + const int PacketSize = internal::unpacket_traits::size; internal::Selector select; for (Index i = 0; i < PacketSize; ++i) { select.select[i] = m_condImpl.coeff(index+i); From 4df8b5a75e76a2f99e623da2a59cb9d6f591b914 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Mar 2015 14:36:07 -0700 Subject: [PATCH 4/4] Avoid making an unnecessary copy of the tensor expression when evaluating it on a GPU device --- unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index b6ea655f3..17f10c07b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -144,7 +144,7 @@ template class TensorDevice protected: const GpuDevice& m_device; - ExpressionType m_expression; + ExpressionType& m_expression; }; #endif