diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 5b0abe2e6..7b481d512 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, c
   to[stride*1] = from.y;
 }
 
+template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+
 template<> EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2,2>& kernel) {
   double tmp = kernel.packet[0].y;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 8fdd7d898..001907a0b 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -297,7 +297,9 @@ namespace Eigen {
   * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
   * vectorized and non-vectorized code.
   */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+#if (defined __CUDACC__)
+#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
 #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #elif (defined _MSC_VER)
 #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 01fa04c64..4fa8e83ef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -149,26 +149,26 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <typename Evaluator>
+template <typename Evaluator, typename Index>
 __global__ void __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, unsigned int size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
-  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const int step_size = blockDim.x * gridDim.x;
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
 
   if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
     // Use the scalar path
-    for (int i = first_index; i < size; i += step_size) {
+    for (Index i = first_index; i < size; i += step_size) {
       eval.evalScalar(i);
     }
   }
   else {
     // Use the vector path
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const int vectorized_step_size = step_size * PacketSize;
-    const int vectorized_size = (size / PacketSize) * PacketSize;
-    int i = first_index * PacketSize;
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_step_size = step_size * PacketSize;
+    const Index vectorized_size = (size / PacketSize) * PacketSize;
+    Index i = first_index * PacketSize;
     for ( ; i < vectorized_size; i += vectorized_step_size) {
       eval.evalPacket(i);
     }
@@ -193,7 +193,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
     const int block_size = maxCudaThreadsPerBlock();
     const Index size = array_prod(evaluator.dimensions());
-    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index><<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
     assert(cudaGetLastError() == cudaSuccess);
   }
   evaluator.cleanup();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 3447592eb..33849ed3e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -459,7 +459,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
     }
     else {
-      CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 89c0cff05..d6347b054 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -98,7 +98,6 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     for (int i = 0; i < NumDims; ++i) {
       m_dimensions[i] += m_padding[i].first + m_padding[i].second;
     }
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
 
     m_inputStrides[0] = 1;
     m_outputStrides[0] = 1;
@@ -125,6 +124,7 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
+    eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
@@ -151,11 +151,11 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     const Index initialIndex = index;
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const int first = index;
-      const int last = index + packetSize - 1;
-      const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
-      const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
-      const int lastPaddedRight = m_outputStrides[i+1];
+      const Index first = index;
+      const Index last = index + packetSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i+1];
 
       if (last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
@@ -179,9 +179,9 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     const Index last = index + packetSize - 1;
     const Index first = index;
 
-    const int lastPaddedLeft = m_padding[0].first;
-    const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
-    const int lastPaddedRight = m_outputStrides[1];
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
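
Note on the TensorExecutor.h change: templating EigenMetaKernel on the Index
type, instead of hardcoding int/unsigned int for the size and loop counters,
keeps the grid-stride loop from overflowing on tensors with more than 2^31
coefficients. Below is a minimal standalone sketch of the same pattern,
assuming a CUDA toolchain; the kernel name, launch configuration, and buffer
size are hypothetical and not part of this patch.

// scale.cu - illustrative grid-stride loop with a templated index type,
// mirroring the shape of EigenMetaKernel after this patch.
#include <cassert>
#include <cuda_runtime.h>

template <typename Index>
__global__ void ScaleKernel(float* data, Index size, float factor) {
  // Each thread starts at its global id and strides by the total number of
  // threads in the grid, so any grid/block shape covers all elements.
  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;
  for (Index i = first_index; i < size; i += step_size) {
    data[i] *= factor;
  }
}

int main() {
  const long long size = 1 << 20;
  float* data = NULL;
  cudaMalloc(&data, size * sizeof(float));
  cudaMemset(data, 0, size * sizeof(float));
  // A 64-bit Index instantiation, as the patched launch site gets via
  // EigenMetaKernel<TensorEvaluator<...>, Index>.
  ScaleKernel<long long><<<128, 256>>>(data, size, 2.0f);
  assert(cudaGetLastError() == cudaSuccess);
  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}

The EIGEN_ALIGN_DEFAULT fix in TensorMorphing.h belongs to the packet
machinery rather than to indexing: internal::pstore issues an aligned store,
so the scratch array it writes to must be aligned to the packet boundary; an
unaligned stack buffer makes that store undefined behavior on SSE/AVX
targets. The new __CUDACC__ branch in Macros.h serves the same end on the
device side, mapping EIGEN_ALIGN_TO_BOUNDARY onto nvcc's __align__ qualifier
so that alignment attributes are honored in CUDA code.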