Mirror of https://gitlab.com/libeigen/eigen.git
Merged from trunk
commit 67fcf47ecb
@@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, c
   to[stride*1] = from.y;
 }
 
+template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(abs(a.x), abs(a.y));
+}
+
+
 template<> EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
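The hunk above adds pfirst (return the first lane) and pabs (lane-wise absolute value) for the CUDA packet types float4 and double2. Below is a minimal standalone sketch of what these primitives compute, in plain CUDA with illustrative names (abs4, demo) rather than Eigen's internals:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Lane-wise absolute value, mirroring pabs<float4>.
    __device__ float4 abs4(const float4& a) {
      return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
    }

    __global__ void demo() {
      float4 v = make_float4(-1.f, 2.f, -3.f, 4.f);
      float4 a = abs4(v);   // {1, 2, 3, 4}
      float first = a.x;    // what pfirst<float4> returns: the first lane
      printf("abs = %f %f %f %f, first = %f\n", a.x, a.y, a.z, a.w, first);
    }

    int main() {
      demo<<<1, 1>>>();
      cudaDeviceSynchronize();
      return 0;
    }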
@@ -297,7 +297,9 @@ namespace Eigen {
  * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
  * vectorized and non-vectorized code.
  */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+#if (defined __CUDACC__)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #elif (defined _MSC_VER)
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
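EIGEN_ALIGN_TO_BOUNDARY now prefers nvcc's __align__ keyword when __CUDACC__ is defined, falling back to __attribute__((aligned)) or __declspec(align) otherwise. A hedged sketch of the same compiler dispatch applied to a user type; MY_ALIGN and Vec4 are illustrative stand-ins, not Eigen's macro:

    #include <cstddef>
    #include <cstdio>

    #if defined(__CUDACC__)
      #define MY_ALIGN(n) __align__(n)
    #elif defined(__GNUC__) || defined(__PGI) || defined(__IBMCPP__) || defined(__ARMCC_VERSION)
      #define MY_ALIGN(n) __attribute__((aligned(n)))
    #elif defined(_MSC_VER)
      #define MY_ALIGN(n) __declspec(align(n))
    #else
      #define MY_ALIGN(n)
    #endif

    // 16-byte alignment lets 128-bit packet loads/stores hit this struct directly.
    struct MY_ALIGN(16) Vec4 { float data[4]; };

    int main() {
      printf("alignof(Vec4) = %zu\n", (size_t)alignof(Vec4));
      return 0;
    }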
@@ -149,26 +149,26 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <typename Evaluator>
+template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, unsigned int size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
-  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const int step_size = blockDim.x * gridDim.x;
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
 
   if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
     // Use the scalar path
-    for (int i = first_index; i < size; i += step_size) {
+    for (Index i = first_index; i < size; i += step_size) {
       eval.evalScalar(i);
     }
   }
   else {
     // Use the vector path
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const int vectorized_step_size = step_size * PacketSize;
-    const int vectorized_size = (size / PacketSize) * PacketSize;
-    int i = first_index * PacketSize;
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_step_size = step_size * PacketSize;
+    const Index vectorized_size = (size / PacketSize) * PacketSize;
+    Index i = first_index * PacketSize;
     for ( ; i < vectorized_size; i += vectorized_step_size) {
       eval.evalPacket(i);
     }
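EigenMetaKernel is now templated on the index type instead of hard-coding unsigned int, so tensors with more than 2^31 coefficients can be addressed. A standalone sketch of the same grid-stride pattern with a templated index; grid_stride_fill and the launch geometry are illustrative, not Eigen's kernel:

    #include <cuda_runtime.h>

    // Grid-stride loop over `size` elements; with a 64-bit Index the loop
    // counter cannot overflow on very large tensors.
    template <typename Index>
    __global__ void grid_stride_fill(float* out, Index size, float value) {
      const Index first = static_cast<Index>(blockIdx.x) * blockDim.x + threadIdx.x;
      const Index step  = static_cast<Index>(blockDim.x) * gridDim.x;
      for (Index i = first; i < size; i += step) {
        out[i] = value;
      }
    }

    int main() {
      const long long n = 1 << 20;
      float* d = nullptr;
      cudaMalloc(&d, n * sizeof(float));
      grid_stride_fill<long long><<<128, 256>>>(d, n, 1.0f);
      cudaDeviceSynchronize();
      cudaFree(d);
      return 0;
    }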
@@ -193,7 +193,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
       const int block_size = maxCudaThreadsPerBlock();
 
       const Index size = array_prod(evaluator.dimensions());
-      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index><<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
       assert(cudaGetLastError() == cudaSuccess);
     }
     evaluator.cleanup();
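The launch site now spells out both template arguments so the kernel's Index parameter matches the size computed on the host, and the existing assert on cudaGetLastError() still catches bad launch configurations. A small sketch of that launch-and-check pattern on a dedicated stream; noop_kernel and the grid dimensions are illustrative:

    #include <cassert>
    #include <cuda_runtime.h>

    template <typename Index>
    __global__ void noop_kernel(Index /*size*/) {}

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Explicit <long long> keeps the kernel's index type in sync with the
      // 64-bit size passed at the call site.
      noop_kernel<long long><<<32, 128, 0, stream>>>(0LL);
      assert(cudaGetLastError() == cudaSuccess);  // reports invalid launch parameters

      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
      return 0;
    }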
@@ -459,7 +459,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
     }
     else {
-      CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
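pstore issues an aligned packet store, so the scratch buffer it writes to must carry the packet's alignment; that is what EIGEN_ALIGN_DEFAULT adds here. A rough equivalent in plain SSE terms, using alignas as a stand-in for the Eigen macro (illustrative only):

    #include <immintrin.h>
    #include <cstdio>

    int main() {
      alignas(16) float values[4];              // aligned scratch, like EIGEN_ALIGN_DEFAULT
      __m128 x = _mm_set_ps(4.f, 3.f, 2.f, 1.f);
      _mm_store_ps(values, x);                  // aligned store; faults on a misaligned buffer
      printf("%f ... %f\n", values[0], values[3]);
      return 0;
    }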
@@ -98,7 +98,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     for (int i = 0; i < NumDims; ++i) {
       m_dimensions[i] += m_padding[i].first + m_padding[i].second;
     }
-
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     m_inputStrides[0] = 1;
     m_outputStrides[0] = 1;
@@ -125,6 +124,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
+    eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
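coeff() decomposes the flat index by dividing by the output strides, starting from the dimension with the largest stride, and the new eigen_assert guards against an index past the total number of coefficients. A small host-side sketch of that stride walk with the same kind of bounds check; the shape and stride values are illustrative:

    #include <cassert>
    #include <cstdio>

    int main() {
      // Column-major 4 x 5 x 6 tensor: strides[i] is the product of the dimensions before i.
      const long long dims[3]    = {4, 5, 6};
      const long long strides[3] = {1, 4, 20};
      const long long total      = dims[0] * dims[1] * dims[2];

      long long index = 97;
      assert(index < total);                 // same guard as the added eigen_assert

      long long coords[3];
      for (int i = 2; i > 0; --i) {          // peel off the dimension with the largest stride first
        coords[i] = index / strides[i];
        index -= coords[i] * strides[i];
      }
      coords[0] = index;
      printf("flat 97 -> (%lld, %lld, %lld)\n", coords[0], coords[1], coords[2]);
      return 0;
    }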
@@ -151,11 +151,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     const Index initialIndex = index;
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const int first = index;
-      const int last = index + packetSize - 1;
-      const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
-      const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
-      const int lastPaddedRight = m_outputStrides[i+1];
+      const Index first = index;
+      const Index last = index + packetSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i+1];
 
       if (last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
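The padded-region boundaries are products of a padding amount and an output stride, so computing them in 32-bit int can overflow once tensors get large; switching them to Index (typically a 64-bit type) avoids that, here and in the rank-0 case below. A tiny numeric illustration with made-up dimensions:

    #include <climits>
    #include <cstdio>

    int main() {
      // Outermost stride of a 2000 x 2000 x 2000 tensor, padded by 600 planes.
      const long long outputStride = 2000LL * 2000LL;                // 4,000,000
      const long long paddingFirst = 600;

      const long long lastPaddedLeft = paddingFirst * outputStride;  // 2,400,000,000
      printf("lastPaddedLeft = %lld, INT_MAX = %d, fits in int: %s\n",
             lastPaddedLeft, INT_MAX, lastPaddedLeft <= INT_MAX ? "yes" : "no");
      return 0;
    }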
@@ -179,9 +179,9 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index last = index + packetSize - 1;
     const Index first = index;
-    const int lastPaddedLeft = m_padding[0].first;
-    const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
-    const int lastPaddedRight = m_outputStrides[1];
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.