diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 5b0abe2e6..7b481d512 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, c
   to[stride*1] = from.y;
 }
 
+template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+
 template<> EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2,2>& kernel) {
   double tmp = kernel.packet[0].y;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 8fdd7d898..001907a0b 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -297,7 +297,9 @@ namespace Eigen {
   * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
   * vectorized and non-vectorized code.
   */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+#if (defined __CUDACC__)
+#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
 #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #elif (defined _MSC_VER)
 #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 01fa04c64..4fa8e83ef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -149,26 +149,26 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <typename Evaluator>
+template <typename Evaluator, typename Index>
 __global__ void __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, unsigned int size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
-  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const int step_size = blockDim.x * gridDim.x;
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
 
   if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
     // Use the scalar path
-    for (int i = first_index; i < size; i += step_size) {
+    for (Index i = first_index; i < size; i += step_size) {
       eval.evalScalar(i);
     }
   }
   else {
     // Use the vector path
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const int vectorized_step_size = step_size * PacketSize;
-    const int vectorized_size = (size / PacketSize) * PacketSize;
-    int i = first_index * PacketSize;
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_step_size = step_size * PacketSize;
+    const Index vectorized_size = (size / PacketSize) * PacketSize;
+    Index i = first_index * PacketSize;
     for ( ; i < vectorized_size; i += vectorized_step_size) {
       eval.evalPacket(i);
     }
@@ -193,7 +193,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
     const int block_size = maxCudaThreadsPerBlock();
     const Index size = array_prod(evaluator.dimensions());
-    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index><<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
     assert(cudaGetLastError() == cudaSuccess);
   }
   evaluator.cleanup();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 3447592eb..33849ed3e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -459,7 +459,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
     }
     else {
-      CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 89c0cff05..d6347b054 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -98,7 +98,6 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     for (int i = 0; i < NumDims; ++i) {
       m_dimensions[i] += m_padding[i].first + m_padding[i].second;
     }
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
 
     m_inputStrides[0] = 1;
     m_outputStrides[0] = 1;
@@ -125,6 +124,7 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
+    eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
@@ -151,11 +151,11 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     const Index initialIndex = index;
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const int first = index;
-      const int last = index + packetSize - 1;
-      const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
-      const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
-      const int lastPaddedRight = m_outputStrides[i+1];
+      const Index first = index;
+      const Index last = index + packetSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i+1];
 
       if (last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
@@ -179,9 +179,9 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     const Index last = index + packetSize - 1;
     const Index first = index;
 
-    const int lastPaddedLeft = m_padding[0].first;
-    const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
-    const int lastPaddedRight = m_outputStrides[1];
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
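
Note on the TensorExecutor.h change: templating EigenMetaKernel on the Index
type, instead of hardcoding int/unsigned int for the size and loop counters,
keeps the grid-stride loop from overflowing on tensors with more than 2^31
coefficients. Below is a minimal standalone sketch of the same pattern,
assuming a CUDA toolchain; the kernel name, launch configuration, and buffer
size are hypothetical and not part of this patch.

// scale.cu - illustrative grid-stride loop with a templated index type,
// mirroring the shape of EigenMetaKernel after this patch.
#include <cassert>
#include <cuda_runtime.h>

template <typename Index>
__global__ void ScaleKernel(float* data, Index size, float factor) {
  // Each thread starts at its global id and strides by the total number of
  // threads in the grid, so any grid/block shape covers all elements.
  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;
  for (Index i = first_index; i < size; i += step_size) {
    data[i] *= factor;
  }
}

int main() {
  const long long size = 1 << 20;
  float* data = NULL;
  cudaMalloc(&data, size * sizeof(float));
  cudaMemset(data, 0, size * sizeof(float));
  // A 64-bit Index instantiation, as the patched launch site gets via
  // EigenMetaKernel<TensorEvaluator<...>, Index>.
  ScaleKernel<long long><<<128, 256>>>(data, size, 2.0f);
  assert(cudaGetLastError() == cudaSuccess);
  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}

The EIGEN_ALIGN_DEFAULT fix in TensorMorphing.h belongs to the packet
machinery rather than to indexing: internal::pstore issues an aligned store,
so the scratch array it writes to must be aligned to the packet boundary; an
unaligned stack buffer makes that store undefined behavior on SSE/AVX
targets. The new __CUDACC__ branch in Macros.h serves the same end on the
device side, mapping EIGEN_ALIGN_TO_BOUNDARY onto nvcc's __align__ qualifier
so that alignment attributes are honored in CUDA code.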