diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index b004f76ce..762cbfc3d 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -745,7 +745,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 #   endif
 
   protected:
-    Index redux_length() const
+    EIGEN_DEVICE_FUNC Index redux_length() const
     {
       return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
     }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 483297356..979d974a7 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -586,7 +586,7 @@ struct extract_data_selector {
 
 template<typename T>
 struct extract_data_selector<T,false> {
-  static typename T::Scalar* run(const T&) { return 0; }
+  EIGEN_DEVICE_FUNC static typename T::Scalar* run(const T&) { return 0; }
 };
 
 template<typename T>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 70d21292a..e6e586b7b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -123,7 +123,9 @@ class IndexMapper {
         inputIndex += idx * m_inputStrides[d];
         p -= idx * m_gpuInputStrides[d];
       }
-      inputIndex += p * m_inputStrides[NumKernelDims];
+      if (NumKernelDims < NumDims) {
+        inputIndex += p * m_inputStrides[NumKernelDims];
+      }
     } else {
       std::ptrdiff_t limit = 0;
       if (NumKernelDims < NumDims) {
@@ -147,7 +149,9 @@ class IndexMapper {
         outputIndex += idx * m_outputStrides[d];
         p -= idx * m_gpuOutputStrides[d];
       }
-      outputIndex += p * m_outputStrides[NumKernelDims];
+      if (NumKernelDims < NumDims) {
+        outputIndex += p * m_outputStrides[NumKernelDims];
+      }
     } else {
       std::ptrdiff_t limit = 0;
       if (NumKernelDims < NumDims) {
@@ -386,7 +390,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     preloadKernel();
     return true;
@@ -824,7 +828,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
     preloadKernel();
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
@@ -1112,9 +1116,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   }
 
  private:
-  // No assignment (copies are needed by the kernels)
-  TensorEvaluator& operator = (const TensorEvaluator&);
-
   TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
   TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
   KernelArgType m_kernelArg;
diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 79f4066e9..d9d5da43d 100644
--- a/unsupported/test/cxx11_tensor_argmax_gpu.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -23,8 +23,8 @@ template <int Layout>
 void test_gpu_simple_argmax()
 {
   Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
-  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
-  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+  Tensor<DenseIndex, 0, Layout> out_max;
+  Tensor<DenseIndex, 0, Layout> out_min;
   in.setRandom();
   in *= in.constant(100.0);
   in(0, 0, 0) = -1000.0;
@@ -46,8 +46,8 @@ void test_gpu_simple_argmax()
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
-  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout>, Aligned > gpu_out_max(d_out_max);
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout>, Aligned > gpu_out_min(d_out_min);
 
   gpu_out_max.device(gpu_device) = gpu_in.argmax();
   gpu_out_min.device(gpu_device) = gpu_in.argmin();
@@ -56,8 +56,8 @@ void test_gpu_simple_argmax()
   assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
-  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
-  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+  VERIFY_IS_EQUAL(out_max(), 72*53*97 - 1);
+  VERIFY_IS_EQUAL(out_min(), 0);
 
   gpuFree(d_in);
   gpuFree(d_out_max);
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 7b3fb5ab1..31baf1bb7 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -1100,9 +1100,9 @@ void test_gpu_erfc(const Scalar stddev)
 template <typename Scalar>
 void test_gpu_ndtri()
 {
-  Tensor<Scalar, 1> in_x(8);
-  Tensor<Scalar, 1> out(8);
-  Tensor<Scalar, 1> expected_out(8);
+  Tensor<Scalar, 1> in_x(9);
+  Tensor<Scalar, 1> out(9);
+  Tensor<Scalar, 1> expected_out(9);
   out.setZero();
 
   in_x(0) = Scalar(1);