From 7f1c29fb0c26e92f31446926c441e13b0f6aec68 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 11 Nov 2015 15:22:50 -0800
Subject: [PATCH] Make it possible for a vectorized tensor expression to be
 executed in a CUDA kernel.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index bc06ca1f0..956672771 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -50,6 +50,7 @@ class TensorExecutor<Expression, DefaultDevice, true>
 {
  public:
   typedef typename Expression::Index Index;
+  EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
   {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
@@ -57,7 +58,7 @@ class TensorExecutor<Expression, DefaultDevice, true>
     if (needs_assign)
     {
       const Index size = array_prod(evaluator.dimensions());
-      static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
       const Index VectorizedSize = (size / PacketSize) * PacketSize;
       for (Index i = 0; i < VectorizedSize; i += PacketSize) {