Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-04-22 17:49:36 +08:00)
[SYCL] This PR adds the minimum modifications to the Eigen unsupported module required to run it on devices supporting SYCL.

* Abstracting the pointer type so that both SYCL memory and raw pointers can be captured.
* Converting the SYCL virtual pointer to SYCL device memory inside the Eigen evaluator classes.
* Binding the SYCL placeholder accessors to the command group handler via a bind() method on each Eigen evaluator node.
* Adding a SYCL macro for controlling loop unrolling.
* Modifying TensorDeviceSycl.h and the SYCL executor method to adopt the above changes.

A short sketch of the per-evaluator pattern these changes introduce is given below, just before the diff.
parent 16a56b2ddd
commit 7d08fa805a
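The following is a minimal, hedged sketch of the pattern the commit applies to each evaluator it touches; it is not the actual Eigen code. The names StorageMemory, EvaluatorPointerType, EIGEN_USE_SYCL and bind are taken from the hunks below, while the sketch namespace, the toy Evaluator and the float CoeffReturnType are illustrative stand-ins.

// Hedged sketch of the per-evaluator changes, assuming a host build where
// StorageMemory<T, Device>::Type is just T*; a SYCL backend would specialize it.
#include <cstddef>

namespace sketch {

// Host-side storage is a plain pointer; the SYCL specialization would make
// Type a device-memory / placeholder-accessor wrapper instead of T*.
template <typename T, typename Device>
struct StorageMemory { typedef T* Type; };

template <typename ArgEvaluator, typename Device>
struct Evaluator {
  typedef float CoeffReturnType;                          // stand-in scalar type
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;    // replaces bare Scalar*

  // Signatures that used to take or return Scalar* now use the abstracted type.
  bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }
  EvaluatorPointerType data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
  // Placeholder accessors must be bound to the command group handler before a
  // kernel is enqueued; each evaluator forwards bind() to its nested evaluators.
  void bind(cl::sycl::handler &cgh) const { m_impl.bind(cgh); }
#endif

  ArgEvaluator m_impl;
};

}  // namespace sketch

On the host the abstraction collapses back to a raw pointer, so non-SYCL builds keep their existing behaviour; per the commit message, the SYCL specialization of StorageMemory swaps in device memory, and bind() attaches the placeholder accessors to the command group handler before execution.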
@@ -37,7 +37,7 @@ struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
 template<typename XprType>
 struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorIndexTupleOp<XprType>& type;
+  typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename XprType>
@@ -82,6 +82,8 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
 
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   static const int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -100,7 +102,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -118,11 +120,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
 #ifdef EIGEN_USE_SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
-    return m_impl;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
   }
 #endif
 
@@ -154,7 +156,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
 template<typename ReduceOp, typename Dims, typename XprType>
 struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
 {
-  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
+  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename ReduceOp, typename Dims, typename XprType>
@@ -216,6 +218,9 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
   static const int NumDims = internal::array_size<InputDimensions>::value;
   typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<TupleType, Device> TupleStorageMem;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -231,9 +236,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
       : m_orig_impl(op.expression(), device),
         m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
         m_return_dim(op.return_dim())
-#ifdef EIGEN_USE_SYCL
-        ,m_device(device)
-#endif
   {
     gen_strides(m_orig_impl.dimensions(), m_strides);
     if (Layout == static_cast<int>(ColMajor)) {
@@ -243,14 +245,17 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
       m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
     }
-    m_stride_div = (m_return_dim >= 0) ? m_strides[m_return_dim] : 1;
+    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+    m_stride_div = ((m_return_dim >= 0) &&
+                    (m_return_dim < static_cast<Index>(m_strides.size())))
+                   ? m_strides[m_return_dim] : 1;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -263,15 +268,12 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
 
-#ifndef EIGEN_USE_SYCL
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-#else // following functions are required by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}
-  const Device& device() const{return m_device;}
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_orig_impl.bind(cgh);
+  }
 #endif
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -312,9 +314,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   StrideDims m_strides;
   Index m_stride_mod;
   Index m_stride_div;
-#ifdef EIGEN_USE_SYCL
-  const Device& m_device;
-#endif
 };
 
 } // end namespace Eigen
@@ -97,6 +97,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   static const int NumDims = XprType::NumDims;
@@ -136,7 +138,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     return m_rightImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
@@ -154,6 +156,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+
     const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
     const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
@@ -199,13 +202,15 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
       m_leftImpl.writeBlock(*block);
     }
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+#endif
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
-
-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_leftImpl.data(); }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
 
  private:
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
@@ -841,7 +841,7 @@ struct TensorBlockView {
   const Scalar* data() const { return m_data; }
 
  private:
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_block_sizes, m_block_strides;
   const Scalar* m_data; // Not owned.
   Scalar* m_allocated_data; // Owned.
@@ -37,7 +37,7 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType
 template<typename Broadcast, typename XprType>
 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
 {
-  typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
+  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<typename Broadcast, typename XprType>
@@ -105,7 +105,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout;
   bool isCopy, nByOne, oneByN;
+  public:
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = true,
@@ -205,7 +209,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -238,6 +242,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   // TODO: attempt to speed this up. The integer divisions and modulo are slow
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -272,6 +277,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -376,6 +382,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       values[0] = m_impl.coeff(inputIndex);
       return internal::pload1<PacketReturnType>(values);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
         if (outputOffset + cur < m_outputStrides[endDim]) {
           values[i] = m_impl.coeff(inputIndex);
@@ -410,6 +417,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       return m_impl.template packet<Unaligned>(inputIndex);
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         if (inputIndex > m_inputStrides[dim]-1) {
           inputIndex = 0;
@@ -441,6 +449,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       values[0] = m_impl.coeff(inputIndex);
       return internal::pload1<PacketReturnType>(values);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
         if (outputOffset + cur < m_outputStrides[dim]) {
           values[i] = m_impl.coeff(inputIndex);
@@ -465,6 +474,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -500,6 +510,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
         if (innermostLoc + i < m_impl.dimensions()[0]) {
           values[i] = m_impl.coeff(inputIndex+i);
@@ -521,6 +532,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -556,6 +568,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
         if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
           values[i] = m_impl.coeff(inputIndex+i);
@@ -572,6 +585,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   costPerCoeff(bool vectorized) const {
     double compute_cost = TensorOpCost::AddCost<Index>();
     if (!isCopy && NumDims > 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         compute_cost += TensorOpCost::DivCost<Index>();
         if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -845,12 +859,17 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       }
     }
 
-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
   Broadcast functor() const { return m_broadcast; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  private:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
       const Dimensions& input_block_sizes,
@@ -875,8 +894,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   }
 
  protected:
-  const Device& m_device;
-  const Broadcast m_broadcast;
+  const Device EIGEN_DEVICE_REF m_device;
+  const typename internal::remove_reference<Broadcast>::type m_broadcast;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;
@@ -38,7 +38,7 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
 template<DenseIndex DimId, typename XprType>
 struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
 {
-  typedef const TensorChippingOp<DimId, XprType>& type;
+  typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<DenseIndex DimId, typename XprType>
@@ -139,7 +139,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
@@ -169,7 +170,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
       OutputTensorBlock;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
+      : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
     eigen_assert(NumInputDims > m_dim.actualDim());
@@ -218,7 +219,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -243,6 +244,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
       eigen_assert(m_stride == 1);
       Index inputIndex = index * m_inputStride + m_inputOffset;
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         values[i] = m_impl.coeff(inputIndex);
         inputIndex += m_inputStride;
@@ -262,6 +264,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     } else {
       // Cross the stride boundary. Fallback to slow path.
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         values[i] = coeff(index);
         ++index;
@@ -349,26 +352,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     m_impl.block(&input_block);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
-    CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
     if (IsOuterChipping && result) {
       return result + m_inputOffset;
     } else {
       return NULL;
     }
   }
-
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const {
-    return m_dim.actualDim();
-  }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const {
-    return m_offset;
-  }
-  /// required by sycl in order to extract the accessor
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
@@ -399,10 +396,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   DSizes<Index, NumInputDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   const internal::DimensionId<DimId> m_dim;
-  const Device& m_device;
-  // required by sycl
-  const DenseIndex m_offset;
+  const Device EIGEN_DEVICE_REF m_device;
 
 };
 
 
@@ -466,6 +460,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         this->m_impl.coeffRef(inputIndex) = values[i];
         inputIndex += this->m_inputStride;
@@ -484,6 +479,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
       // Cross stride boundary. Fallback to slow path.
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         this->coeffRef(index) = values[i];
         ++index;
@@ -119,6 +119,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
@@ -181,7 +183,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
   {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
@@ -219,11 +221,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
     Index left_index;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       left_index = subs[0];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < NumDims; ++i) {
         left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
       }
     } else {
       left_index = subs[NumDims - 1];
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i >= 0; --i) {
         left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
       }
@@ -235,11 +239,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
     Index right_index;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       right_index = subs[0];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < NumDims; ++i) {
         right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
       }
     } else {
       right_index = subs[NumDims - 1];
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i >= 0; --i) {
         right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
       }
@@ -257,6 +263,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
     eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -279,13 +286,15 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
            TensorOpCost(0, 0, compute_cost);
   }
 
-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
-  /// required by sycl in order to extract the accessor
-  const Axis& axis() const { return m_axis; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+#endif
 
 protected:
   Dimensions m_dimensions;
@@ -433,6 +433,8 @@ struct TensorContractionEvaluatorBase
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = true,
@@ -453,6 +455,9 @@ struct TensorContractionEvaluatorBase
   typedef typename internal::conditional<
     static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
 
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
+
   static const int LDims =
     internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
   static const int RDims =
@@ -653,14 +658,14 @@ struct TensorContractionEvaluatorBase
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
@@ -934,7 +939,7 @@ struct TensorContractionEvaluatorBase
     return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
 
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
@@ -1169,9 +1174,9 @@
 
   TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
   TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   OutputKernelType m_output_kernel;
-  Scalar* m_result;
+  EvaluatorPointerType m_result;
   bool m_can_use_xsmm;
 };
 
@@ -59,6 +59,13 @@ struct CoeffLoader {
     return m_tensor.template packet<LoadMode>(index);
   }
 
+#ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+#endif
+
  private:
   const Tensor m_tensor;
 };
@@ -87,6 +94,13 @@ struct CoeffLoader<Tensor, true, MakePointer_> {
   {
     return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
   }
+
+#ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
  private:
   typedef typename Tensor::Scalar Scalar;
 
@@ -139,6 +153,7 @@ class SimpleTensorContractionMapper {
     EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
     Index nocontract_val = left ? row : col;
     Index linidx = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
       const Index idx = nocontract_val / m_ij_strides[i];
       linidx += idx * m_nocontract_strides[i];
@@ -155,6 +170,7 @@ class SimpleTensorContractionMapper {
 
     Index contract_val = left ? col : row;
     if(array_size<contract_t>::value > 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx = contract_val / m_k_strides[i];
         linidx += idx * m_contract_strides[i];
@@ -179,6 +195,7 @@ class SimpleTensorContractionMapper {
     Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
     Index linidx[2] = {0, 0};
     if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      EIGEN_UNROLL_LOOP
      for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
        const Index idx0 = nocontract_val[0] / m_ij_strides[i];
        const Index idx1 = nocontract_val[1] / m_ij_strides[i];
@@ -199,6 +216,7 @@ class SimpleTensorContractionMapper {
 
     Index contract_val[2] = {left ? col : row, left ? col : row + distance};
     if (array_size<contract_t>::value> 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx0 = contract_val[0] / m_k_strides[i];
         const Index idx1 = contract_val[1] / m_k_strides[i];
@@ -230,6 +248,13 @@ class SimpleTensorContractionMapper {
     return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
   }
 
+#ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+#endif
+
   const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
     return m_tensor;
   }
@@ -302,6 +327,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     EIGEN_ALIGN_MAX Scalar data[packet_size];
 
     data[0] = this->m_tensor.coeff(first);
+    EIGEN_UNROLL_LOOP
     for (Index k = 1; k < packet_size - 1; k += 2) {
       const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
       data[k] = this->m_tensor.coeff(internal_pair.first);
@@ -472,6 +498,13 @@ class TensorContractionSubMapper {
     return false;
   }
 
+#ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_base_mapper.bind(cgh);
+  }
+#endif
+
   const ParentMapper& base_mapper() const { return m_base_mapper; }
   Index vert_offset() const { return m_vert_offset; }
   Index horiz_offset() const { return m_horiz_offset; }
@@ -515,6 +548,22 @@ class TensorContractionInputMapper
 };
 
 
+template <typename T> struct TensorContractionInputMapperTrait;
+
+template<typename Scalar_, typename Index_, int side_,
+         typename Tensor_,
+         typename nocontract_t_, typename contract_t_,
+         int packet_size_,
+         bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_, template <class> class MakePointer_>
+struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_,
+                                         nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_,
+                                         inner_dim_reordered_, Alignment_, MakePointer_> > {
+
+  typedef Tensor_ XprType;
+  static const bool inner_dim_contiguous = inner_dim_contiguous_;
+  static const bool inner_dim_reordered = inner_dim_reordered_;
+};
+
 
 } // end namespace internal
 } // end namespace Eigen
@@ -129,6 +129,7 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
     typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
     internal::scalar_cast_op<SrcType, TgtType> converter;
     EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < TgtPacketSize; ++i) {
       values[i] = converter(m_impl.coeff(index+i));
     }
@@ -164,15 +165,15 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
     typename XprType::Nested m_xpr;
 };
 
-template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
+template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
     impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 };
 
-template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
+template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
     return impl.evalSubExprsIfNeeded(data);
   }
 };
@@ -207,6 +208,7 @@ struct PacketConv {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
     internal::scalar_cast_op<SrcType, TargetType> converter;
     EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = converter(impl.coeff(index+i));
     }
@@ -267,10 +269,18 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   typedef typename PacketType<SrcType, Device>::type PacketSourceType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = true,
+    PacketAccess =
+    #ifndef EIGEN_USE_SYCL
+                   true,
+    #else
+                   TensorEvaluator<ArgType, Device>::PacketAccess &
+                   internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+    #endif
     BlockAccess = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -284,9 +294,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
   {
-    return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
+    return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
@@ -330,10 +340,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
     }
   }
 
-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   /// required by sycl in order to extract the sycl accessor
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
   protected:
     TensorEvaluator<ArgType, Device> m_impl;
@ -303,6 +303,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
|||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
|
typedef StorageMemory<Scalar, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
|
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
|
||||||
@ -469,7 +471,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
|||||||
PacketSize));
|
PacketSize));
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
|
||||||
@ -525,7 +527,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
|||||||
m_local_kernel = false;
|
m_local_kernel = false;
|
||||||
} else {
|
} else {
|
||||||
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
|
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
|
||||||
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
|
Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
|
||||||
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
||||||
EvalTo evalToTmp(local, m_kernelArg);
|
EvalTo evalToTmp(local, m_kernelArg);
|
||||||
const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
|
const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
|
||||||
@ -548,7 +550,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
|||||||
KernelArgType m_kernelArg;
|
KernelArgType m_kernelArg;
|
||||||
const Scalar* m_kernel;
|
const Scalar* m_kernel;
|
||||||
bool m_local_kernel;
|
bool m_local_kernel;
|
||||||
const Device& m_device;
|
const Device EIGEN_DEVICE_REF m_device;
|
||||||
};
|
};
|
||||||
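Note: members such as m_device (and later m_fft) switch from C++ references to EIGEN_DEVICE_REF because a SYCL kernel functor has to carry its state by value; a host reference captured into device code would be meaningless there. A sketch of the idea with a hypothetical fallback definition (the real macro lives in Eigen's tensor macro header and may differ in detail):

// Hypothetical stand-in definition, mirroring how the macro is used above:
// keep a reference on host-only builds, store by value for SYCL kernels.
#ifndef EIGEN_DEVICE_REF
#ifdef EIGEN_USE_SYCL
#define EIGEN_DEVICE_REF
#else
#define EIGEN_DEVICE_REF &
#endif
#endif

struct DeviceHolder {
  const int EIGEN_DEVICE_REF m_device;  // 'const int&' on host, 'const int' under SYCL
  explicit DeviceHolder(const int &d) : m_device(d) {}
};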
|
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
|
|||||||
template<typename CustomUnaryFunc, typename XprType>
|
template<typename CustomUnaryFunc, typename XprType>
|
||||||
struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
|
struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
|
||||||
{
|
{
|
||||||
typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
|
typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename CustomUnaryFunc, typename XprType>
|
template<typename CustomUnaryFunc, typename XprType>
|
||||||
@ -88,7 +88,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
|
|||||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
typedef typename PointerType<CoeffReturnType, Device>::Type PointerT;
|
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
IsAligned = false,
|
IsAligned = false,
|
||||||
@ -108,20 +110,20 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
|
||||||
if (data) {
|
if (data) {
|
||||||
evalTo(data);
|
evalTo(data);
|
||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
m_result = static_cast<PointerT>(
|
m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
|
||||||
m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
|
m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
|
||||||
evalTo(m_result);
|
evalTo(m_result);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||||
if (m_result != NULL) {
|
if (m_result) {
|
||||||
m_device.deallocate_temp(m_result);
|
m_device.deallocate_temp(m_result);
|
||||||
m_result = NULL;
|
m_result = NULL;
|
||||||
}
|
}
|
||||||
@ -141,22 +143,25 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
|
|||||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC PointerT data() const { return m_result; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
|
||||||
|
|
||||||
#ifdef EIGEN_USE_SYCL
|
#ifdef EIGEN_USE_SYCL
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_result.bind(cgh);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
EIGEN_DEVICE_FUNC void evalTo(PointerT data) {
|
EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) {
|
||||||
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(data, m_dimensions);
|
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
|
||||||
m_op.func().eval(m_op.expression(), result, m_device);
|
m_op.func().eval(m_op.expression(), result, m_device);
|
||||||
}
|
}
|
||||||
|
|
||||||
Dimensions m_dimensions;
|
Dimensions m_dimensions;
|
||||||
const ArgType m_op;
|
const ArgType m_op;
|
||||||
const Device& m_device;
|
const Device EIGEN_DEVICE_REF m_device;
|
||||||
PointerT m_result;
|
EvaluatorPointerType m_result;
|
||||||
};
|
};
|
||||||
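Note: the custom-op evaluator now routes its scratch buffer through allocate_temp/deallocate_temp and immediately rewraps the raw allocation with m_device.get(), so on SYCL the stored m_result is already an accessor-backed EvaluatorPointerType rather than a raw address. A host-side sketch of that lifecycle, assuming a device type exposing the allocate_temp/deallocate_temp/get hooks used above:

#include <cstddef>

template <typename Device>
float* make_scratch(const Device &dev, std::size_t count) {
  // allocate_temp returns void*; the evaluator casts and then wraps it with get().
  return dev.get(static_cast<float*>(dev.allocate_temp(count * sizeof(float))));
}

template <typename Device>
void release_scratch(const Device &dev, float *p) {
  dev.deallocate_temp(p);
}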
|
|
||||||
|
|
||||||
@ -251,7 +256,10 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
|
|||||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
typedef typename PointerType<CoeffReturnType, Device>::Type PointerT;
|
|
||||||
|
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
IsAligned = false,
|
IsAligned = false,
|
||||||
@ -271,12 +279,13 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(PointerT data) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
|
||||||
if (data) {
|
if (data) {
|
||||||
evalTo(data);
|
evalTo(data);
|
||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
m_result = static_cast<PointerT>(m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)));
|
m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
|
||||||
|
m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
|
||||||
evalTo(m_result);
|
evalTo(m_result);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -303,22 +312,25 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
|
|||||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC PointerT data() const { return m_result; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
|
||||||
|
|
||||||
#ifdef EIGEN_USE_SYCL
|
#ifdef EIGEN_USE_SYCL
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_result.bind(cgh);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
EIGEN_DEVICE_FUNC void evalTo(PointerT data) {
|
EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) {
|
||||||
TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(data, m_dimensions);
|
TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
|
||||||
m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
|
m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
|
||||||
}
|
}
|
||||||
|
|
||||||
Dimensions m_dimensions;
|
Dimensions m_dimensions;
|
||||||
const XprType m_op;
|
const XprType m_op;
|
||||||
const Device& m_device;
|
const Device EIGEN_DEVICE_REF m_device;
|
||||||
PointerT m_result;
|
EvaluatorPointerType m_result;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,6 +39,10 @@ struct DefaultDevice {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||||
::memset(buffer, c, n);
|
::memset(buffer, c, n);
|
||||||
}
|
}
|
||||||
|
template<typename Type>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||||
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
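Note: DefaultDevice (and, further down, GpuDevice and ThreadPoolDevice) gains the same trivial get(). It lets generic evaluator code always write device.get(ptr), which is a no-op on these devices and a virtual-pointer-to-device-memory translation on the SYCL device. A tiny sketch of the call-site pattern:

// Generic code never branches on the device type; it just funnels every
// pointer through get() before dereferencing it or handing it to memcpy.
template <typename Device, typename Scalar>
Scalar first_element(const Device &dev, Scalar *ptr) {
  return dev.get(ptr)[0];  // identity for DefaultDevice / ThreadPoolDevice / GpuDevice
}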
@ -54,7 +58,7 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
|
||||||
// Running on the host CPU
|
// Running on the host CPU
|
||||||
return l1CacheSize();
|
return l1CacheSize();
|
||||||
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
@ -67,7 +71,7 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
|
||||||
// Running single threaded on the host CPU
|
// Running single threaded on the host CPU
|
||||||
return l3CacheSize();
|
return l3CacheSize();
|
||||||
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
@ -215,6 +215,10 @@ struct GpuDevice {
|
|||||||
stream_->deallocate(buffer);
|
stream_->deallocate(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Type>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void* scratchpad() const {
|
EIGEN_STRONG_INLINE void* scratchpad() const {
|
||||||
return stream_->scratchpad();
|
return stream_->scratchpad();
|
||||||
|
File diff suppressed because it is too large
@ -76,6 +76,11 @@ struct ThreadPoolDevice {
|
|||||||
deallocate(buffer);
|
deallocate(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Type>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||||
#ifdef __ANDROID__
|
#ifdef __ANDROID__
|
||||||
::memcpy(dst, src, n);
|
::memcpy(dst, src, n);
|
||||||
|
@ -42,7 +42,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
|
|||||||
// Intermediate typedef to workaround MSVC issue.
|
// Intermediate typedef to workaround MSVC issue.
|
||||||
typedef MakePointer_<T> MakePointerT;
|
typedef MakePointer_<T> MakePointerT;
|
||||||
typedef typename MakePointerT::Type Type;
|
typedef typename MakePointerT::Type Type;
|
||||||
typedef typename MakePointerT::RefType RefType;
|
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -103,7 +102,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
|||||||
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
|
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
enum {
|
enum {
|
||||||
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
|
||||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||||
@ -115,22 +116,16 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
|||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||||
: m_impl(op.expression(), device), m_device(device),
|
: m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
|
||||||
m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
|
|
||||||
{ }
|
|
||||||
|
|
||||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
|
|
||||||
return m_op;
|
|
||||||
}
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer;
|
|
||||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
|
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
|
||||||
EIGEN_UNUSED_VARIABLE(scalar);
|
EIGEN_UNUSED_VARIABLE(scalar);
|
||||||
eigen_assert(scalar == NULL);
|
eigen_assert(scalar == NULL);
|
||||||
return m_impl.evalSubExprsIfNeeded(m_buffer);
|
return m_impl.evalSubExprsIfNeeded(m_buffer);
|
||||||
@ -165,19 +160,20 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
|||||||
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
|
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
|
||||||
ArgType expression() const { return m_expression; }
|
ArgType expression() const { return m_expression; }
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_impl.bind(cgh);
|
||||||
|
m_buffer.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
|
||||||
/// added for sycl in order to construct the buffer from the sycl device
|
|
||||||
const Device& device() const{return m_device;}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
TensorEvaluator<ArgType, Device> m_impl;
|
TensorEvaluator<ArgType, Device> m_impl;
|
||||||
const Device& m_device;
|
EvaluatorPointerType m_buffer;
|
||||||
DevicePointer m_buffer;
|
|
||||||
const XprType& m_op;
|
|
||||||
const ArgType m_expression;
|
const ArgType m_expression;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -34,6 +34,9 @@ struct TensorEvaluator
|
|||||||
typedef typename Derived::Dimensions Dimensions;
|
typedef typename Derived::Dimensions Dimensions;
|
||||||
typedef Derived XprType;
|
typedef Derived XprType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
|
typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
|
||||||
|
typedef StorageMemory<Scalar, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
// NumDimensions is -1 for variable dim tensors
|
// NumDimensions is -1 for variable dim tensors
|
||||||
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
|
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
|
||||||
@ -60,16 +63,17 @@ struct TensorEvaluator
|
|||||||
TensorBlockWriter;
|
TensorBlockWriter;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
||||||
: m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
|
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
|
||||||
|
m_dims(m.dimensions()),
|
||||||
|
m_device(device)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
|
||||||
const Derived& derived() const { return m_impl; }
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
|
||||||
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
|
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
|
||||||
m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
|
m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -78,14 +82,12 @@ struct TensorEvaluator
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||||
eigen_assert(m_data);
|
eigen_assert(m_data != NULL);
|
||||||
return m_data[index];
|
return m_data[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
|
||||||
typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
|
eigen_assert(m_data != NULL);
|
||||||
coeffRef(Index index) {
|
|
||||||
eigen_assert(m_data);
|
|
||||||
return m_data[index];
|
return m_data[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,7 +116,7 @@ struct TensorEvaluator
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
|
||||||
eigen_assert(m_data);
|
eigen_assert(m_data != NULL);
|
||||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
return m_data[m_dims.IndexOfColMajor(coords)];
|
return m_data[m_dims.IndexOfColMajor(coords)];
|
||||||
} else {
|
} else {
|
||||||
@ -122,10 +124,9 @@ struct TensorEvaluator
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
|
||||||
typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
|
|
||||||
coeffRef(const array<DenseIndex, NumCoords>& coords) {
|
coeffRef(const array<DenseIndex, NumCoords>& coords) {
|
||||||
eigen_assert(m_data);
|
eigen_assert(m_data != NULL);
|
||||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
return m_data[m_dims.IndexOfColMajor(coords)];
|
return m_data[m_dims.IndexOfColMajor(coords)];
|
||||||
} else {
|
} else {
|
||||||
@ -152,16 +153,18 @@ struct TensorEvaluator
|
|||||||
TensorBlockWriter::Run(block, m_data);
|
TensorBlockWriter::Run(block, m_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
|
||||||
|
|
||||||
/// required by sycl in order to construct sycl buffer from raw pointer
|
|
||||||
const Device& device() const{return m_device;}
|
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_data.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
protected:
|
protected:
|
||||||
typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data;
|
EvaluatorPointerType m_data;
|
||||||
Dimensions m_dims;
|
Dimensions m_dims;
|
||||||
const Device& m_device;
|
const Device m_device;
|
||||||
const Derived& m_impl;
|
|
||||||
};
|
};
|
||||||
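Note: evalSubExprsIfNeeded above now funnels both ends of the copy through m_device.get() before calling memcpy, so the same line works whether m_data is a host pointer or a SYCL virtual pointer, and the device itself is stored by value for the same reason as the other members. The equivalent generic helper, as a sketch:

#include <cstddef>

// Both pointers are translated by the device before the raw copy; on host
// devices get() is the identity, on SyclDevice it resolves virtual pointers.
template <typename Device, typename T>
void copy_block(const Device &dev, T *dst, const T *src, std::size_t n) {
  dev.memcpy(static_cast<void*>(dev.get(dst)),
             dev.get(const_cast<T*>(src)),
             n * sizeof(T));
}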
|
|
||||||
namespace {
|
namespace {
|
||||||
@ -184,6 +187,13 @@ Eigen::half loadConstant(const Eigen::half* address) {
|
|||||||
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
|
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
// Overload of loadConstant for SYCL, implemented in terms of the range accessor.
|
||||||
|
template <cl::sycl::access::mode AcMd, typename T>
|
||||||
|
T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
|
||||||
|
return *address;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -197,7 +207,9 @@ struct TensorEvaluator<const Derived, Device>
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
typedef typename Derived::Dimensions Dimensions;
|
typedef typename Derived::Dimensions Dimensions;
|
||||||
typedef const Derived XprType;
|
typedef const Derived XprType;
|
||||||
|
typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
|
||||||
|
typedef StorageMemory<const Scalar, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
// NumDimensions is -1 for variable dim tensors
|
// NumDimensions is -1 for variable dim tensors
|
||||||
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
|
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
|
||||||
@ -221,18 +233,15 @@ struct TensorEvaluator<const Derived, Device>
|
|||||||
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
|
||||||
TensorBlockReader;
|
TensorBlockReader;
|
||||||
|
|
||||||
// Used for accessor extraction in SYCL Managed TensorMap:
|
|
||||||
const Derived& derived() const { return m_impl; }
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
|
||||||
: m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m)
|
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
|
||||||
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
|
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
|
||||||
m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
|
m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -241,13 +250,8 @@ struct TensorEvaluator<const Derived, Device>
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
|
||||||
eigen_assert(m_data);
|
eigen_assert(m_data != NULL);
|
||||||
#ifndef __SYCL_DEVICE_ONLY__
|
|
||||||
return loadConstant(m_data+index);
|
return loadConstant(m_data+index);
|
||||||
#else
|
|
||||||
CoeffReturnType tmp = m_data[index];
|
|
||||||
return tmp;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
@ -269,7 +273,7 @@ struct TensorEvaluator<const Derived, Device>
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
|
||||||
eigen_assert(m_data);
|
eigen_assert(m_data != NULL);
|
||||||
const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
|
const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
|
||||||
: m_dims.IndexOfRowMajor(coords);
|
: m_dims.IndexOfRowMajor(coords);
|
||||||
return loadConstant(m_data+index);
|
return loadConstant(m_data+index);
|
||||||
@ -288,16 +292,17 @@ struct TensorEvaluator<const Derived, Device>
|
|||||||
TensorBlockReader::Run(block, m_data);
|
TensorBlockReader::Run(block, m_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
/// added for sycl in order to construct the buffer from the sycl device
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
const Device& device() const{return m_device;}
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_data.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
protected:
|
protected:
|
||||||
typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data;
|
EvaluatorPointerType m_data;
|
||||||
Dimensions m_dims;
|
Dimensions m_dims;
|
||||||
const Device& m_device;
|
const Device m_device;
|
||||||
const Derived& m_impl;
|
|
||||||
};
|
};
|
||||||
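Note: these two base evaluators (one for writable lvalues, one for const inputs) are the leaf nodes every tensor expression bottoms out in, so any device assignment ultimately reads and writes through the EvaluatorPointerType members introduced here. A small host-side usage example showing the public entry point that exercises them:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> a(8), b(8), out(8);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  Eigen::DefaultDevice dev;
  // The right-hand side is read through the const leaf evaluators and the
  // result is written through the writable one.
  out.device(dev) = a + b;
  return 0;
}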
|
|
||||||
|
|
||||||
@ -310,16 +315,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
|||||||
{
|
{
|
||||||
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
|
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
|
||||||
|
|
||||||
enum {
|
|
||||||
IsAligned = true,
|
|
||||||
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
|
|
||||||
BlockAccess = false,
|
|
||||||
PreferBlockAccess = false,
|
|
||||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
|
||||||
CoordAccess = false, // to be implemented
|
|
||||||
RawAccess = false
|
|
||||||
};
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
TensorEvaluator(const XprType& op, const Device& device)
|
TensorEvaluator(const XprType& op, const Device& device)
|
||||||
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
|
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
|
||||||
@ -331,10 +326,26 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
|
enum {
|
||||||
|
IsAligned = true,
|
||||||
|
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
&& (PacketType<CoeffReturnType, Device>::size >1)
|
||||||
|
#endif
|
||||||
|
,
|
||||||
|
BlockAccess = false,
|
||||||
|
PreferBlockAccess = false,
|
||||||
|
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||||
|
CoordAccess = false, // to be implemented
|
||||||
|
RawAccess = false
|
||||||
|
};
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
|
||||||
@ -354,13 +365,14 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
|
|||||||
PacketType<CoeffReturnType, Device>::size);
|
PacketType<CoeffReturnType, Device>::size);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||||
|
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
|
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
NullaryOp functor() const { return m_functor; }
|
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_argImpl.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const NullaryOp m_functor;
|
const NullaryOp m_functor;
|
||||||
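Note: the relocated enum adds a SYCL-only condition: packet access is only enabled when PacketType<CoeffReturnType, Device>::size is greater than one, because a "packet" of size one on the device would just duplicate the scalar path. A compile-time peek at the quantity being tested, as a host-side sketch:

#include <unsupported/Eigen/CXX11/Tensor>

// PacketType<T, Device>::size is the packet width the evaluator would use;
// when it degenerates to 1 the extra SYCL clause turns PacketAccess off.
constexpr int kFloatPacketSize =
    Eigen::PacketType<float, Eigen::DefaultDevice>::size;
static_assert(kFloatPacketSize >= 1, "a packet always holds at least one value");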
@ -401,14 +413,15 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
static const int NumDims = internal::array_size<Dimensions>::value;
|
static const int NumDims = internal::array_size<Dimensions>::value;
|
||||||
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
|
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
|
||||||
TensorBlock;
|
TensorBlock;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
|
||||||
m_argImpl.evalSubExprsIfNeeded(NULL);
|
m_argImpl.evalSubExprsIfNeeded(NULL);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -456,16 +469,18 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
|
|||||||
arg_block.data());
|
arg_block.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||||
|
|
||||||
/// required by sycl in order to extract the accessor
|
#ifdef EIGEN_USE_SYCL
|
||||||
const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
/// added for sycl in order to construct the buffer from sycl device
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
|
||||||
UnaryOp functor() const { return m_functor; }
|
m_argImpl.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const Device& m_device;
|
const Device m_device;
|
||||||
const UnaryOp m_functor;
|
const UnaryOp m_functor;
|
||||||
TensorEvaluator<ArgType, Device> m_argImpl;
|
TensorEvaluator<ArgType, Device> m_argImpl;
|
||||||
};
|
};
|
||||||
@ -509,6 +524,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
|
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
static const int NumDims = internal::array_size<
|
static const int NumDims = internal::array_size<
|
||||||
typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
|
typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
|
||||||
@ -524,7 +541,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
|||||||
return m_leftImpl.dimensions();
|
return m_leftImpl.dimensions();
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
|
||||||
m_leftImpl.evalSubExprsIfNeeded(NULL);
|
m_leftImpl.evalSubExprsIfNeeded(NULL);
|
||||||
m_rightImpl.evalSubExprsIfNeeded(NULL);
|
m_rightImpl.evalSubExprsIfNeeded(NULL);
|
||||||
return true;
|
return true;
|
||||||
@ -576,16 +593,17 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
|
|||||||
right_block.block_strides(), right_block.data());
|
right_block.block_strides(), right_block.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
|
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
|
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
BinaryOp functor() const { return m_functor; }
|
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_leftImpl.bind(cgh);
|
||||||
|
m_rightImpl.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
private:
|
private:
|
||||||
const Device& m_device;
|
const Device m_device;
|
||||||
const BinaryOp m_functor;
|
const BinaryOp m_functor;
|
||||||
TensorEvaluator<LeftArgType, Device> m_leftImpl;
|
TensorEvaluator<LeftArgType, Device> m_leftImpl;
|
||||||
TensorEvaluator<RightArgType, Device> m_rightImpl;
|
TensorEvaluator<RightArgType, Device> m_rightImpl;
|
||||||
@ -639,6 +657,8 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
|
typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||||
{
|
{
|
||||||
@ -646,7 +666,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
|||||||
return m_arg1Impl.dimensions();
|
return m_arg1Impl.dimensions();
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
|
||||||
m_arg1Impl.evalSubExprsIfNeeded(NULL);
|
m_arg1Impl.evalSubExprsIfNeeded(NULL);
|
||||||
m_arg2Impl.evalSubExprsIfNeeded(NULL);
|
m_arg2Impl.evalSubExprsIfNeeded(NULL);
|
||||||
m_arg3Impl.evalSubExprsIfNeeded(NULL);
|
m_arg3Impl.evalSubExprsIfNeeded(NULL);
|
||||||
@ -679,14 +699,16 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
|
|||||||
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
|
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||||
|
|
||||||
/// required by sycl in order to extract the accessor
|
#ifdef EIGEN_USE_SYCL
|
||||||
const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
/// required by sycl in order to extract the accessor
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; }
|
m_arg1Impl.bind(cgh);
|
||||||
/// required by sycl in order to extract the accessor
|
m_arg2Impl.bind(cgh);
|
||||||
const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; }
|
m_arg3Impl.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const TernaryOp m_functor;
|
const TernaryOp m_functor;
|
||||||
@ -731,6 +753,8 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
|
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
|
||||||
{
|
{
|
||||||
@ -738,7 +762,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
|||||||
return m_condImpl.dimensions();
|
return m_condImpl.dimensions();
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
|
||||||
m_condImpl.evalSubExprsIfNeeded(NULL);
|
m_condImpl.evalSubExprsIfNeeded(NULL);
|
||||||
m_thenImpl.evalSubExprsIfNeeded(NULL);
|
m_thenImpl.evalSubExprsIfNeeded(NULL);
|
||||||
m_elseImpl.evalSubExprsIfNeeded(NULL);
|
m_elseImpl.evalSubExprsIfNeeded(NULL);
|
||||||
@ -758,12 +782,14 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
|||||||
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
|
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
|
||||||
{
|
{
|
||||||
internal::Selector<PacketSize> select;
|
internal::Selector<PacketSize> select;
|
||||||
|
EIGEN_UNROLL_LOOP
|
||||||
for (Index i = 0; i < PacketSize; ++i) {
|
for (Index i = 0; i < PacketSize; ++i) {
|
||||||
select.select[i] = m_condImpl.coeff(index+i);
|
select.select[i] = m_condImpl.coeff(index+i);
|
||||||
}
|
}
|
||||||
return internal::pblend(select,
|
return internal::pblend(select,
|
||||||
m_thenImpl.template packet<LoadMode>(index),
|
m_thenImpl.template packet<LoadMode>(index),
|
||||||
m_elseImpl.template packet<LoadMode>(index));
|
m_elseImpl.template packet<LoadMode>(index));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||||
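Note: EIGEN_UNROLL_LOOP is the new macro, mentioned in the commit message, for controlling loop unrolling; it annotates small fixed-trip-count loops such as the per-lane selector fill above. A self-contained sketch with a hypothetical fallback definition (the real macro is compiler-dependent and may expand to an unroll pragma or to nothing):

#ifndef EIGEN_UNROLL_LOOP
#define EIGEN_UNROLL_LOOP  // fallback so this sketch stands alone
#endif

void fill_selector(bool (&select)[4], const bool *cond, int index) {
  EIGEN_UNROLL_LOOP
  for (int i = 0; i < 4; ++i) {
    select[i] = cond[index + i];  // mirrors the per-lane fill in packet() above
  }
}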
@ -773,14 +799,16 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
|||||||
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
|
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
|
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; }
|
|
||||||
/// required by sycl in order to extract the accessor
|
|
||||||
const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; }
|
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_condImpl.bind(cgh);
|
||||||
|
m_thenImpl.bind(cgh);
|
||||||
|
m_elseImpl.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
private:
|
private:
|
||||||
TensorEvaluator<IfArgType, Device> m_condImpl;
|
TensorEvaluator<IfArgType, Device> m_condImpl;
|
||||||
TensorEvaluator<ThenArgType, Device> m_thenImpl;
|
TensorEvaluator<ThenArgType, Device> m_thenImpl;
|
||||||
|
@ -442,12 +442,133 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
|
|||||||
// SYCL Executor policy
|
// SYCL Executor policy
|
||||||
#ifdef EIGEN_USE_SYCL
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
|
||||||
template <typename Expression, bool Vectorizable>
|
template <bool Vectorizable, typename Evaluator>
|
||||||
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
|
struct ExecExprFunctorKernel_impl {
|
||||||
|
typedef typename Evaluator::Index Index;
|
||||||
|
const Index range;
|
||||||
|
const Index vectorizable_threads;
|
||||||
|
Evaluator evaluator;
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
|
||||||
|
const Index range_, const Index vectorizable_threads_,
|
||||||
|
Evaluator evaluator_)
|
||||||
|
: range(range_), vectorizable_threads(vectorizable_threads_),
|
||||||
|
evaluator(evaluator_) {}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
|
||||||
|
operator()(cl::sycl::nd_item<1> itemID) {
|
||||||
|
Index gId = static_cast<Index>(itemID.get_global_linear_id());
|
||||||
|
Index total_threads = itemID.get_global_range(0);
|
||||||
|
EIGEN_UNROLL_LOOP
|
||||||
|
for (Index i = gId; i < range; i += total_threads) {
|
||||||
|
evaluator.evalScalar(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Evaluator>
|
||||||
|
struct ExecExprFunctorKernel_impl<true, Evaluator> {
|
||||||
|
typedef typename Evaluator::Index Index;
|
||||||
|
const Index range;
|
||||||
|
const Index vectorizable_threads;
|
||||||
|
Evaluator evaluator;
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel_impl(
|
||||||
|
const Index range_, const Index vectorizable_threads_,
|
||||||
|
Evaluator evaluator_)
|
||||||
|
: range(range_), vectorizable_threads(vectorizable_threads_),
|
||||||
|
evaluator(evaluator_) {}
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
|
||||||
|
operator()(cl::sycl::nd_item<1> itemID) {
|
||||||
|
Index gId = static_cast<Index>(itemID.get_global_linear_id());
|
||||||
|
if (gId < vectorizable_threads) {
|
||||||
|
const Index PacketSize = Eigen::internal::unpacket_traits<
|
||||||
|
typename Evaluator::PacketReturnType>::size;
|
||||||
|
evaluator.evalPacket(gId * PacketSize);
|
||||||
|
gId += (vectorizable_threads * PacketSize);
|
||||||
|
EIGEN_UNROLL_LOOP
|
||||||
|
for (Index i = gId; i < range; i += vectorizable_threads) {
|
||||||
|
evaluator.evalScalar(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
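Note: in the vectorizable specialization each work-item first evaluates one full packet at gId * PacketSize, then mops up the scalar tail that lies beyond vectorizable_threads * PacketSize with a stride of vectorizable_threads. The index arithmetic, isolated as a host-side sketch with evalPacket/evalScalar replaced by comments:

// Work-item g evaluates one packet at g * packet_size; the scalar tail is
// then shared out among the work-items with stride 'threads'.
void plan_work(long range, long threads, long packet_size, long g) {
  long id = g;
  if (id < threads) {
    long packet_start = id * packet_size;  // evaluator.evalPacket(packet_start)
    (void)packet_start;
    id += threads * packet_size;
    for (long i = id; i < range; i += threads) {
      // evaluator.evalScalar(i)
    }
  }
}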
|
|
||||||
|
template <typename Expr, bool NonZeroVectoriseSize, typename Evaluator>
|
||||||
|
struct ExecExprFunctorKernel
|
||||||
|
: ExecExprFunctorKernel_impl<
|
||||||
|
::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
|
||||||
|
Evaluator> {
|
||||||
|
ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
|
||||||
|
const Evaluator &evaluator)
|
||||||
|
: ExecExprFunctorKernel_impl<
|
||||||
|
::Eigen::internal::IsVectorizable<Eigen::SyclDevice, Expr>::value,
|
||||||
|
Evaluator>(range_, vectorizable_threads_, evaluator) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Expr, typename Evaluator>
|
||||||
|
struct ExecExprFunctorKernel<Expr, false, Evaluator>
|
||||||
|
: ExecExprFunctorKernel_impl<false, Evaluator> {
|
||||||
|
ExecExprFunctorKernel(const Index range_, const Index vectorizable_threads_,
|
||||||
|
const Evaluator &evaluator)
|
||||||
|
: ExecExprFunctorKernel_impl<false, Evaluator>(
|
||||||
|
range_, vectorizable_threads_, evaluator) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Expression, bool Vectorizable, bool Tileable>
|
||||||
|
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
|
||||||
public:
|
public:
|
||||||
static EIGEN_STRONG_INLINE void run(const Expression &expr, const SyclDevice &device) {
|
typedef typename Expression::Index Index;
|
||||||
// call TensorSYCL module
|
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {
|
||||||
TensorSycl::run(expr, device);
|
Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> evaluator(expr, dev);
|
||||||
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||||
|
if (needs_assign) {
|
||||||
|
Index range, GRange, tileSize;
|
||||||
|
Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
|
||||||
|
total_size = (total_size == 0) ? 1 : total_size;
|
||||||
|
const int PacketSize = Eigen::PacketType<
|
||||||
|
typename Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>::CoeffReturnType,
|
||||||
|
Eigen::SyclDevice>::size;
|
||||||
|
Index vectorizable_threads =
|
||||||
|
static_cast<Index>(total_size / PacketSize);
|
||||||
|
dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
|
||||||
|
range = total_size;
|
||||||
|
auto f = [&](cl::sycl::handler &cgh) {
|
||||||
|
evaluator.bind(cgh);
|
||||||
|
typedef ExecExprFunctorKernel<Expression, true,
|
||||||
|
Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
|
||||||
|
conditional_vectorized_kernel;
|
||||||
|
|
||||||
|
typedef ExecExprFunctorKernel<Expression, false,
|
||||||
|
Eigen::TensorEvaluator<Expression, Eigen::SyclDevice>>
|
||||||
|
non_vectorized_kernel;
|
||||||
|
// This is to make sure that an expression with a size less than vectorized size
|
||||||
|
// will not call the vectorized kernel.
|
||||||
|
// The reason for having this kernel is that the vectorisable parameter is a
|
||||||
|
// compile-time parameter,
|
||||||
|
// however, the size of a tensor is a run-time parameter
|
||||||
|
(vectorizable_threads)
|
||||||
|
? cgh.parallel_for(
|
||||||
|
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||||
|
dev.program().template get_kernel<conditional_vectorized_kernel>(),
|
||||||
|
#endif
|
||||||
|
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
|
||||||
|
cl::sycl::range<1>(tileSize)),
|
||||||
|
conditional_vectorized_kernel(range, vectorizable_threads,
|
||||||
|
evaluator))
|
||||||
|
: cgh.parallel_for(
|
||||||
|
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
|
||||||
|
dev.program().template get_kernel<non_vectorized_kernel>(),
|
||||||
|
#endif
|
||||||
|
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
|
||||||
|
cl::sycl::range<1>(tileSize)),
|
||||||
|
non_vectorized_kernel(range, vectorizable_threads,
|
||||||
|
evaluator));
|
||||||
|
};
|
||||||
|
cl::sycl::event e;
|
||||||
|
EIGEN_SYCL_TRY_CATCH(e = dev.sycl_queue().submit(f));
|
||||||
|
dev.async_synchronize(e);
|
||||||
|
}
|
||||||
|
evaluator.cleanup();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
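Note: the new executor builds the whole launch inside one command group: bind the evaluator's placeholder accessors, then parallel_for over an nd_range sized by parallel_for_setup, picking the vectorized or scalar kernel at run time. A minimal, self-contained SYCL 1.2.1 example of that submit/parallel_for shape (kernel name and sizes are illustrative, not Eigen's):

#include <CL/sycl.hpp>
#include <vector>

int main() {
  std::vector<float> host(64, 1.0f);
  {
    cl::sycl::buffer<float, 1> buf(host.data(), cl::sycl::range<1>(host.size()));
    cl::sycl::queue q;
    q.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class scale_kernel>(
          cl::sycl::nd_range<1>(cl::sycl::range<1>(64), cl::sycl::range<1>(16)),
          [=](cl::sycl::nd_item<1> item) {
            acc[item.get_global_linear_id()] *= 2.0f;
          });
    });
    q.wait_and_throw();
  }
  return 0;
}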
|
|
||||||
|
@ -89,7 +89,10 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
|
|||||||
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
|
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
|
||||||
static const int NumDimensions = XprTraits::NumDimensions;
|
static const int NumDimensions = XprTraits::NumDimensions;
|
||||||
static const int Layout = XprTraits::Layout;
|
static const int Layout = XprTraits::Layout;
|
||||||
typedef typename TypeConversion<Scalar, typename XprTraits::PointerType>::type PointerType;
|
typedef typename TypeConversion<Scalar,
|
||||||
|
typename XprTraits::PointerType
|
||||||
|
>::type
|
||||||
|
PointerType;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename UnaryOp, typename XprType>
|
template<typename UnaryOp, typename XprType>
|
||||||
@ -166,7 +169,8 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
|
|||||||
typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
|
typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
|
||||||
typename traits<LhsXprType>::PointerType,
|
typename traits<LhsXprType>::PointerType,
|
||||||
typename traits<RhsXprType>::PointerType>::type
|
typename traits<RhsXprType>::PointerType>::type
|
||||||
>::type PointerType;
|
>::type
|
||||||
|
PointerType;
|
||||||
enum {
|
enum {
|
||||||
Flags = 0
|
Flags = 0
|
||||||
};
|
};
|
||||||
@ -247,7 +251,8 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
|
|||||||
typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
|
typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
|
||||||
typename traits<Arg2XprType>::PointerType,
|
typename traits<Arg2XprType>::PointerType,
|
||||||
typename traits<Arg3XprType>::PointerType>::type
|
typename traits<Arg3XprType>::PointerType>::type
|
||||||
>::type PointerType;
|
>::type
|
||||||
|
PointerType;
|
||||||
enum {
|
enum {
|
||||||
Flags = 0
|
Flags = 0
|
||||||
};
|
};
|
||||||
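Note: these trait hunks only re-wrap the PointerType computation over more lines, but the computation itself is worth spelling out: the result pointer type is chosen at compile time from the operands via Pointer_type_promotion and a conditional selection. A reduced sketch of that selection using the standard trait:

#include <type_traits>

// Stand-in for the trait logic above: pick the left operand's pointer type
// when the promotion predicate says it wins, otherwise the right one.
template <bool LhsWins, typename LhsPtr, typename RhsPtr>
using promoted_pointer_t = typename std::conditional<LhsWins, LhsPtr, RhsPtr>::type;

static_assert(std::is_same<promoted_pointer_t<true,  float*, const float*>, float*>::value, "");
static_assert(std::is_same<promoted_pointer_t<false, float*, const float*>, const float*>::value, "");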
|
@ -131,6 +131,8 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
  typedef OutputScalar CoeffReturnType;
  typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = false,
@ -167,13 +169,13 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
    return m_dimensions;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    if (data) {
      evalToBuf(data);
      return false;
    } else {
      m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
      m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
      evalToBuf(m_data);
      return true;
    }
@ -202,11 +204,16 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_data.bind(cgh);
  }
#endif

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
    const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
    ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);

@ -576,12 +583,12 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D

 protected:
  Index m_size;
  const FFT& m_fft;
  const FFT EIGEN_DEVICE_REF m_fft;
  Dimensions m_dimensions;
  array<Index, NumDims> m_strides;
  TensorEvaluator<ArgType, Device> m_impl;
  CoeffReturnType* m_data;
  EvaluatorPointerType m_data;
  const Device& m_device;
  const Device EIGEN_DEVICE_REF m_device;

  // This will support a maximum FFT size of 2^32 for each dimension
  // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
@ -78,9 +78,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn
};


template<typename ArgType, typename Device>
template<typename ArgType_, typename Device>
struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
{
  typedef const typename internal::remove_all<ArgType_>::type ArgType;
  typedef TensorForcedEvalOp<ArgType> XprType;
  typedef typename ArgType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
@ -88,6 +89,9 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = true,
@ -106,8 +110,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
      TensorBlockReader;

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
  /// op_ is used for sycl
    : m_impl(op.expression(), device), m_op(op.expression()),
    : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
      m_device(device), m_buffer(NULL)
  { }

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
@ -115,17 +119,19 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
#if !defined(EIGEN_HIPCC)
  EIGEN_DEVICE_FUNC
#endif
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    const Index numValues = internal::array_prod(m_impl.dimensions());
    m_buffer = (CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType));
    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
#ifndef EIGEN_USE_SYCL
    // Should initialize the memory in case we're dealing with non POD types.
    if (NumTraits<CoeffReturnType>::RequireInitialization) {
      for (Index i = 0; i < numValues; ++i) {
        new(m_buffer+i) CoeffReturnType();
      }
    }
#endif
    typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
    EvalTo evalToTmp(m_buffer, m_op);
    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
    const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, Vectorize>::run(evalToTmp, m_device);
    return true;
@ -159,17 +165,20 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  typename Eigen::internal::traits<XprType>::PointerType data() const { return m_buffer; }
  EvaluatorPointerType data() const { return m_buffer; }

  /// required by sycl in order to extract the sycl accessor
#ifdef EIGEN_USE_SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
  // binding placeholder accessors to a command group handler for SYCL
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
    m_buffer.bind(cgh);
    m_impl.bind(cgh);
  }
#endif
 private:
  TensorEvaluator<ArgType, Device> m_impl;
  const ArgType m_op;
  const Device& m_device;
  const Device m_device;
  CoeffReturnType* m_buffer;
  EvaluatorPointerType m_buffer;
};


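The new bind(cgh) member that replaces the old impl()/device() accessors exists because SYCL placeholder accessors are created without a handler and must be registered with the command group before the kernel that captures them is submitted. The struct below is a reduced, illustrative model of that idea (it is not Eigen's RangeAccess implementation), assuming the SYCL 1.2.1 placeholder-accessor API.

#include <CL/sycl.hpp>

template <typename T>
struct PlaceholderStorage {
  cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write,
                     cl::sycl::access::target::global_buffer,
                     cl::sycl::access::placeholder::true_t>
      acc;

  explicit PlaceholderStorage(cl::sycl::buffer<T, 1> &buf) : acc(buf) {}

  // Counterpart of the evaluator bind(cgh) calls above: without this require()
  // the placeholder accessor may not be used inside the submitted kernel.
  void bind(cl::sycl::handler &cgh) const { cgh.require(acc); }
};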
@ -20,17 +20,20 @@ namespace Eigen {
// map_allocator.
template<typename T> struct MakePointer {
  typedef T* Type;
  typedef T& RefType;
  typedef T ScalarType;
};

// The PointerType class is a container of the device specefic pointer
template <typename T>
// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression
EIGEN_STRONG_INLINE T* constCast(const T* data) {
  return const_cast<T*>(data);
}

// The StorageMemory class is a container of the device specific pointer
// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression
// is a device-agnostic type and need MakePointer class for type conversion,
// the TensorEvaluator calls can be specialized for a device, hence it is possible
// the TensorEvaluator class can be specialized for a device, hence it is possible
// to construct different types of temproray storage memory in TensorEvaluator
// for different devices by specializing the following PointerType class.
// for different devices by specializing the following StorageMemory class.
template<typename T, typename Device> struct PointerType : MakePointer<T>{};
template<typename T, typename device> struct StorageMemory: MakePointer <T> {};

namespace internal{
template<typename A, typename B> struct Pointer_type_promotion {
@ -39,25 +42,11 @@ template<typename A, typename B> struct Pointer_type_promotion {
template<typename A> struct Pointer_type_promotion<A, A> {
  static const bool val = true;
};
template<typename A, typename B> struct TypeConversion;
#ifndef __SYCL_DEVICE_ONLY__
template<typename A, typename B> struct TypeConversion {
  typedef A* type;
};
#endif
}

#if defined(EIGEN_USE_SYCL)
namespace TensorSycl {
namespace internal{
template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
class FullReductionKernelFunctor;
}
}
#endif

template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
@ -113,6 +102,31 @@ struct ThreadPoolDevice;
struct GpuDevice;
struct SyclDevice;

#ifdef EIGEN_USE_SYCL

template <typename T> struct MakeSYCLPointer {
  typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
};

template <typename T>
EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
  return data;
}

template <typename T>
struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
template <typename T>
struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};

namespace TensorSycl {
namespace internal{
template <typename Evaluator, typename Op> class ReductionFunctor;
}
}
#endif

enum FFTResultType {
  RealPart = 0,
  ImagPart = 1,
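The net effect of the declarations above is that StorageMemory<T, Device>::Type resolves to a raw T* for ordinary devices (via MakePointer) and to a RangeAccess placeholder accessor for SyclDevice, while constCast gives a uniform way to strip constness from either form. A reduced model of that dispatch follows; every name in it (CpuDeviceTag, FakeRangeAccess, StorageMemoryModel) is illustrative, not Eigen's.

#include <type_traits>

struct CpuDeviceTag {};
struct SyclDeviceTag {};

template <typename T>
struct FakeRangeAccess {};  // stands in for TensorSycl::internal::RangeAccess

// Default: a plain pointer, exactly what MakePointer<T> provides.
template <typename T, typename Device>
struct StorageMemoryModel { typedef T* Type; };

// SYCL specialisation: a device-side accessor wrapper instead of a pointer.
template <typename T>
struct StorageMemoryModel<T, SyclDeviceTag> { typedef FakeRangeAccess<T> Type; };

static_assert(std::is_same<StorageMemoryModel<float, CpuDeviceTag>::Type, float*>::value, "");
static_assert(std::is_same<StorageMemoryModel<float, SyclDeviceTag>::Type,
                           FakeRangeAccess<float> >::value, "");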
|
@ -421,6 +421,7 @@ class GaussianGenerator {
|
|||||||
const array<T, NumDims>& std_devs)
|
const array<T, NumDims>& std_devs)
|
||||||
: m_means(means)
|
: m_means(means)
|
||||||
{
|
{
|
||||||
|
EIGEN_UNROLL_LOOP
|
||||||
for (size_t i = 0; i < NumDims; ++i) {
|
for (size_t i = 0; i < NumDims; ++i) {
|
||||||
m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
|
m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
|
||||||
}
|
}
|
||||||
@ -428,6 +429,7 @@ class GaussianGenerator {
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
|
EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
|
||||||
T tmp = T(0);
|
T tmp = T(0);
|
||||||
|
EIGEN_UNROLL_LOOP
|
||||||
for (size_t i = 0; i < NumDims; ++i) {
|
for (size_t i = 0; i < NumDims; ++i) {
|
||||||
T offset = coordinates[i] - m_means[i];
|
T offset = coordinates[i] - m_means[i];
|
||||||
tmp += offset * offset / m_two_sigmas[i];
|
tmp += offset * offset / m_two_sigmas[i];
|
||||||
|
@ -88,6 +88,8 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
|||||||
typedef typename XprType::Scalar Scalar;
|
typedef typename XprType::Scalar Scalar;
|
||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
enum {
|
enum {
|
||||||
IsAligned = false,
|
IsAligned = false,
|
||||||
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
|
||||||
@ -105,21 +107,20 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||||
: m_device(device), m_generator(op.generator())
|
: m_device(device), m_generator(op.generator())
|
||||||
#ifdef EIGEN_USE_SYCL
|
|
||||||
, m_argImpl(op.expression(), device)
|
|
||||||
#endif
|
|
||||||
{
|
{
|
||||||
TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
|
TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
|
||||||
m_dimensions = argImpl.dimensions();
|
m_dimensions = argImpl.dimensions();
|
||||||
|
|
||||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
m_strides[0] = 1;
|
m_strides[0] = 1;
|
||||||
|
EIGEN_UNROLL_LOOP
|
||||||
for (int i = 1; i < NumDims; ++i) {
|
for (int i = 1; i < NumDims; ++i) {
|
||||||
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
|
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
|
||||||
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
|
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
m_strides[NumDims - 1] = 1;
|
m_strides[NumDims - 1] = 1;
|
||||||
|
EIGEN_UNROLL_LOOP
|
||||||
for (int i = NumDims - 2; i >= 0; --i) {
|
for (int i = NumDims - 2; i >= 0; --i) {
|
||||||
m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
|
m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
|
||||||
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
|
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
|
||||||
@ -129,7 +130,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||||
@ -234,11 +235,11 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
|||||||
TensorOpCost::MulCost<Scalar>());
|
TensorOpCost::MulCost<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
|
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
|
||||||
|
|
||||||
#ifdef EIGEN_USE_SYCL
|
#ifdef EIGEN_USE_SYCL
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -261,14 +262,11 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const Device& m_device;
|
const Device EIGEN_DEVICE_REF m_device;
|
||||||
Dimensions m_dimensions;
|
Dimensions m_dimensions;
|
||||||
array<Index, NumDims> m_strides;
|
array<Index, NumDims> m_strides;
|
||||||
array<IndexDivisor, NumDims> m_fast_strides;
|
array<IndexDivisor, NumDims> m_fast_strides;
|
||||||
Generator m_generator;
|
Generator m_generator;
|
||||||
#ifdef EIGEN_USE_SYCL
|
|
||||||
TensorEvaluator<ArgType, Device> m_argImpl;
|
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -154,23 +154,6 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
        m_padding_left(padding_left), m_padding_right(padding_right),
        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}

#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
                                                             DenseIndex row_strides, DenseIndex col_strides,
                                                             DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                             DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                             bool padding_explicit, DenseIndex padding_top, DenseIndex padding_bottom,
                                                             DenseIndex padding_left, DenseIndex padding_right, PaddingType padding_type,
                                                             Scalar padding_value)
        : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
          m_row_strides(row_strides), m_col_strides(col_strides),
          m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
          m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
          m_padding_explicit(padding_explicit), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
          m_padding_left(padding_left), m_padding_right(padding_right),
          m_padding_type(padding_type), m_padding_value(padding_value) {}

#endif

    EIGEN_DEVICE_FUNC
    DenseIndex patch_rows() const { return m_patch_rows; }
@ -242,6 +225,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = false,
@ -256,15 +241,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  typedef internal::TensorBlock<Scalar, Index, NumDims, Layout>
      OutputTensorBlock;

#ifdef __SYCL_DEVICE_ONLY__
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
#else
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
#endif
      : m_device(device), m_impl(op.expression(), device)
#ifdef EIGEN_USE_SYCL
      , m_op(op)
#endif
  {
    EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);

@ -410,7 +388,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }
@ -516,13 +494,15 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
    return packetWithPossibleZero(index);
  }

  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

#ifdef EIGEN_USE_SYCL
  // Required by SYCL in order to construct the expression tree on the device
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif

  Index rowPaddingTop() const { return m_rowPaddingTop; }
@ -693,6 +673,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
  {
    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
@ -744,12 +725,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>

  Scalar m_paddingValue;

  const Device& m_device;
  const Device EIGEN_DEVICE_REF m_device;
  TensorEvaluator<ArgType, Device> m_impl;
#ifdef EIGEN_USE_SYCL
  // Required for SYCL in order to construct the expression tree on the device
  XprType m_op;
#endif
};


@ -353,6 +353,7 @@ namespace internal {
template<typename FirstType, typename... OtherTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
  Index result = 1;
  EIGEN_UNROLL_LOOP
  for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
    result *= sizes[i];
  }
@ -86,6 +86,8 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@ -131,7 +133,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }
@ -146,6 +148,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
    eigen_assert(index < dimensions().TotalSize());
    *inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      EIGEN_UNROLL_LOOP
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStrides[i];
        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@ -160,6 +163,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
      *inputIndex += index / m_strides[0];
      return true;
    } else {
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStrides[i];
        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@ -195,6 +199,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      values[i] = coeff(index+i);
    }
@ -215,11 +220,13 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
                          compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif

 protected:
@ -37,7 +37,7 @@ namespace {
  {
#ifdef EIGEN_GPU_COMPILE_PHASE
    return __clz(val);
#elif defined(__SYCL_DEVICE_ONLY__)
#elif defined(SYCL_DEVICE_ONLY)
    return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC
    unsigned long index;
@ -55,8 +55,8 @@ namespace {
  {
#ifdef EIGEN_GPU_COMPILE_PHASE
    return __clzll(val);
#elif defined(__SYCL_DEVICE_ONLY__)
#elif defined(SYCL_DEVICE_ONLY)
    return cl::sycl::clz(val);
    return static_cast<int>(cl::sycl::clz(val));
#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
    unsigned long index;
    _BitScanReverse64(&index, val);
@ -92,7 +92,7 @@ namespace {
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __umulhi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
#elif defined(SYCL_DEVICE_ONLY)
    return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
#else
    return (static_cast<uint64_t>(a) * b) >> 32;
@ -103,7 +103,7 @@ namespace {
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __umul64hi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
#elif defined(SYCL_DEVICE_ONLY)
    return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
#elif defined(__SIZEOF_INT128__)
    __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
@ -124,7 +124,7 @@ namespace {
  template <typename T>
  struct DividerHelper<64, T> {
    static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
      return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
      const uint64_t shift = 1ULL << log_div;
@ -205,8 +205,8 @@ class TensorIntDivisor<int32_t, true> {
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef EIGEN_GPU_COMPILE_PHASE
    return (__umulhi(magic, n) >> shift);
#elif defined(__SYCL_DEVICE_ONLY__)
#elif defined(SYCL_DEVICE_ONLY)
    return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
    return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
#else
    uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
    return (static_cast<uint32_t>(v >> 32) >> shift);
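The divide() path above replaces an integer division by a runtime-constant divisor with a multiply-high followed by a shift, using a precomputed "magic" multiplier. The snippet below is a self-contained check of that identity; it is a simplified variant of the technique for illustration only, not Eigen's exact magic-number derivation (Eigen folds the multiplier into a 32-bit constant).

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t d = 10;       // divisor fixed ahead of time
  const int log_div = 4;       // smallest p with 2^p >= d
  // magic = ceil(2^(32 + log_div) / d); kept in 64 bits here for simplicity
  const uint64_t magic = ((static_cast<uint64_t>(1) << (32 + log_div)) + d - 1) / d;
  for (uint32_t n = 0; n < 100000; ++n) {
    // multiply-high then right shift, the same shape as divide() above
    const uint32_t q = static_cast<uint32_t>((magic * n) >> 32) >> log_div;
    assert(q == n / d);
  }
  return 0;
}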
@ -134,13 +134,22 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
    }
  }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    return m_impl.evalSubExprsIfNeeded(data);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@ -162,7 +171,9 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
    return m_impl.costPerCoeff(vectorized);
  }

  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return m_impl.data(); }
  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
    return constCast(m_impl.data());
  }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

@ -59,4 +59,39 @@
#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
#endif

// Define a macro to use a reference on the host but a value on the device
#if defined(SYCL_DEVICE_ONLY)
  #define EIGEN_DEVICE_REF
#else
  #define EIGEN_DEVICE_REF &
#endif

// Define a macro for catching SYCL exceptions if exceptions are enabled
#if defined(EIGEN_EXCEPTIONS)
  #define EIGEN_SYCL_TRY_CATCH(X) \
    do { \
      try { X; } \
      catch(const cl::sycl::exception& e) { \
        std::cerr << "SYCL exception at " \
                  << __FILE__ << ":" << __LINE__ << std::endl \
                  << e.what() << std::endl; \
        std::rethrow_exception(std::current_exception()); \
      } \
    } while (false)
#else
  #define EIGEN_SYCL_TRY_CATCH(X) X
#endif

// Define a macro if local memory flags are unset or one of them is set
// Setting both flags is the same as unsetting them
#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
    (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM))
  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
#endif

#endif
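The two macros above are used together throughout the evaluators in this change: EIGEN_DEVICE_REF turns `const Device EIGEN_DEVICE_REF m_device;` into a reference on the host and a plain value during SYCL device compilation, and EIGEN_SYCL_TRY_CATCH wraps queue submissions. A small usage sketch of the latter (the function name submit_noop is hypothetical, and the snippet assumes the macro and <CL/sycl.hpp> are in scope):

#include <CL/sycl.hpp>

cl::sycl::event submit_noop(cl::sycl::queue &queue) {
  cl::sycl::event e;
  // Expands to a try/catch that logs and rethrows when EIGEN_EXCEPTIONS is
  // defined, and to the bare statement otherwise.
  EIGEN_SYCL_TRY_CATCH(e = queue.submit([](cl::sycl::handler &cgh) {
    cgh.single_task<class noop_kernel>([]() {});
  }));
  return e;
}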
@ -31,7 +31,11 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
  public:
    typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
    typedef typename PlainObjectType::Base Base;
#ifdef EIGEN_USE_SYCL
    typedef typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
#else
    typedef typename Eigen::internal::nested<Self>::type Nested;
#endif
    typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
    typedef typename internal::traits<PlainObjectType>::Index Index;
    typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
@ -85,9 +85,57 @@ struct PacketType<half, GpuDevice> {
#endif

#if defined(EIGEN_USE_SYCL)
template <typename T>
struct PacketType<T, SyclDevice> {
namespace TensorSycl {
  typedef T type;
namespace internal {

template <typename Index, Index A, Index B> struct PlusOp {
  static constexpr Index Value = A + B;
};

template <typename Index, Index A, Index B> struct DivOp {
  static constexpr Index Value = A / B;
};

template <typename Index, Index start, Index end, Index step,
          template <class Indx, Indx...> class StepOp>
struct static_for {
  template <typename UnaryOperator>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
    op(start);
    static_for<Index, StepOp<Index, start, step>::Value, end, step,
               StepOp>::loop(op);
  }
};
template <typename Index, Index end, Index step,
          template <class Indx, Indx...> class StepOp>
struct static_for<Index, end, end, step, StepOp> {
  template <typename UnaryOperator>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
};

template <typename OutScalar, typename Device, bool Vectorizable>
struct Vectorise {
  static const int PacketSize = 1;
  typedef OutScalar PacketReturnType;
};

template <typename OutScalar, typename Device>
struct Vectorise<OutScalar, Device, true> {
  static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
  typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
};

static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
  return ((((x) + (y)-1) / (y)) * (y));
}

} // namespace internal
} // namespace TensorSycl

template <>
struct PacketType<half, SyclDevice> {
  typedef half type;
  static const int size = 1;
  enum {
    HasAdd = 0,
@ -104,8 +152,58 @@ template <typename T>
    HasBlend = 0
  };
};
#endif
template <typename Scalar>
struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
  typedef Scalar type;
  typedef Scalar half;
  enum {
    Vectorizable = 0,
    size = 1,
    AlignedOnScalar = 0,
    HasHalfPacket = 0
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasConj = 0,
    HasSetLinear = 0
  };

};

template <typename Scalar>
struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};

#ifndef EIGEN_DONT_VECTORIZE_SYCL
#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
{\
  typedef typename internal::packet_traits<Type>::type type;\
  typedef typename internal::packet_traits<Type>::half half;\
};


PACKET_TYPE(const, float, 1, 4, SyclDevice)
PACKET_TYPE(, float, 1, 4, SyclDevice)
PACKET_TYPE(const, float, 1, 4, const SyclDevice)
PACKET_TYPE(, float, 1, 4, const SyclDevice)

PACKET_TYPE(const, double, 0, 2, SyclDevice)
PACKET_TYPE(, double, 0, 2, SyclDevice)
PACKET_TYPE(const, double, 0, 2, const SyclDevice)
PACKET_TYPE(, double, 0, 2, const SyclDevice)
#undef PACKET_TYPE

template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
#endif
#endif

// Tuple mimics std::pair but works on e.g. nvcc.
template <typename U, typename V> struct Tuple {
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
Tuple& operator= (const Tuple& rhs) {
|
Tuple& operator= (const Tuple& rhs) {
|
||||||
#ifndef __SYCL_DEVICE_ONLY__
|
#ifndef SYCL_DEVICE_ONLY
|
||||||
if (&rhs == this) return *this;
|
if (&rhs == this) return *this;
|
||||||
#endif
|
#endif
|
||||||
first = rhs.first;
|
first = rhs.first;
|
||||||
|
@ -37,7 +37,7 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp
|
|||||||
template<typename NewDimensions, typename XprType>
|
template<typename NewDimensions, typename XprType>
|
||||||
struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
|
struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
|
||||||
{
|
{
|
||||||
typedef const TensorReshapingOp<NewDimensions, XprType>& type;
|
typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename NewDimensions, typename XprType>
|
template<typename NewDimensions, typename XprType>
|
||||||
@ -106,6 +106,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
|||||||
typedef typename XprType::Scalar Scalar;
|
typedef typename XprType::Scalar Scalar;
|
||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
|
||||||
|
|
||||||
static const int NumOutputDims = internal::array_size<Dimensions>::value;
|
static const int NumOutputDims = internal::array_size<Dimensions>::value;
|
||||||
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
|
||||||
@ -168,7 +171,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
|
||||||
return m_impl.evalSubExprsIfNeeded(data);
|
return m_impl.evalSubExprsIfNeeded(data);
|
||||||
}
|
}
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||||
@ -326,10 +329,18 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); }
|
EIGEN_DEVICE_FUNC typename Storage::Type data() const {
|
||||||
|
return constCast(m_impl.data());
|
||||||
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
|
m_impl.bind(cgh);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
protected:
|
protected:
|
||||||
TensorEvaluator<ArgType, Device> m_impl;
|
TensorEvaluator<ArgType, Device> m_impl;
|
||||||
NewDimensions m_dimensions;
|
NewDimensions m_dimensions;
|
||||||
@ -404,7 +415,7 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp
|
|||||||
template<typename StartIndices, typename Sizes, typename XprType>
|
template<typename StartIndices, typename Sizes, typename XprType>
|
||||||
struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
|
struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
|
||||||
{
|
{
|
||||||
typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
|
typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename StartIndices, typename Sizes, typename XprType>
|
template<typename StartIndices, typename Sizes, typename XprType>
|
||||||
@ -488,7 +499,7 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
|
|||||||
// It is very expensive to start the memcpy kernel on GPU: we therefore only
|
// It is very expensive to start the memcpy kernel on GPU: we therefore only
|
||||||
// use it for large copies.
|
// use it for large copies.
|
||||||
#ifdef EIGEN_USE_SYCL
|
#ifdef EIGEN_USE_SYCL
|
||||||
template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> {
|
template <typename Index> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice> {
|
||||||
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
|
EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
|
||||||
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
|
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
|
||||||
};
|
};
|
||||||
@ -508,6 +519,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
|||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
typedef Sizes Dimensions;
|
typedef Sizes Dimensions;
|
||||||
|
typedef StorageMemory<CoeffReturnType, Device> Storage;
|
||||||
|
typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
|
||||||
|
typedef typename Storage::Type EvaluatorPointerType;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
// Alignment can't be guaranteed at compile time since it depends on the
|
// Alignment can't be guaranteed at compile time since it depends on the
|
||||||
@ -575,7 +589,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||||
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
|
||||||
m_impl.evalSubExprsIfNeeded(NULL);
|
m_impl.evalSubExprsIfNeeded(NULL);
|
||||||
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
|
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
|
||||||
&& data && m_impl.data()
|
&& data && m_impl.data()
|
||||||
@ -599,10 +613,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
|||||||
// Use memcpy if it's going to be faster than using the regular evaluation.
|
// Use memcpy if it's going to be faster than using the regular evaluation.
|
||||||
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
|
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
|
||||||
if (trigger(contiguous_values)) {
|
if (trigger(contiguous_values)) {
|
||||||
Scalar* src = (Scalar*)m_impl.data();
|
EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
|
||||||
for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
|
for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
|
||||||
Index offset = srcCoeff(i);
|
Index offset = srcCoeff(i);
|
||||||
m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
|
m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -637,6 +651,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -648,6 +663,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       inputIndices[0] += (indices[0] + m_offsets[0]);
       inputIndices[1] += (indices[1] + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -667,6 +683,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         values[i] = coeff(index+i);
       }
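
The hunks above wrap the fixed-trip-count index loops in `EIGEN_UNROLL_LOOP`; the macro itself is defined elsewhere in the patch, not in these hunks. As a rough sketch of the idea only (the macro name and conditions below are illustrative, not Eigen's actual definition), such a macro expands to an unroll pragma where the compiler understands one and to nothing otherwise, so the annotated loops stay portable:

```cpp
// Illustrative stand-in for an unroll-hint macro; not Eigen's EIGEN_UNROLL_LOOP.
#if defined(__clang__)
  #define MY_UNROLL_LOOP _Pragma("unroll")
#else
  #define MY_UNROLL_LOOP
#endif

template <int NumDims>
long flatten_index(const long (&idx)[NumDims], const long (&strides)[NumDims]) {
  long flat = 0;
  MY_UNROLL_LOOP               // trip count is a compile-time constant, so full unrolling is cheap
  for (int i = 0; i < NumDims; ++i) {
    flat += idx[i] * strides[i];
  }
  return flat;
}
```
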
@@ -698,8 +715,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       m_impl.block(&input_block);
   }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
-    Scalar* result = const_cast<Scalar*>(m_impl.data());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
     if (result) {
       Index offset = 0;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -733,19 +750,19 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
     return NULL;
   }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{
-    return m_impl;
-  }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{
-    return m_offsets;
-  }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif

  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
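
Each evaluator gains the same `bind(cl::sycl::handler&)` member: it forwards the command-group handler to its child evaluators (and, where the node owns device memory, to that memory) so placeholder accessors are registered before the kernel is enqueued. A schematic, non-Eigen sketch of that recursive shape, with illustrative names:

```cpp
// Schematic only: Handler and Buffer stand in for cl::sycl::handler and for
// Eigen's StorageMemory type; none of this is the real Eigen/SYCL code.
struct Handler {};

struct Buffer {
  // A real buffer would call require() on its placeholder accessor here.
  void bind(Handler&) const {}
};

struct LeafEvaluator {
  Buffer data;
  void bind(Handler& cgh) const { data.bind(cgh); }
};

template <typename Child>
struct UnaryEvaluator {          // e.g. a slicing, padding, or patch node
  Child m_impl;
  void bind(Handler& cgh) const { m_impl.bind(cgh); }    // forwards, like m_impl.bind(cgh) above
};

template <typename Child>
struct ReductionLikeEvaluator {  // owns a result buffer as well as a child
  Child m_impl;
  Buffer m_result;
  void bind(Handler& cgh) const { m_impl.bind(cgh); m_result.bind(cgh); }
};
```
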
@@ -753,6 +770,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       }
       inputIndex += (index + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -767,7 +785,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
   bool m_is_identity;
   const StartIndices m_offsets;
@@ -829,6 +847,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -840,6 +859,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       inputIndices[0] += (indices[0] + this->m_offsets[0]);
       inputIndices[1] += (indices[1] + this->m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -859,6 +879,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
@@ -892,7 +913,7 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
 struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
 };

 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -969,6 +990,8 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   typedef Strides Dimensions;

   enum {
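
The `Storage`/`EvaluatorPointerType` typedefs added here are how the patch abstracts the pointer type an evaluator exposes: on ordinary devices it can stay a plain `CoeffReturnType*`, while under SYCL it can be a device-memory wrapper instead. A hedged sketch of such a selection trait (the names below are illustrative, not Eigen's StorageMemory):

```cpp
// Illustrative trait only; Eigen's real StorageMemory and SYCL pointer types differ.
template <typename T>
struct DeviceMemoryHandle {      // stand-in for a SYCL device-memory wrapper
  T* raw;
  T& operator[](long i) const { return raw[i]; }
};

struct PlainDevice     { static const bool uses_device_memory = false; };
struct SyclStyleDevice { static const bool uses_device_memory = true;  };

template <typename T, typename Device, bool UseHandle = Device::uses_device_memory>
struct StorageFor {              // default: a raw pointer is enough
  typedef T* Type;
};

template <typename T, typename Device>
struct StorageFor<T, Device, true> {   // SYCL-style: carry device memory instead
  typedef DeviceMemoryHandle<T> Type;
};

// Inside an evaluator one would then write, analogously to the patch:
//   typedef StorageFor<CoeffReturnType, Device> Storage;
//   typedef typename Storage::Type EvaluatorPointerType;
```
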
@@ -985,8 +1008,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device),
       m_device(device),
-      m_strides(op.strides()), m_exprStartIndices(op.startIndices()),
-      m_exprStopIndices(op.stopIndices())
+      m_strides(op.strides())
   {
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
     DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
@@ -1069,7 +1091,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }


-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -1091,30 +1113,28 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
     return NULL;
   }
-  //use by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; }
-  //use by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; }
-  //use by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif

  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
         index -= idx * m_outputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
@@ -1125,7 +1145,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   }

   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
-#ifndef __SYCL_DEVICE_ONLY__
+#ifndef SYCL_DEVICE_ONLY
     return numext::maxi(min, numext::mini(max,value));
 #else
     return cl::sycl::clamp(value, min, max);
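
The `clamp` helper now keys the host/device split off the `SYCL_DEVICE_ONLY` macro and defers to `cl::sycl::clamp` when compiling for the device. The same split, shown standalone (the surrounding helper is illustrative; only the macro test and the SYCL call mirror the patch):

```cpp
#include <algorithm>

template <typename Index>
inline Index clamp_index(Index value, Index lo, Index hi) {
#ifndef SYCL_DEVICE_ONLY
  return std::max(lo, std::min(hi, value));   // host path
#else
  return cl::sycl::clamp(value, lo, hi);      // device path, seen only by the SYCL device compiler
#endif
}
```
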
@@ -1137,15 +1157,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
   array<Index, NumDims> m_inputStrides;
   bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
   DSizes<Index, NumDims> m_dimensions;
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
-  //use by sycl
-  const StartIndices m_exprStartIndices;
-  //use by sycl
-  const StopIndices m_exprStopIndices;
 };

 // Eval as lvalue
@@ -92,6 +92,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;

   enum {
     IsAligned = true,
@@ -138,7 +140,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -151,6 +153,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -164,6 +167,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
       inputIndex += (index - m_padding[0].first);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -192,23 +196,25 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     TensorOpCost cost = m_impl.costPerCoeff(vectorized);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i)
         updateCostPerDimension(cost, i, i == 0);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i)
         updateCostPerDimension(cost, i, i == NumDims - 1);
     }
     return cost;
   }

-  EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }

-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; }
-  /// used by sycl
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif

  private:
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
@@ -272,6 +278,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

     const Index initialIndex = index;
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index firstIdx = index;
       const Index lastIdx = index + PacketSize - 1;
@@ -329,7 +336,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device

     const Index initialIndex = index;
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index firstIdx = index;
       const Index lastIdx = index + PacketSize - 1;
@@ -383,6 +390,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -89,6 +89,8 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;


   enum {
@@ -103,9 +105,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
-#ifdef EIGEN_USE_SYCL
-      , m_patch_dims(op.patch_dims())
-#endif
   {
     Index num_patches = 1;
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -149,7 +148,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -167,6 +166,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
     Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -175,6 +175,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -202,6 +203,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>

     Index inputIndices[2] = {0, 0};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -217,6 +219,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -243,6 +246,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
       EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -259,11 +263,13 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }

-  EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

 #ifdef EIGEN_USE_SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; }
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
 #endif

  protected:
@@ -274,9 +280,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>

   TensorEvaluator<ArgType, Device> m_impl;

-#ifdef EIGEN_USE_SYCL
-  const PatchDim m_patch_dims;
-#endif
 };

 } // end namespace Eigen
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -44,6 +45,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
   uint64_t rnd = ::random() ^ mach_absolute_time();
   return rnd;

+
 #else
   // Augment the current time with pseudo random number generation
   // to ensure that we get different seeds if we try to generate seeds
@@ -147,14 +149,41 @@ template <typename T> class UniformRandomGenerator {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build the PCG_XSH_RS_state in one step, so
+    // m_state is initialized in two stages. The constructor of the functor is
+    // called on the CPU, which is where the clock seed is obtained; that seed
+    // is the same for every thread because, unlike CUDA, the thread id, block
+    // id, etc. are not global functions and are only available inside
+    // operator() (which runs on the device). For CUDA,
+    // (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL)
+    // is passed to each thread, whereas for SYCL
+    // ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to
+    // each thread, and each thread adds (global_thread_id * 6364136223846793005ULL)
+    // for itself exactly once to complete the construction, mirroring CUDA.
+    // The thread id injection therefore happens later: when operator() is
+    // called the thread id is available, so the thread id, block id, ... (the
+    // equivalent of i) are added to the seed there to build a unique m_state
+    // per thread, as in CUDA.
+    m_exec_once = false;
+#endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       const UniformRandomGenerator& other) {
     m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
   }

   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread id to the CPU clock seed
+      // to build a unique seed per thread. The (i * 6364136223846793005ULL)
+      // term is the remaining part of the PCG_XSH_RS_state on the GPU side.
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
     T result = RandomToTypeUniform<T>(&m_state, i);
     return result;
   }
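
The comment block above describes a two-stage construction of the PCG state: the host-side constructor can only fold in the clock, and the per-thread term `i * 6364136223846793005ULL` is added lazily, exactly once, on the device. The same logic in isolation, as a simplified sketch (this is not Eigen's generator; only the constants quoted in the comment are reused):

```cpp
#include <cstdint>

class TwoStageSeededState {
 public:
  // Stage 1 (host): only the clock seed is known and it is identical for
  // every thread, mirroring what the SYCL constructor can do.
  explicit TwoStageSeededState(std::uint64_t clock_seed)
      : state_(clock_seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL),
        thread_folded_(false) {}

  std::uint64_t next(std::uint64_t thread_id) {
    if (!thread_folded_) {
      // Stage 2 (device): fold the per-thread id in exactly once, mirroring
      // "m_state += i * 6364136223846793005ULL" guarded by m_exec_once.
      state_ += thread_id * 6364136223846793005ULL;
      thread_folded_ = true;
    }
    // Advance the state; a plain LCG step stands in for the real PCG step.
    state_ = state_ * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
    return state_ >> 22;
  }

 private:
  std::uint64_t state_;
  bool thread_folded_;
};
```
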
@@ -163,6 +192,14 @@ template <typename T> class UniformRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread id to the CPU clock seed
+      // to build a unique seed per thread.
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
       values[j] = RandomToTypeUniform<T>(&m_state, i);
     }
@@ -171,6 +208,9 @@ template <typename T> class UniformRandomGenerator {

  private:
   mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
 };

 template <typename Scalar>
@@ -222,14 +262,37 @@ template <typename T> class NormalRandomGenerator {
   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build the PCG_XSH_RS_state in one step, so
+    // m_state is initialized in two stages. The constructor of the functor is
+    // called on the CPU, which is where the clock seed is obtained; that seed
+    // is the same for every thread because, unlike CUDA, the thread id, block
+    // id, etc. are not global functions and are only available inside
+    // operator() (which runs on the device). The thread id injection therefore
+    // happens later: when operator() is called the thread id is available, so
+    // the thread id, block id, ... (the equivalent of i) are added to the seed
+    // there to build a unique m_state per thread, as in CUDA.
+    m_exec_once = false;
+#endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
       const NormalRandomGenerator& other) {
     m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
   }

   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread id to the CPU clock seed
+      // to build a unique seed per thread.
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
     T result = RandomToTypeNormal<T>(&m_state, i);
     return result;
   }
@@ -238,6 +301,14 @@ template <typename T> class NormalRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread id to the CPU clock seed
+      // to build a unique seed per thread.
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
       values[j] = RandomToTypeNormal<T>(&m_state, i);
     }
@@ -246,6 +317,9 @@ template <typename T> class NormalRandomGenerator {

  private:
   mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
 };


@@ -299,7 +299,7 @@ template <typename Self, typename Op, typename Device, bool Vectorizable = (Self
 struct FullReducer {
   static const bool HasOptimizedImplementation = false;

-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
     const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
     *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
   }
@@ -400,6 +400,18 @@ struct OuterReducer {
   }
 };

+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template <typename Self, typename Op, typename Device>
+struct GenericReducer {
+  static const bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+#endif

 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 template <int B, int N, typename S, typename R, typename I_>
@@ -423,6 +435,23 @@ template <int NPT, typename S, typename R, typename I_>
 __global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif

+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to have a different type for the accumulator than the input data type.
+ * If this is the case, the functor needs to have two reduce methods: one for reducing an element of
+ * the input with the accumulator, and the other for reducing two accumulators.
+ * Such a reducer can be useful, for instance, when the accumulator is a boolean or a bitset that
+ * checks for some properties of the input.
+ */
+template <typename Op, typename CoeffReturnType>
+struct ReductionReturnType {
+#if EIGEN_HAS_CXX11 && defined(EIGEN_USE_SYCL)
+  typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
+#else
+  typedef typename remove_const<CoeffReturnType>::type type;
+#endif
+};

 template <typename Self, typename Op,
           bool Vectorizable =
               (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess)>
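
`ReductionReturnType` lets the accumulator type differ from the input scalar by deducing it from the reducer's `initialize()`. A hedged example of the kind of functor this enables, written to the two-overload shape the comment describes (hypothetical, not taken from Eigen, and omitting the packet-related members a full Eigen reducer also provides):

```cpp
#include <cmath>

// Hypothetical reducer whose inputs are float but whose accumulator is bool:
// "are all coefficients finite?".
struct AllFiniteReducer {
  // Under SYCL the accumulator type is deduced from initialize().
  bool initialize() const { return true; }

  // Reduce one input element into the accumulator...
  void reduce(float x, bool* accum) const { *accum = *accum && std::isfinite(x); }
  // ...and reduce two accumulators together.
  void reduce(bool partial, bool* accum) const { *accum = *accum && partial; }

  bool finalize(bool accum) const { return accum; }
};
```
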
@@ -520,12 +549,15 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
     const Op m_reducer;
 };

+template<typename ArgType, typename Device>
+struct TensorReductionEvaluatorBase;
+
 // Eval as rvalue
 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
   typedef internal::reducer_traits<Op, Device> ReducerTraits;
+  typedef Dims ReducedDims;
   typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
   typedef typename XprType::Index Index;
   typedef ArgType ChildType;
@@ -535,12 +567,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   static const int NumOutputDims = NumInputDims - NumReducedDims;
   typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;

+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+
   enum {
     IsAligned = false,
     PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
@@ -562,11 +602,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool RunningFullReduction = (NumOutputDims==0);

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
     : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
-#if defined(EIGEN_USE_SYCL)
-    , m_xpr_dims(op.dims())
-#endif
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -653,7 +690,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   // of which will eventually result in an NVCC error
   EIGEN_DEVICE_FUNC
 #endif
-  bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);

     // Use the FullReducer if possible.
@@ -663,7 +700,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
                                             !RunningOnGPU))) {
       bool need_assign = false;
       if (!data) {
-        m_result = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType)));
+        m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
         data = m_result;
         need_assign = true;
       }
@@ -671,20 +708,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
       return need_assign;
     }
-    else if(RunningOnSycl){
-      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
-      }
-      Op reducer(m_reducer);
-      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-      return (m_result != NULL);
-    }

     // Attempt to use an optimized reduction.
-    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
+    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
       bool reducing_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -698,8 +724,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-            data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
             m_result = data;
           }
           else {
@@ -707,6 +733,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           }
         }
         Op reducer(m_reducer);
+        // For SYCL this if always returns false
         if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
           if (m_result) {
             m_device.deallocate_temp(m_result);
@@ -731,8 +758,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-            data = static_cast<CoeffReturnType*>(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
             m_result = data;
           }
           else {
@@ -740,6 +767,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           }
         }
         Op reducer(m_reducer);
+        // For SYCL this if always returns false
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
@@ -750,6 +778,21 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           return (m_result != NULL);
         }
       }
+#if defined(EIGEN_USE_SYCL)
+      // If there is no optimised SYCL version, the reduction expression must be
+      // broken into two subexpressions and the SYCL generic reducer is used on
+      // the device.
+      if(RunningOnSycl) {
+        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+          m_result = data;
+        }
+        Op reducer(m_reducer);
+        internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+        return (m_result != NULL);
+      }
+#endif
     }
     return true;
   }
@@ -764,7 +807,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
+    if ((RunningFullReduction || RunningOnGPU) && m_result) {
       return *(m_result + index);
     }
     Op reducer(m_reducer);
@@ -1097,12 +1140,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       m_device.deallocate(reducers);
   }

-  EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; }
-#if defined(EIGEN_USE_SYCL)
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-  const Device& device() const { return m_device; }
-  const Dims& xprDims() const { return m_xpr_dims; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_result.bind(cgh);
+  }
 #endif

  private:
@@ -1126,8 +1172,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
 #endif

 #if defined(EIGEN_USE_SYCL)
-  template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
-  template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor;
+  template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::ReductionFunctor;
+  // SYCL needs the generic reducer when the reduction algorithm is neither an inner, outer, nor full reducer
+  template <typename, typename, typename> friend struct internal::GenericReducer;
 #endif


@@ -1255,9 +1302,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   // Precomputed strides for the output tensor.
   array<Index, NumOutputDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
-  // Subset of strides of the input tensor for the non-reduced dimensions.
-  // Indexed by output dimensions.
-  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
   array<Index, NumPreservedStrides> m_preservedStrides;
   // Map from output to input dimension index.
   array<Index, NumOutputDims> m_output_to_input_dim_map;
@@ -1288,13 +1332,36 @@ static const bool RunningOnGPU = false;
 static const bool RunningOnGPU = false;
 static const bool RunningOnSycl = false;
 #endif
-typename MakePointer_<CoeffReturnType>::Type m_result;
+EvaluatorPointerType m_result;

-const Device& m_device;
+const Device EIGEN_DEVICE_REF m_device;
+};

-#if defined(EIGEN_USE_SYCL)
-const Dims m_xpr_dims;
-#endif
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
+};


+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {

+typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
+// The coeff function in the base class is recursive; it is not standard-layout and cannot be used inside a SYCL kernel,
+// so it is overridden here for the SYCL backend.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+return *(this->data() + index);
+}
+// The packet function in the base class is recursive; it is not standard-layout and cannot be used inside a SYCL kernel,
+// so it is overridden here for the SYCL backend.
+template<int LoadMode>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+}
 };

 } // end namespace Eigen
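The two evaluators added at the end of this hunk keep the public `TensorEvaluator` interface thin and push the shared logic into `TensorReductionEvaluatorBase`; the `Eigen::SyclDevice` one additionally overrides `coeff()` and `packet()` so that, inside a kernel, reading the reduction result is a flat load from the already-materialized output buffer instead of a recursive call. A toy analogue of this base-plus-thin-specialization layout (all names invented for illustration):

// Illustration only.
struct HostDeviceTag {};
struct SyclDeviceTag {};

template <typename Device>
struct ReductionEvaluatorBaseSketch {
  float* m_result = nullptr;            // filled once the reduction has run
  float coeff_recursive(long i) const;  // fine on the host, unusable in a SYCL kernel
};

// Generic devices simply reuse the base behaviour.
template <typename Device>
struct ReductionEvaluatorSketch : ReductionEvaluatorBaseSketch<Device> {};

// SYCL: override the accessor with a flat read from the result buffer.
template <>
struct ReductionEvaluatorSketch<SyclDeviceTag>
    : ReductionEvaluatorBaseSketch<SyclDeviceTag> {
  float coeff(long i) const { return *(this->m_result + i); }
};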
@@ -44,6 +44,9 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, t
 public:
 // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
 typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+typedef StorageMemory<Scalar, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;
+typedef TensorEvaluator<Expr, Device> EvalType;

 TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
 m_dims = m_impl.dimensions();
@@ -79,6 +82,8 @@ class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimension
 public:
 typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
 typedef typename Base::Scalar Scalar;
+typedef StorageMemory<Scalar, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
 }
@@ -362,6 +367,8 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
 typedef typename Derived::Scalar CoeffReturnType;
 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 typedef typename Derived::Dimensions Dimensions;
+typedef StorageMemory<CoeffReturnType, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 enum {
 IsAligned = false,
@@ -379,7 +386,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
 return true;
 }

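The `StorageMemory<T, Device>` / `EvaluatorPointerType` typedefs that keep appearing in these hunks are the "abstracting the pointer type" part of the change: evaluator code is written once against `EvaluatorPointerType`, which is a plain `T*` for ordinary devices but can resolve to a SYCL-aware wrapper that carries a placeholder accessor. A minimal sketch of the idea, with simplified names that are not the real Eigen definitions:

// Illustration only.
struct CpuDeviceSketch {};
struct SyclDeviceSketch {};

template <typename T>
struct SyclPointerWrapperSketch {  // hypothetical wrapper
  T* ptr;                          // a real one would also hold/bind a placeholder accessor
};

template <typename T, typename Device>
struct StorageMemorySketch {       // default: evaluators see a raw pointer
  typedef T* Type;
};

template <typename T>
struct StorageMemorySketch<T, SyclDeviceSketch> {  // SYCL: a bindable wrapper
  typedef SyclPointerWrapperSketch<T> Type;
};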
@@ -109,6 +109,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 typedef typename XprType::CoeffReturnType CoeffReturnType;
 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+typedef StorageMemory<CoeffReturnType, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 enum {
 IsAligned = false,
@@ -145,7 +147,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 const Dimensions& dimensions() const { return m_dimensions; }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
 m_impl.evalSubExprsIfNeeded(NULL);
 return true;
 }
@@ -158,6 +160,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 eigen_assert(index < dimensions().TotalSize());
 Index inputIndex = 0;
 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+EIGEN_UNROLL_LOOP
 for (int i = NumDims - 1; i > 0; --i) {
 Index idx = index / m_strides[i];
 index -= idx * m_strides[i];
@@ -172,6 +175,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 inputIndex += index;
 }
 } else {
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < NumDims - 1; ++i) {
 Index idx = index / m_strides[i];
 index -= idx * m_strides[i];
@@ -205,6 +209,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 // local structure.
 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
 values[PacketSize];
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < PacketSize; ++i) {
 values[i] = coeff(index+i);
 }
@@ -225,12 +230,14 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
 }

-EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }

-/// required by sycl in order to extract the accessor
-const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
-/// added for sycl in order to construct the buffer from sycl device
-ReverseDimensions functor() const { return m_reverse; }
+#ifdef EIGEN_USE_SYCL
+// binding placeholder accessors to a command group handler for SYCL
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+m_impl.bind(cgh);
+}
+#endif

 protected:
 Dimensions m_dimensions;
@@ -285,11 +292,11 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
 // This code is pilfered from TensorMorphing.h
 EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
 internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < PacketSize; ++i) {
 this->coeffRef(index+i) = values[i];
 }
 }

 };

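`EIGEN_UNROLL_LOOP`, sprinkled in front of the short fixed-trip-count loops above, is the "SYCL macro for controlling loop unrolling" from the commit message. Its exact definition lives elsewhere in Eigen; one plausible shape of such a macro, shown only to make the intent concrete, is:

// Illustrative definition only; the real macro may differ.
#if defined(__SYCL_DEVICE_ONLY__)
#define EIGEN_UNROLL_LOOP _Pragma("unroll")
#else
#define EIGEN_UNROLL_LOOP
#endif

// Used exactly as in the evaluators above:
//   EIGEN_UNROLL_LOOP
//   for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); }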
@@ -86,12 +86,15 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

 typedef TensorScanOp<Op, ArgType> XprType;
 typedef typename XprType::Index Index;
+typedef const ArgType ChildType;
 static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
 typedef DSizes<Index, NumDims> Dimensions;
 typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
 typedef typename XprType::CoeffReturnType CoeffReturnType;
 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+typedef StorageMemory<Scalar, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 enum {
 IsAligned = false,
@@ -110,7 +113,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 m_exclusive(op.exclusive()),
 m_accumulator(op.accumulator()),
 m_size(m_impl.dimensions()[op.axis()]),
-m_stride(1),
+m_stride(1), m_consume_dim(op.axis()),
 m_output(NULL) {

 // Accumulating a scalar isn't supported.
@@ -142,6 +145,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 return m_stride;
 }

+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
+return m_consume_dim;
+}
+
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
 return m_size;
 }
@@ -162,7 +169,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 return m_device;
 }

-EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
 m_impl.evalSubExprsIfNeeded(NULL);
 ScanLauncher<Self, Op, Device> launcher;
 if (data) {
@@ -171,7 +178,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 }

 const Index total_size = internal::array_prod(dimensions());
-m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
 launcher(*this, m_output);
 return true;
 }
@@ -181,7 +188,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
 }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
 {
 return m_output;
 }
@@ -196,21 +203,29 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-if (m_output != NULL) {
-m_device.deallocate(m_output);
+if (m_output) {
+m_device.deallocate_temp(m_output);
 m_output = NULL;
 }
 m_impl.cleanup();
 }

+#ifdef EIGEN_USE_SYCL
+// binding placeholder accessors to a command group handler for SYCL
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+m_impl.bind(cgh);
+m_output.bind(cgh);
+}
+#endif
 protected:
 TensorEvaluator<ArgType, Device> m_impl;
-const Device& m_device;
+const Device EIGEN_DEVICE_REF m_device;
 const bool m_exclusive;
 Op m_accumulator;
 const Index m_size;
 Index m_stride;
-CoeffReturnType* m_output;
+Index m_consume_dim;
+EvaluatorPointerType m_output;
 };

 // CPU implementation of scan
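The scan evaluator now draws its temporary output from `allocate_temp`, converts it through `m_device.get(...)` into an `EvaluatorPointerType`, and releases it with `deallocate_temp`, so the same code path serves host memory and SYCL device memory. On the default device this path is exercised by any cumulative expression, for example:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();

  // Cumulative sum along dimension 1. Evaluating this goes through the
  // TensorScanOp evaluator shown above, which allocates the temporary
  // output buffer, runs ScanLauncher, and frees the buffer in cleanup().
  Eigen::Tensor<float, 2> scanned = t.cumsum(1);
  return 0;
}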
@@ -109,6 +109,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 typedef typename XprType::CoeffReturnType CoeffReturnType;
 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+typedef StorageMemory<CoeffReturnType, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 enum {
 IsAligned = false,
@@ -130,8 +132,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
 const Device& device)
 : m_device(device),
-m_impl(op.expression(), device),
-m_shuffle(op.shufflePermutation())
+m_impl(op.expression(), device)
 {
 const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
 const Shuffle& shuffle = op.shufflePermutation();
@@ -172,7 +173,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
 m_impl.evalSubExprsIfNeeded(NULL);
 return true;
 }
@@ -194,6 +195,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 static PacketReturnType Run(const Self& self, Index index) {
 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < PacketSize; ++i) {
 values[i] = self.coeff(index + i);
 }
@@ -210,6 +212,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 return self.m_impl.template packet<LoadMode>(index);
 } else {
 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < PacketSize; ++i) {
 values[i] = self.coeff(index + i);
 }
@@ -330,13 +333,14 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
 }

-EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }

-// required by sycl
-EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;}
-// required by sycl
-EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}
+#ifdef EIGEN_USE_SYCL
+// binding placeholder accessors to a command group handler for SYCL
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+m_impl.bind(cgh);
+}
+#endif
 protected:
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
 Index input_index,
@@ -389,10 +393,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 array<Index, NumDims> m_inputStrides;
 array<Index, NumDims> m_unshuffledInputStrides;

-const Device& m_device;
+const Device EIGEN_DEVICE_REF m_device;
 TensorEvaluator<ArgType, Device> m_impl;
-/// required by sycl
-Shuffle m_shuffle;
 };

@@ -444,6 +446,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>

 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
 internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < PacketSize; ++i) {
 this->coeffRef(index+i) = values[i];
 }
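With the SYCL-specific `impl()`/`shufflePermutation()` accessors and the stored `m_shuffle` member gone, the shuffling evaluator is reached only through the normal expression API plus `bind()`. For reference, the expression these hunks evaluate is the ordinary `shuffle()`:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> input(2, 3, 5);
  input.setRandom();

  // Reverse the dimension order; evaluation goes through
  // TensorEvaluator<const TensorShufflingOp<...>, DefaultDevice>.
  Eigen::array<int, 3> order{{2, 1, 0}};
  Eigen::Tensor<float, 3> output = input.shuffle(order);
  // output has dimensions (5, 3, 2)
  return 0;
}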
@@ -37,7 +37,7 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
 template<typename Strides, typename XprType>
 struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
 {
-typedef const TensorStridingOp<Strides, XprType>& type;
+typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
 };

 template<typename Strides, typename XprType>
@@ -108,6 +108,8 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 typedef typename XprType::CoeffReturnType CoeffReturnType;
 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+typedef StorageMemory<CoeffReturnType, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 enum {
 IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
@@ -120,7 +122,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 };

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-: m_impl(op.expression(), device), m_strides(op.strides())
+: m_impl(op.expression(), device)
 {
 m_dimensions = m_impl.dimensions();
 for (int i = 0; i < NumDims; ++i) {
@@ -149,9 +151,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 }
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
 m_impl.evalSubExprsIfNeeded(NULL);
 return true;
 }
@@ -173,6 +176,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 Index inputIndices[] = {0, 0};
 Index indices[] = {index, index + PacketSize - 1};
 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+EIGEN_UNROLL_LOOP
 for (int i = NumDims - 1; i > 0; --i) {
 const Index idx0 = indices[0] / m_outputStrides[i];
 const Index idx1 = indices[1] / m_outputStrides[i];
@@ -184,6 +188,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 inputIndices[0] += indices[0] * m_inputStrides[0];
 inputIndices[1] += indices[1] * m_inputStrides[0];
 } else { // RowMajor
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < NumDims - 1; ++i) {
 const Index idx0 = indices[0] / m_outputStrides[i];
 const Index idx1 = indices[1] / m_outputStrides[i];
@@ -203,6 +208,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
 values[0] = m_impl.coeff(inputIndices[0]);
 values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+EIGEN_UNROLL_LOOP
 for (int i = 1; i < PacketSize-1; ++i) {
 values[i] = coeff(index+i);
 }
@@ -225,18 +231,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
 }

-EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }

-/// required by sycl in order to extract the accessor
-const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-/// required by sycl in order to extract the accessor
-Strides functor() const { return m_strides; }
+#ifdef EIGEN_USE_SYCL
+// binding placeholder accessors to a command group handler for SYCL
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+m_impl.bind(cgh);
+}
+#endif
 protected:
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
 {
 Index inputIndex = 0;
 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+EIGEN_UNROLL_LOOP
 for (int i = NumDims - 1; i > 0; --i) {
 const Index idx = index / m_outputStrides[i];
 inputIndex += idx * m_inputStrides[i];
@@ -244,6 +252,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 }
 inputIndex += index * m_inputStrides[0];
 } else { // RowMajor
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < NumDims - 1; ++i) {
 const Index idx = index / m_outputStrides[i];
 inputIndex += idx * m_inputStrides[i];
@@ -258,7 +267,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 array<Index, NumDims> m_outputStrides;
 array<Index, NumDims> m_inputStrides;
 TensorEvaluator<ArgType, Device> m_impl;
-const Strides m_strides;
 };

 // Eval as lvalue
@@ -296,11 +304,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
 return this->m_impl.coeffRef(this->srcCoeff(index));
 }

-/// required by sycl in order to extract the accessor
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
-/// required by sycl in order to extract the accessor
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }

 template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void writePacket(Index index, const PacketReturnType& x)
 {
@@ -310,6 +313,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
 Index inputIndices[] = {0, 0};
 Index indices[] = {index, index + PacketSize - 1};
 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+EIGEN_UNROLL_LOOP
 for (int i = NumDims - 1; i > 0; --i) {
 const Index idx0 = indices[0] / this->m_outputStrides[i];
 const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -321,6 +325,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
 inputIndices[0] += indices[0] * this->m_inputStrides[0];
 inputIndices[1] += indices[1] * this->m_inputStrides[0];
 } else { // RowMajor
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < NumDims - 1; ++i) {
 const Index idx0 = indices[0] / this->m_outputStrides[i];
 const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -340,6 +345,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
 internal::pstore<Scalar, PacketReturnType>(values, x);
 this->m_impl.coeffRef(inputIndices[0]) = values[0];
 this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+EIGEN_UNROLL_LOOP
 for (int i = 1; i < PacketSize-1; ++i) {
 this->coeffRef(index+i) = values[i];
 }
@@ -91,6 +91,8 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
 typedef typename XprType::CoeffReturnType CoeffReturnType;
 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+typedef StorageMemory<CoeffReturnType, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 enum {
 IsAligned = false,
@@ -205,7 +207,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
 return m_dimensions;
 }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
 m_impl.evalSubExprsIfNeeded(NULL);
 return true;
 }
@@ -249,6 +251,13 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
 return result;
 }

+#ifdef EIGEN_USE_SYCL
+// binding placeholder accessors to a command group handler for SYCL
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+m_impl.bind(cgh);
+}
+#endif

 protected:
 // Given the output index, finds the first index in the input tensor used to compute the trace
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -276,7 +285,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
 TensorEvaluator<ArgType, Device> m_impl;
 // Initialize the size of the trace dimension
 Index m_traceDim;
-const Device& m_device;
+const Device EIGEN_DEVICE_REF m_device;
 array<bool, NumInputDims> m_reduced;
 array<Index, NumReducedDims> m_reducedDims;
 array<Index, NumOutputDims> m_outputStrides;
@@ -58,9 +58,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 };
 template <typename T> struct MakePointer {
 typedef T* Type;
-typedef T& RefType;
-typedef T ScalarType;
-
 };
 typedef typename MakePointer<Scalar>::Type PointerType;
 };
@@ -80,9 +77,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
 };
 template <typename T> struct MakePointer {
 typedef T* Type;
-typedef T& RefType;
-typedef T ScalarType;
-
 };
 typedef typename MakePointer<Scalar>::Type PointerType;
 };
@@ -106,10 +100,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
 // Intermediate typedef to workaround MSVC issue.
 typedef MakePointer_<T> MakePointerT;
 typedef typename MakePointerT::Type Type;
-typedef typename MakePointerT::RefType RefType;
-typedef typename MakePointerT::ScalarType ScalarType;
-
-
 };
 typedef typename MakePointer<Scalar>::Type PointerType;
 };
@@ -135,49 +125,49 @@ struct traits<TensorRef<PlainObjectType> >
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };

 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };

 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };

 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };

 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };

 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };

 template<typename PlainObjectType>
 struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
 {
-typedef const TensorRef<PlainObjectType>& type;
+typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };

 template<typename PlainObjectType>
 struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
 {
-typedef const TensorRef<PlainObjectType>& type;
+typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };

 // TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
@@ -189,50 +179,50 @@ template<typename T, int n=1, typename PlainObject = void> struct nested
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };

 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };

 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };

 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };


 template <typename PlainObjectType, int Options, template <class> class MakePointer>
 struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
 {
-typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };

 template <typename PlainObjectType, int Options, template <class> class MakePointer>
 struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
 {
-typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };

 template <typename PlainObjectType>
 struct nested<TensorRef<PlainObjectType> >
 {
-typedef const TensorRef<PlainObjectType>& type;
+typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };

 template <typename PlainObjectType>
 struct nested<const TensorRef<PlainObjectType> >
 {
-typedef const TensorRef<PlainObjectType>& type;
+typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };

 } // end namespace internal
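Every `eval<>` and `nested<>` specialization above swaps a hard-coded `&` for `EIGEN_DEVICE_REF`. The motivation is that a SYCL kernel cannot capture host references, so on the device the nested expression must be held by value while the host keeps the usual by-reference nesting. A definition of roughly this shape would give that behaviour (illustrative only, not the actual macro from the Tensor headers):

// Illustrative definition only.
#if defined(SYCL_DEVICE_ONLY)
#define EIGEN_DEVICE_REF        // nested expressions held by value on the device
#else
#define EIGEN_DEVICE_REF &      // usual by-reference nesting on the host
#endif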
@@ -91,24 +91,6 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
 m_padding_left(padding_left), m_padding_right(padding_right),
 m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}

-#ifdef EIGEN_USE_SYCL // this is work around for sycl as Eigen could not use c++11 deligate constructor feature
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
-DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
-DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
-DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
-bool padding_explicit, DenseIndex padding_top_z, DenseIndex padding_bottom_z,
-DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left,
-DenseIndex padding_right, PaddingType padding_type, Scalar padding_value)
-: m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides),
-m_col_inflate_strides(col_inflate_strides), m_padding_explicit(padding_explicit), m_padding_top_z(padding_top_z),
-m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), m_padding_left(padding_left),
-m_padding_right(padding_right), m_padding_type(padding_type), m_padding_value(padding_value) {}
-
-#endif

 EIGEN_DEVICE_FUNC
 DenseIndex patch_planes() const { return m_patch_planes; }
 EIGEN_DEVICE_FUNC
@@ -195,6 +177,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 typedef typename XprType::CoeffReturnType CoeffReturnType;
 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+typedef StorageMemory<CoeffReturnType, Device> Storage;
+typedef typename Storage::Type EvaluatorPointerType;

 enum {
 IsAligned = false,
@@ -205,16 +189,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 CoordAccess = false,
 RawAccess = false
 };
-#ifdef __SYCL_DEVICE_ONLY__
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device)
-#else
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
-#endif

-: m_impl(op.expression(), device)
-#ifdef EIGEN_USE_SYCL
-, m_op(op)
-#endif
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
+m_impl(op.expression(), device)
 {
 EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);

@@ -368,9 +345,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
 }
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
 m_impl.evalSubExprsIfNeeded(NULL);
 return true;
 }
@@ -531,14 +509,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
 }

-EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
+EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

 const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

-#ifdef EIGEN_USE_SYCL
-// Required by SYCL in order to construct the expression on the device
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; }
-#endif

 Index planePaddingTop() const { return m_planePaddingTop; }
 Index rowPaddingTop() const { return m_rowPaddingTop; }
@@ -556,10 +530,17 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 Index rowInflateStride() const { return m_row_inflate_strides; }
 Index colInflateStride() const { return m_col_inflate_strides; }

+#ifdef EIGEN_USE_SYCL
+// binding placeholder accessors to a command group handler for SYCL
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+m_impl.bind(cgh);
+}
+#endif
 protected:
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
 {
 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+EIGEN_UNROLL_LOOP
 for (int i = 0; i < PacketSize; ++i) {
 values[i] = coeff(index+i);
 }
@@ -635,10 +616,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D

 TensorEvaluator<ArgType, Device> m_impl;

-#ifdef EIGEN_USE_SYCL
-// Required by SYCL in order to construct the expression on the device
-XprType m_op;
-#endif

 };

@@ -527,26 +527,32 @@ static void test_execute_generator_op(Device d)
 }
 }

+#ifdef EIGEN_DONT_VECTORIZE
+#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
+#else
+#define VECTORIZABLE(VAL) VAL
+#endif
+
 #define CALL_SUBTEST_PART(PART) \
 CALL_SUBTEST_##PART

 #define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, ColMajor>(default_device))); \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, ColMajor>(default_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, false, ColMajor>(default_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, true, ColMajor>(default_device))); \
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, ColMajor>(default_device))); \
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, ColMajor>(default_device))); \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, false, RowMajor>(default_device))); \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, true, RowMajor>(default_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, false, RowMajor>(default_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, true, true, RowMajor>(default_device))); \
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, RowMajor>(default_device))); \
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), true, RowMajor>(default_device))); \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, ColMajor>(tp_device))); \
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, ColMajor>(tp_device))); \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
 CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
-CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, true, true, RowMajor>(tp_device)))
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), false, RowMajor>(tp_device))); \
+CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), true, RowMajor>(tp_device)))

 EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
 Eigen::DefaultDevice default_device;
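The `VECTORIZABLE` helper makes the executor test respect `EIGEN_DONT_VECTORIZE`: the subtests that used to hard-code `Vectorizable = true` now collapse to the scalar path when vectorization is disabled. Expanded by hand (assuming `EIGEN_DONT_VECTORIZE` is defined to a non-zero value such as 1 when it is set):

// With EIGEN_DONT_VECTORIZE defined:
//   VECTORIZABLE(true)  ->  !EIGEN_DONT_VECTORIZE && true  ->  false
// Without it:
//   VECTORIZABLE(true)  ->  true
//
// So, for example,
//   CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), false, ColMajor>(default_device)));
// instantiates the non-vectorized variant in builds where vectorization is off.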
@@ -51,7 +51,8 @@ static void test_static_reshape() {

 // New dimensions: [2, 3, 7]
 Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim;
-Tensor<float, 3> reshaped = tensor.reshape(dim);
+Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<long,3>>(dim));
+

 for (int i = 0; i < 2; ++i) {
 for (int j = 0; j < 3; ++j) {